Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  171
1 file changed, 113 insertions(+), 58 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 06058eaa173b..b9aa1b0b38b0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages);
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
+static bool __initdata parsed_valid_hugepagesz = true;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -144,7 +145,8 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
}
}
- if (spool->min_hpages != -1) { /* minimum size accounting */
+ /* minimum size accounting */
+ if (spool->min_hpages != -1 && spool->rsv_hpages) {
if (delta > spool->rsv_hpages) {
/*
* Asking for more reserves than those already taken on
@@ -182,7 +184,8 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
if (spool->max_hpages != -1) /* maximum size accounting */
spool->used_hpages -= delta;
- if (spool->min_hpages != -1) { /* minimum size accounting */
+ /* minimum size accounting */
+ if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
if (spool->rsv_hpages + delta <= spool->min_hpages)
ret = 0;
else
@@ -624,6 +627,7 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
{
return vma_hugecache_offset(hstate_vma(vma), vma, address);
}
+EXPORT_SYMBOL_GPL(linear_hugepage_index);
/*
* Return the size of the pages allocated when backing a VMA. In the majority
@@ -828,8 +832,27 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
* Only the process that called mmap() has reserves for
* private mappings.
*/
- if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
- return true;
+ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ /*
+ * Like the shared case above, a hole punch or truncate
+ * could have been performed on the private mapping.
+ * Examine the value of chg to determine if reserves
+ * actually exist or were previously consumed.
+ * Very Subtle - The value of chg comes from a previous
+ * call to vma_needs_reserves(). The reserve map for
+ * private mappings has different (opposite) semantics
+ * than that of shared mappings. vma_needs_reserves()
+ * has already taken this difference in semantics into
+ * account. Therefore, the meaning of chg is the same
+ * as in the shared case above. Code could easily be
+ * combined, but keeping it separate draws attention to
+ * subtle differences.
+ */
+ if (chg)
+ return false;
+ else
+ return true;
+ }
return false;
}
@@ -937,9 +960,7 @@ err:
*/
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
- nid = next_node(nid, *nodes_allowed);
- if (nid == MAX_NUMNODES)
- nid = first_node(*nodes_allowed);
+ nid = next_node_in(nid, *nodes_allowed);
VM_BUG_ON(nid >= MAX_NUMNODES);
return nid;
@@ -1001,7 +1022,9 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
+#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \
+ ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
+ defined(CONFIG_CMA))
static void destroy_compound_gigantic_page(struct page *page,
unsigned int order)
{
@@ -1009,6 +1032,7 @@ static void destroy_compound_gigantic_page(struct page *page,
int nr_pages = 1 << order;
struct page *p = page + 1;
+ atomic_set(compound_mapcount_ptr(page), 0);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
clear_compound_head(p);
set_page_refcounted(p);
@@ -1030,8 +1054,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn,
return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
}
-static bool pfn_range_valid_gigantic(unsigned long start_pfn,
- unsigned long nr_pages)
+static bool pfn_range_valid_gigantic(struct zone *z,
+ unsigned long start_pfn, unsigned long nr_pages)
{
unsigned long i, end_pfn = start_pfn + nr_pages;
struct page *page;
@@ -1042,6 +1066,9 @@ static bool pfn_range_valid_gigantic(unsigned long start_pfn,
page = pfn_to_page(i);
+ if (page_zone(page) != z)
+ return false;
+
if (PageReserved(page))
return false;
@@ -1074,7 +1101,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
pfn = ALIGN(z->zone_start_pfn, nr_pages);
while (zone_spans_last_pfn(z, pfn, nr_pages)) {
- if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+ if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
/*
* We release the zone lock here because
* alloc_contig_range() will also lock the zone
@@ -1811,6 +1838,25 @@ static long __vma_reservation_common(struct hstate *h,
if (vma->vm_flags & VM_MAYSHARE)
return ret;
+ else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
+ /*
+ * In most cases, reserves always exist for private mappings.
+ * However, a file associated with mapping could have been
+ * hole punched or truncated after reserves were consumed.
+ * As subsequent fault on such a range will not use reserves.
+ * Subtle - The reserve map for private mappings has the
+ * opposite meaning than that of shared mappings. If NO
+ * entry is in the reserve map, it means a reservation exists.
+ * If an entry exists in the reserve map, it means the
+ * reservation has already been consumed. As a result, the
+ * return value of this routine is the opposite of the
+ * value returned from reserve map manipulation routines above.
+ */
+ if (ret)
+ return 0;
+ else
+ return 1;
+ }
else
return ret < 0 ? ret : 0;
}
@@ -2170,6 +2216,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* and reducing the surplus.
*/
spin_unlock(&hugetlb_lock);
+
+ /* yield cpu to avoid soft lockup */
+ cond_resched();
+
if (hstate_is_gigantic(h))
ret = alloc_fresh_gigantic_page(h, nodes_allowed);
else
@@ -2659,6 +2709,11 @@ static int __init hugetlb_init(void)
subsys_initcall(hugetlb_init);
/* Should be called on processing a hugepagesz=... option */
+void __init hugetlb_bad_size(void)
+{
+ parsed_valid_hugepagesz = false;
+}
+
void __init hugetlb_add_hstate(unsigned int order)
{
struct hstate *h;
@@ -2678,8 +2733,8 @@ void __init hugetlb_add_hstate(unsigned int order)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
- h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
- h->next_nid_to_free = first_node(node_states[N_MEMORY]);
+ h->next_nid_to_alloc = first_memory_node;
+ h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
@@ -2691,11 +2746,17 @@ static int __init hugetlb_nrpages_setup(char *s)
unsigned long *mhp;
static unsigned long *last_mhp;
+ if (!parsed_valid_hugepagesz) {
+ pr_warn("hugepages = %s preceded by "
+ "an unsupported hugepagesz, ignoring\n", s);
+ parsed_valid_hugepagesz = true;
+ return 1;
+ }
/*
* !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
* so this hugepages= parameter goes to the "default hstate".
*/
- if (!hugetlb_max_hstate)
+ else if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
else
mhp = &parsed_hstate->max_huge_pages;
@@ -3122,7 +3183,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct page *ref_page)
{
- int force_flush = 0;
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
@@ -3141,19 +3201,22 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
-again:
for (; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep))
- goto unlock;
+ if (huge_pmd_unshare(mm, &address, ptep)) {
+ spin_unlock(ptl);
+ continue;
+ }
pte = huge_ptep_get(ptep);
- if (huge_pte_none(pte))
- goto unlock;
+ if (huge_pte_none(pte)) {
+ spin_unlock(ptl);
+ continue;
+ }
/*
* Migrating hugepage or HWPoisoned hugepage is already
@@ -3161,7 +3224,8 @@ again:
*/
if (unlikely(!pte_present(pte))) {
huge_pte_clear(mm, address, ptep);
- goto unlock;
+ spin_unlock(ptl);
+ continue;
}
page = pte_page(pte);
@@ -3171,9 +3235,10 @@ again:
* are about to unmap is the actual page of interest.
*/
if (ref_page) {
- if (page != ref_page)
- goto unlock;
-
+ if (page != ref_page) {
+ spin_unlock(ptl);
+ continue;
+ }
/*
* Mark the VMA as having unmapped its page so that
* future faults in this VMA will fail rather than
@@ -3189,30 +3254,14 @@ again:
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, true);
- force_flush = !__tlb_remove_page(tlb, page);
- if (force_flush) {
- address += sz;
- spin_unlock(ptl);
- break;
- }
- /* Bail out after unmapping reference page if supplied */
- if (ref_page) {
- spin_unlock(ptl);
- break;
- }
-unlock:
+
spin_unlock(ptl);
- }
- /*
- * mmu_gather ran out of room to batch pages, we break out of
- * the PTE lock to avoid doing the potential expensive TLB invalidate
- * and page-free while holding it.
- */
- if (force_flush) {
- force_flush = 0;
- tlb_flush_mmu(tlb);
- if (address < end && !ref_page)
- goto again;
+ tlb_remove_page_size(tlb, page, huge_page_size(h));
+ /*
+ * Bail out after unmapping reference page if supplied
+ */
+ if (ref_page)
+ break;
}
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
tlb_end_vma(tlb, vma);
@@ -3271,7 +3320,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
address = address & huge_page_mask(h);
pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
- mapping = file_inode(vma->vm_file)->i_mapping;
+ mapping = vma->vm_file->f_mapping;
/*
* Take the mapping lock for the duration of the table walk. As
@@ -3328,7 +3377,7 @@ retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
- page_move_anon_rmap(old_page, vma, address);
+ page_move_anon_rmap(old_page, vma);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
@@ -3346,7 +3395,7 @@ retry_avoidcopy:
old_page != pagecache_page)
outside_reserve = 1;
- page_cache_get(old_page);
+ get_page(old_page);
/*
* Drop page table lock as buddy allocator may be called. It will
@@ -3364,7 +3413,7 @@ retry_avoidcopy:
* may get SIGKILLed if it later faults.
*/
if (outside_reserve) {
- page_cache_release(old_page);
+ put_page(old_page);
BUG_ON(huge_pte_none(pte));
unmap_ref_private(mm, vma, old_page, address);
BUG_ON(huge_pte_none(pte));
@@ -3425,9 +3474,9 @@ retry_avoidcopy:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out_release_all:
- page_cache_release(new_page);
+ put_page(new_page);
out_release_old:
- page_cache_release(old_page);
+ put_page(old_page);
spin_lock(ptl); /* Caller expects lock to be held */
return ret;
@@ -3893,6 +3942,14 @@ same_page:
return i ? i : -EFAULT;
}
+#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
+/*
+ * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
+ * implement this.
+ */
+#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
+#endif
+
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
@@ -3953,7 +4010,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* once we release i_mmap_rwsem, another task can do the final put_page
* and that page table be reused and filled with junk.
*/
- flush_tlb_range(vma, start, end);
+ flush_hugetlb_tlb_range(vma, start, end);
mmu_notifier_invalidate_range(mm, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(mm, start, end);
@@ -4174,7 +4231,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (saddr) {
spte = huge_pte_offset(svma->vm_mm, saddr);
if (spte) {
- mm_inc_nr_pmds(mm);
get_page(virt_to_page(spte));
break;
}
@@ -4189,9 +4245,9 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
+ mm_inc_nr_pmds(mm);
} else {
put_page(virt_to_page(spte));
- mm_inc_nr_pmds(mm);
}
spin_unlock(ptl);
out:
@@ -4262,7 +4318,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
}
- BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+ BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
return pte;
}
@@ -4347,7 +4403,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
/*
* This function is called from memory failure code.
- * Assume the caller holds page lock of the head page.
*/
int dequeue_hwpoisoned_huge_page(struct page *hpage)
{