Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	186
1 file changed, 150 insertions, 36 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 999fb0aef8f1..827bb02a43a4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -994,23 +994,22 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 
 #if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
 static void destroy_compound_gigantic_page(struct page *page,
-					unsigned long order)
+					unsigned int order)
 {
 	int i;
 	int nr_pages = 1 << order;
 	struct page *p = page + 1;
 
 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
-		__ClearPageTail(p);
+		clear_compound_head(p);
 		set_page_refcounted(p);
-		p->first_page = NULL;
 	}
 
 	set_compound_order(page, 0);
 	__ClearPageHead(page);
 }
 
-static void free_gigantic_page(struct page *page, unsigned order)
+static void free_gigantic_page(struct page *page, unsigned int order)
 {
 	free_contig_range(page_to_pfn(page), 1 << order);
 }
@@ -1054,7 +1053,7 @@ static bool zone_spans_last_pfn(const struct zone *zone,
 	return zone_spans_pfn(zone, last_pfn);
 }
 
-static struct page *alloc_gigantic_page(int nid, unsigned order)
+static struct page *alloc_gigantic_page(int nid, unsigned int order)
 {
 	unsigned long nr_pages = 1 << order;
 	unsigned long ret, pfn, flags;
@@ -1090,7 +1089,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned order)
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+static void prep_compound_gigantic_page(struct page *page, unsigned int order);
 
 static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
 {
@@ -1123,9 +1122,9 @@ static int alloc_fresh_gigantic_page(struct hstate *h,
 static inline bool gigantic_page_supported(void) { return true; }
 #else
 static inline bool gigantic_page_supported(void) { return false; }
-static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 static inline void destroy_compound_gigantic_page(struct page *page,
-						unsigned long order) { }
+						unsigned int order) { }
 static inline int alloc_fresh_gigantic_page(struct hstate *h,
 					nodemask_t *nodes_allowed) { return 0; }
 #endif
@@ -1146,7 +1145,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 				1 << PG_writeback);
 	}
 	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
-	set_compound_page_dtor(page, NULL);
+	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
 	set_page_refcounted(page);
 	if (hstate_is_gigantic(h)) {
 		destroy_compound_gigantic_page(page, huge_page_order(h));
@@ -1242,7 +1241,7 @@ void free_huge_page(struct page *page)
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
 	INIT_LIST_HEAD(&page->lru);
-	set_compound_page_dtor(page, free_huge_page);
+	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 	spin_lock(&hugetlb_lock);
 	set_hugetlb_cgroup(page, NULL);
 	h->nr_huge_pages++;
@@ -1251,7 +1250,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 	put_page(page); /* free it into the hugepage allocator */
 }
 
-static void prep_compound_gigantic_page(struct page *page, unsigned long order)
+static void prep_compound_gigantic_page(struct page *page, unsigned int order)
 {
 	int i;
 	int nr_pages = 1 << order;
@@ -1276,10 +1275,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 		 */
 		__ClearPageReserved(p);
 		set_page_count(p, 0);
-		p->first_page = page;
-		/* Make sure p->first_page is always valid for PageTail() */
-		smp_wmb();
-		__SetPageTail(p);
+		set_compound_head(p, page);
 	}
 }
 
@@ -1294,7 +1290,7 @@ int PageHuge(struct page *page)
 		return 0;
 
 	page = compound_head(page);
-	return get_compound_page_dtor(page) == free_huge_page;
+	return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
 }
 EXPORT_SYMBOL_GPL(PageHuge);
 
@@ -1437,7 +1433,82 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 		dissolve_free_huge_page(pfn_to_page(pfn));
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
+/*
+ * There are 3 ways this can get called:
+ * 1. With vma+addr: we use the VMA's memory policy
+ * 2. With !vma, but nid=NUMA_NO_NODE:  We try to allocate a huge
+ *    page from any node, and let the buddy allocator itself figure
+ *    it out.
+ * 3. With !vma, but nid!=NUMA_NO_NODE.  We allocate a huge page
+ *    strictly from 'nid'
+ */
+static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
+		struct vm_area_struct *vma, unsigned long addr, int nid)
+{
+	int order = huge_page_order(h);
+	gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
+	unsigned int cpuset_mems_cookie;
+
+	/*
+	 * We need a VMA to get a memory policy.  If we do not
+	 * have one, we use the 'nid' argument.
+	 *
+	 * The mempolicy stuff below has some non-inlined bits
+	 * and calls ->vm_ops.  That makes it hard to optimize at
+	 * compile-time, even when NUMA is off and it does
+	 * nothing.  This helps the compiler optimize it out.
+	 */
+	if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
+		/*
+		 * If a specific node is requested, make sure to
+		 * get memory from there, but only when a node
+		 * is explicitly specified.
+		 */
+		if (nid != NUMA_NO_NODE)
+			gfp |= __GFP_THISNODE;
+		/*
+		 * Make sure to call something that can handle
+		 * nid=NUMA_NO_NODE
+		 */
+		return alloc_pages_node(nid, gfp, order);
+	}
+
+	/*
+	 * OK, so we have a VMA.  Fetch the mempolicy and try to
+	 * allocate a huge page with it.  We will only reach this
+	 * when CONFIG_NUMA=y.
+	 */
+	do {
+		struct page *page;
+		struct mempolicy *mpol;
+		struct zonelist *zl;
+		nodemask_t *nodemask;
+
+		cpuset_mems_cookie = read_mems_allowed_begin();
+		zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
+		mpol_cond_put(mpol);
+		page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
+		if (page)
+			return page;
+	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+	return NULL;
+}
+
+/*
+ * There are two ways to allocate a huge page:
+ * 1. When you have a VMA and an address (like a fault)
+ * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
+ *
+ * 'vma' and 'addr' are only for (1).  'nid' is always NUMA_NO_NODE in
+ * this case which signifies that the allocation should be done with
+ * respect for the VMA's memory policy.
+ *
+ * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
+ * implies that memory policies will not be taken in to account.
+ */
+static struct page *__alloc_buddy_huge_page(struct hstate *h,
+		struct vm_area_struct *vma, unsigned long addr, int nid)
 {
 	struct page *page;
 	unsigned int r_nid;
@@ -1446,6 +1517,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 		return NULL;
 
 	/*
+	 * Make sure that anyone specifying 'nid' is not also specifying a VMA.
+	 * This makes sure the caller is picking _one_ of the modes with which
+	 * we can call this function, not both.
+	 */
+	if (vma || (addr != -1)) {
+		VM_WARN_ON_ONCE(addr == -1);
+		VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
+	}
+	/*
 	 * Assume we will successfully allocate the surplus page to
 	 * prevent racing processes from causing the surplus to exceed
 	 * overcommit
@@ -1478,20 +1558,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 	}
 	spin_unlock(&hugetlb_lock);
 
-	if (nid == NUMA_NO_NODE)
-		page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
-				   __GFP_REPEAT|__GFP_NOWARN,
-				   huge_page_order(h));
-	else
-		page = __alloc_pages_node(nid,
-			htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
-			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
+	page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
 		INIT_LIST_HEAD(&page->lru);
 		r_nid = page_to_nid(page);
-		set_compound_page_dtor(page, free_huge_page);
+		set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
 		set_hugetlb_cgroup(page, NULL);
 		/*
 		 * We incremented the global counters already
@@ -1510,6 +1583,29 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 }
 
 /*
+ * Allocate a huge page from 'nid'.  Note, 'nid' may be
+ * NUMA_NO_NODE, which means that it may be allocated
+ * anywhere.
+ */
+static
+struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
+{
+	unsigned long addr = -1;
+
+	return __alloc_buddy_huge_page(h, NULL, addr, nid);
+}
+
+/*
+ * Use the VMA's mpolicy to allocate a huge page from the buddy.
+ */
+static
+struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+		struct vm_area_struct *vma, unsigned long addr)
+{
+	return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+}
+
+/*
  * This allocation function is useful in the context where vma is irrelevant.
  * E.g. soft-offlining uses this function because it only cares physical
  * address of error page.
@@ -1524,7 +1620,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
 	spin_unlock(&hugetlb_lock);
 
 	if (!page)
-		page = alloc_buddy_huge_page(h, nid);
+		page = __alloc_buddy_huge_page_no_mpol(h, nid);
 
 	return page;
 }
@@ -1554,7 +1650,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+		page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
 		if (!page) {
 			alloc_ok = false;
 			break;
@@ -1787,7 +1883,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
 	if (!page) {
 		spin_unlock(&hugetlb_lock);
-		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+		page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
 		if (!page)
 			goto out_uncharge_cgroup;
 
@@ -1872,7 +1968,8 @@ found:
 	return 1;
 }
 
-static void __init prep_compound_huge_page(struct page *page, int order)
+static void __init prep_compound_huge_page(struct page *page,
+		unsigned int order)
 {
 	if (unlikely(order > (MAX_ORDER - 1)))
 		prep_compound_gigantic_page(page, order);
@@ -2041,7 +2138,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
 	 * First take pages out of surplus state.  Then make up the
 	 * remaining difference by allocating fresh huge pages.
 	 *
-	 * We might race with alloc_buddy_huge_page() here and be unable
+	 * We might race with __alloc_buddy_huge_page() here and be unable
 	 * to convert a surplus huge page to a normal huge page. That is
 	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
@@ -2083,7 +2180,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
 	 * By placing pages into the surplus state independent of the
 	 * overcommit value, we are allowing the surplus pool size to
 	 * exceed overcommit. There are few sane options here. Since
-	 * alloc_buddy_huge_page() is checking the global counter,
+	 * __alloc_buddy_huge_page() is checking the global counter,
 	 * though, we'll note that we're not allowed to exceed surplus
 	 * and won't grow the pool anywhere else. Not until one of the
 	 * sysctls are changed, or the surplus pages go out of use.
@@ -2376,7 +2473,7 @@ struct node_hstate {
 	struct kobject		*hugepages_kobj;
 	struct kobject		*hstate_kobjs[HUGE_MAX_HSTATE];
 };
-struct node_hstate node_hstates[MAX_NUMNODES];
+static struct node_hstate node_hstates[MAX_NUMNODES];
 
 /*
  * A subset of global hstate attributes for node devices
@@ -2583,7 +2680,7 @@ static int __init hugetlb_init(void)
 module_init(hugetlb_init);
 
 /* Should be called on processing a hugepagesz=... option */
-void __init hugetlb_add_hstate(unsigned order)
+void __init hugetlb_add_hstate(unsigned int order)
 {
 	struct hstate *h;
 	unsigned long i;
@@ -2790,6 +2887,12 @@ void hugetlb_show_meminfo(void)
 				1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
 }
 
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
+{
+	seq_printf(m, "HugetlbPages:\t%8lu kB\n",
+		   atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
+}
+
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
@@ -3025,6 +3128,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			get_page(ptepage);
 			page_dup_rmap(ptepage);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
+			hugetlb_count_add(pages_per_huge_page(h), dst);
 		}
 		spin_unlock(src_ptl);
 		spin_unlock(dst_ptl);
@@ -3105,6 +3209,7 @@ again:
 		if (huge_pte_dirty(pte))
 			set_page_dirty(page);
 
+		hugetlb_count_sub(pages_per_huge_page(h), mm);
 		page_remove_rmap(page);
 		force_flush = !__tlb_remove_page(tlb, page);
 		if (force_flush) {
@@ -3202,6 +3307,14 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 			continue;
 
 		/*
+		 * Shared VMAs have their own reserves and do not affect
+		 * MAP_PRIVATE accounting but it is possible that a shared
+		 * VMA is using the same page so check and skip such VMAs.
+		 */
+		if (iter_vma->vm_flags & VM_MAYSHARE)
+			continue;
+
+		/*
 		 * Unmap the page from other VMAs without their own reserves.
 		 * They get marked to be SIGKILLed if they fault in these
 		 * areas. This is because a future no-page fault on this VMA
@@ -3501,6 +3614,7 @@ retry:
 				&& (vma->vm_flags & VM_SHARED)));
 	set_huge_pte_at(mm, address, ptep, new_pte);
 
+	hugetlb_count_add(pages_per_huge_page(h), mm);
 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
@@ -4020,8 +4134,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
 	unsigned long s_end = sbase + PUD_SIZE;
 
 	/* Allow segments to share if only one is marked locked */
-	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
-	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+	unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+	unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
 
 	/*
 	 * match the virtual addresses, permission and the alignment of the