Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	195
1 file changed, 136 insertions, 59 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a3..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
-#include <linux/io.h>
+#include <asm/tlb.h>
 
+#include <linux/io.h>
 #include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
+#include <linux/hugetlb_cgroup.h>
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
 
-static int max_hstate;
+int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
 
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
 static unsigned long __initdata default_hstate_size;
 
-#define for_each_hstate(h) \
-	for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
 
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
-	list_add(&page->lru, &h->hugepage_freelists[nid]);
+	list_move(&page->lru, &h->hugepage_freelists[nid]);
 	h->free_huge_pages++;
 	h->free_huge_pages_node[nid]++;
 }
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
 	if (list_empty(&h->hugepage_freelists[nid]))
 		return NULL;
 	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
-	list_del(&page->lru);
+	list_move(&page->lru, &h->hugepage_activelist);
 	set_page_refcounted(page);
 	h->free_huge_pages--;
 	h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 				1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1 << PG_writeback);
 	}
+	VM_BUG_ON(hugetlb_cgroup_from_page(page));
 	set_compound_page_dtor(page, NULL);
 	set_page_refcounted(page);
 	arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
 	BUG_ON(page_mapcount(page));
-	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
+	hugetlb_cgroup_uncharge_page(hstate_index(h),
+				     pages_per_huge_page(h), page);
 	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+		/* remove the page from active list */
+		list_del(&page->lru);
 		update_and_free_page(h, page);
 		h->surplus_huge_pages--;
 		h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
+	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, free_huge_page);
 	spin_lock(&hugetlb_lock);
+	set_hugetlb_cgroup(page, NULL);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
 	spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
+		INIT_LIST_HEAD(&page->lru);
 		r_nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
+		set_hugetlb_cgroup(page, NULL);
 		/*
 		 * We incremented the global counters already
 		 */
@@ -993,7 +1001,6 @@ retry:
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 		if ((--needed) < 0)
 			break;
-		list_del(&page->lru);
 		/*
 		 * This page is now managed by the hugetlb allocator and has
 		 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
 	/* Free unnecessary surplus pages to the buddy allocator */
 	if (!list_empty(&surplus_list)) {
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
-			list_del(&page->lru);
 			put_page(page);
 		}
 	}
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
 	long chg;
+	int ret, idx;
+	struct hugetlb_cgroup *h_cg;
 
+	idx = hstate_index(h);
 	/*
 	 * Processes that did not create the mapping will have no
 	 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
-		return ERR_PTR(-VM_FAULT_OOM);
+		return ERR_PTR(-ENOMEM);
 	if (chg)
 		if (hugepage_subpool_get_pages(spool, chg))
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
 
+	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+	if (ret) {
+		hugepage_subpool_put_pages(spool, chg);
+		return ERR_PTR(-ENOSPC);
+	}
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
-	spin_unlock(&hugetlb_lock);
-
-	if (!page) {
+	if (page) {
+		/* update page cgroup details */
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		spin_unlock(&hugetlb_lock);
+	} else {
+		spin_unlock(&hugetlb_lock);
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
+			hugetlb_cgroup_uncharge_cgroup(idx,
+						       pages_per_huge_page(h),
+						       h_cg);
 			hugepage_subpool_put_pages(spool, chg);
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
 		}
+		spin_lock(&hugetlb_lock);
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		list_move(&page->lru, &h->hugepage_activelist);
+		spin_unlock(&hugetlb_lock);
 	}
 
 	set_page_private(page, (unsigned long)spool);
 
 	vma_commit_reservation(h, vma, addr);
-
 	return page;
 }
 
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
 				    struct attribute_group *hstate_attr_group)
 {
 	int retval;
-	int hi = h - hstates;
+	int hi = hstate_index(h);
 
 	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
 	if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
 	if (!nhs->hugepages_kobj)
 		return;		/* no hstate attributes */
 
-	for_each_hstate(h)
-		if (nhs->hstate_kobjs[h - hstates]) {
-			kobject_put(nhs->hstate_kobjs[h - hstates]);
-			nhs->hstate_kobjs[h - hstates] = NULL;
+	for_each_hstate(h) {
+		int idx = hstate_index(h);
+		if (nhs->hstate_kobjs[idx]) {
+			kobject_put(nhs->hstate_kobjs[idx]);
+			nhs->hstate_kobjs[idx] = NULL;
 		}
+	}
 
 	kobject_put(nhs->hugepages_kobj);
 	nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
 	hugetlb_unregister_all_nodes();
 
 	for_each_hstate(h) {
-		kobject_put(hstate_kobjs[h - hstates]);
+		kobject_put(hstate_kobjs[hstate_index(h)]);
 	}
 
 	kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
 		if (!size_to_hstate(default_hstate_size))
 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
 	}
-	default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
 	if (default_hstate_max_huge_pages)
 		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
 		printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
 		return;
 	}
-	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
 	BUG_ON(order == 0);
-	h = &hstates[max_hstate++];
+	h = &hstates[hugetlb_max_hstate++];
 	h->order = order;
 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
 	h->nr_huge_pages = 0;
 	h->free_huge_pages = 0;
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+	INIT_LIST_HEAD(&h->hugepage_activelist);
 	h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
 	h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
+	/*
+	 * Add cgroup control files only if the huge page consists
+	 * of more than two normal pages. This is because we use
+	 * page[2].lru.next for storing cgoup details.
+	 */
+	if (order >= HUGETLB_CGROUP_MIN_ORDER)
+		hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
 
 	parsed_hstate = h;
 }
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
 	static unsigned long *last_mhp;
 
 	/*
-	 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
 	 * so this hugepages= parameter goes to the "default hstate".
 	 */
-	if (!max_hstate)
+	if (!hugetlb_max_hstate)
 		mhp = &default_hstate_max_huge_pages;
 	else
 		mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
 	 * But we need to allocate >= MAX_ORDER hstates here early to still
 	 * use the bootmem allocator.
 	 */
-	if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
 		hugetlb_hstate_alloc_pages(parsed_hstate);
 
 	last_mhp = mhp;
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 		return 0;
 }
 
-void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end, struct page *ref_page)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			    unsigned long start, unsigned long end,
+			    struct page *ref_page)
 {
+	int force_flush = 0;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
 	struct page *page;
-	struct page *tmp;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 
-	/*
-	 * A page gathering list, protected by per file i_mmap_mutex. The
-	 * lock is used to avoid list corruption from multiple unmapping
-	 * of the same page since we are using page->lru.
-	 */
-	LIST_HEAD(page_list);
-
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
+	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, start, end);
+again:
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		}
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
+		tlb_remove_tlb_entry(tlb, ptep, address);
 		if (pte_dirty(pte))
 			set_page_dirty(page);
-		list_add(&page->lru, &page_list);
 
+		page_remove_rmap(page);
+		force_flush = !__tlb_remove_page(tlb, page);
+		if (force_flush)
+			break;
 		/* Bail out after unmapping reference page if supplied */
 		if (ref_page)
 			break;
 	}
-	flush_tlb_range(vma, start, end);
 	spin_unlock(&mm->page_table_lock);
-	mmu_notifier_invalidate_range_end(mm, start, end);
-	list_for_each_entry_safe(page, tmp, &page_list, lru) {
-		page_remove_rmap(page);
-		list_del(&page->lru);
-		put_page(page);
+	/*
+	 * mmu_gather ran out of room to batch pages, we break out of
+	 * the PTE lock to avoid doing the potential expensive TLB invalidate
+	 * and page-free while holding it.
+	 */
+	if (force_flush) {
+		force_flush = 0;
+		tlb_flush_mmu(tlb);
+		if (address < end && !ref_page)
+			goto again;
 	}
+	mmu_notifier_invalidate_range_end(mm, start, end);
+	tlb_end_vma(tlb, vma);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+			  struct vm_area_struct *vma, unsigned long start,
+			  unsigned long end, struct page *ref_page)
+{
+	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+	/*
+	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
+	 * test will fail on a vma being torn down, and not grab a page table
+	 * on its way out.  We're lucky that the flag has such an appropriate
+	 * name, and can in fact be safely cleared here. We could clear it
+	 * before the __unmap_hugepage_range above, but all that's necessary
+	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * because in the context this is called, the VMA is about to be
+	 * destroyed and the i_mmap_mutex is held.
+	 */
+	vma->vm_flags &= ~VM_MAYSHARE;
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end, struct page *ref_page)
 {
-	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
-	__unmap_hugepage_range(vma, start, end, ref_page);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	struct mm_struct *mm;
+	struct mmu_gather tlb;
+
+	mm = vma->vm_mm;
+
+	tlb_gather_mmu(&tlb, mm, 0);
+	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+	tlb_finish_mmu(&tlb, start, end);
 }
 
 /*
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * from the time of fork. This would look like data corruption
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
-			__unmap_hugepage_range(iter_vma,
-				address, address + huge_page_size(h),
-				page);
+			unmap_hugepage_range(iter_vma, address,
+					     address + huge_page_size(h), page);
 	}
 	mutex_unlock(&mapping->i_mmap_mutex);
 
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
 	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
+		long err = PTR_ERR(new_page);
 		page_cache_release(old_page);
 
 		/*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
 
 		/* Caller expects lock to be held */
 		spin_lock(&mm->page_table_lock);
-		return -PTR_ERR(new_page);
+		if (err == -ENOMEM)
+			return VM_FAULT_OOM;
+		else
+			return VM_FAULT_SIGBUS;
 	}
 
 	/*
@@ -2642,7 +2710,11 @@ retry:
 			goto out;
 		page = alloc_huge_page(vma, address, 0);
 		if (IS_ERR(page)) {
-			ret = -PTR_ERR(page);
+			ret = PTR_ERR(page);
+			if (ret == -ENOMEM)
+				ret = VM_FAULT_OOM;
+			else
+				ret = VM_FAULT_SIGBUS;
 			goto out;
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
 		 */
 		if (unlikely(PageHWPoison(page))) {
 			ret = VM_FAULT_HWPOISON |
-			      VM_FAULT_SET_HINDEX(h - hstates);
+				VM_FAULT_SET_HINDEX(hstate_index(h));
 			goto backout_unlocked;
 		}
 	}
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
-			       VM_FAULT_SET_HINDEX(h - hstates);
+				VM_FAULT_SET_HINDEX(hstate_index(h));
 	}
 
 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+	/*
+	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * may have cleared our pud entry and done put_page on the page table:
+	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * and that page table be reused and filled with junk.
+	 */
 	flush_tlb_range(vma, start, end);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,
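Note on the error-path change above (an illustrative sketch, not part of the commit): alloc_huge_page() now reports failure through the kernel's ERR_PTR() errno convention instead of negated VM_FAULT_* values; it returns -ENOMEM for a reservation-map error and -ENOSPC when the subpool or hugetlb cgroup charge fails, and the callers in hugetlb_cow() and hugetlb_no_page() translate those into VM_FAULT_OOM or VM_FAULT_SIGBUS. The standalone C sketch below only demonstrates that convention; the ERR_PTR/PTR_ERR/IS_ERR macros and VM_FAULT_* values are simplified userspace stand-ins for the kernel helpers, and fake_alloc_huge_page()/fault_code() are invented for the example.

/*
 * Userspace sketch of the ERR_PTR errno convention adopted by
 * alloc_huge_page() and the VM_FAULT_* translation done by its callers.
 * All definitions here are simplified stand-ins, not kernel code.
 */
#include <errno.h>
#include <stdio.h>

#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-4095)

#define VM_FAULT_OOM	0x0001
#define VM_FAULT_SIGBUS	0x0002

static char fake_huge_page[4096];

/* Fail the way the patched alloc_huge_page() fails. */
static void *fake_alloc_huge_page(int reservation_ok, int within_limits)
{
	if (!reservation_ok)
		return ERR_PTR(-ENOMEM);	/* reservation map error */
	if (!within_limits)
		return ERR_PTR(-ENOSPC);	/* subpool or cgroup limit hit */
	return fake_huge_page;			/* "allocated" huge page */
}

/* Mirror of the translation the fault paths now perform. */
static int fault_code(void *page)
{
	if (!IS_ERR(page))
		return 0;
	return PTR_ERR(page) == -ENOMEM ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
}

int main(void)
{
	printf("success    -> %d\n", fault_code(fake_alloc_huge_page(1, 1)));
	printf("no reserve -> OOM? %d\n",
	       fault_code(fake_alloc_huge_page(0, 1)) == VM_FAULT_OOM);
	printf("over limit -> SIGBUS? %d\n",
	       fault_code(fake_alloc_huge_page(1, 0)) == VM_FAULT_SIGBUS);
	return 0;
}

Distinguishing -ENOMEM from -ENOSPC is what lets a subpool or cgroup limit hit surface as SIGBUS to the faulting task instead of going through the OOM path.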