Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	127
1 file changed, 82 insertions, 45 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b49579c7f2a5..7d57af21f49e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -653,6 +653,7 @@ static void free_huge_page(struct page *page)
 	BUG_ON(page_count(page));
 	BUG_ON(page_mapcount(page));
 	restore_reserve = PagePrivate(page);
+	ClearPagePrivate(page);
 
 	spin_lock(&hugetlb_lock);
 	hugetlb_cgroup_uncharge_page(hstate_index(h),
@@ -695,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 	/* we rely on prep_new_huge_page to set the destructor */
 	set_compound_order(page, order);
 	__SetPageHead(page);
+	__ClearPageReserved(page);
 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
 		__SetPageTail(p);
+		/*
+		 * For gigantic hugepages allocated through bootmem at
+		 * boot, it's safer to be consistent with the not-gigantic
+		 * hugepages and clear the PG_reserved bit from all tail pages
+		 * too.  Otherwise drivers using get_user_pages() to access tail
+		 * pages may get the reference counting wrong if they see
+		 * PG_reserved set on a tail page (despite the head page not
+		 * having PG_reserved set).  Enforcing this consistency between
+		 * head and tail pages allows drivers to optimize away a check
+		 * on the head page when they need know if put_page() is needed
+		 * after get_user_pages().
+		 */
+		__ClearPageReserved(p);
 		set_page_count(p, 0);
 		p->first_page = page;
 	}
@@ -1329,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void)
 #else
 		page = virt_to_page(m);
 #endif
-		__ClearPageReserved(page);
 		WARN_ON(page_count(page) != 1);
 		prep_compound_huge_page(page, h->order);
+		WARN_ON(PageReserved(page));
 		prep_new_huge_page(h, page, page_to_nid(page));
 		/*
 		 * If we had gigantic hugepages allocated at boot time, we need
@@ -2361,6 +2376,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+		spinlock_t *src_ptl, *dst_ptl;
 		src_pte = huge_pte_offset(src, addr);
 		if (!src_pte)
 			continue;
@@ -2372,8 +2388,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		if (dst_pte == src_pte)
 			continue;
 
-		spin_lock(&dst->page_table_lock);
-		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
+		dst_ptl = huge_pte_lock(h, dst, dst_pte);
+		src_ptl = huge_pte_lockptr(h, src, src_pte);
+		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 		if (!huge_pte_none(huge_ptep_get(src_pte))) {
 			if (cow)
 				huge_ptep_set_wrprotect(src, addr, src_pte);
@@ -2383,8 +2400,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			page_dup_rmap(ptepage);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
-		spin_unlock(&src->page_table_lock);
-		spin_unlock(&dst->page_table_lock);
+		spin_unlock(src_ptl);
+		spin_unlock(dst_ptl);
 	}
 	return 0;
 
@@ -2427,6 +2444,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
+	spinlock_t *ptl;
 	struct page *page;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
@@ -2440,25 +2458,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 again:
-	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
 
+		ptl = huge_pte_lock(h, mm, ptep);
 		if (huge_pmd_unshare(mm, &address, ptep))
-			continue;
+			goto unlock;
 
 		pte = huge_ptep_get(ptep);
 		if (huge_pte_none(pte))
-			continue;
+			goto unlock;
 
 		/*
 		 * HWPoisoned hugepage is already unmapped and dropped reference
 		 */
 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
 			huge_pte_clear(mm, address, ptep);
-			continue;
+			goto unlock;
 		}
 
 		page = pte_page(pte);
@@ -2469,7 +2487,7 @@ again:
 		 */
 		if (ref_page) {
 			if (page != ref_page)
-				continue;
+				goto unlock;
 
 			/*
 			 * Mark the VMA as having unmapped its page so that
@@ -2486,13 +2504,18 @@ again:
 
 		page_remove_rmap(page);
 		force_flush = !__tlb_remove_page(tlb, page);
-		if (force_flush)
+		if (force_flush) {
+			spin_unlock(ptl);
 			break;
+		}
 		/* Bail out after unmapping reference page if supplied */
-		if (ref_page)
+		if (ref_page) {
+			spin_unlock(ptl);
 			break;
+		}
+unlock:
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	/*
 	 * mmu_gather ran out of room to batch pages, we break out of
 	 * the PTE lock to avoid doing the potential expensive TLB invalidate
@@ -2598,7 +2621,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte,
-			struct page *pagecache_page)
+			struct page *pagecache_page, spinlock_t *ptl)
 {
 	struct hstate *h = hstate_vma(vma);
 	struct page *old_page, *new_page;
@@ -2632,8 +2655,8 @@ retry_avoidcopy:
 
 	page_cache_get(old_page);
 
-	/* Drop page_table_lock as buddy allocator may be called */
-	spin_unlock(&mm->page_table_lock);
+	/* Drop page table lock as buddy allocator may be called */
+	spin_unlock(ptl);
 	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
@@ -2651,13 +2674,13 @@ retry_avoidcopy:
 			BUG_ON(huge_pte_none(pte));
 			if (unmap_ref_private(mm, vma, old_page, address)) {
 				BUG_ON(huge_pte_none(pte));
-				spin_lock(&mm->page_table_lock);
+				spin_lock(ptl);
 				ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 				if (likely(pte_same(huge_ptep_get(ptep), pte)))
 					goto retry_avoidcopy;
 				/*
-				 * race occurs while re-acquiring page_table_lock, and
-				 * our job is done.
+				 * race occurs while re-acquiring page table
+				 * lock, and our job is done.
 				 */
 				return 0;
 			}
@@ -2665,7 +2688,7 @@ retry_avoidcopy:
 		}
 
 		/* Caller expects lock to be held */
-		spin_lock(&mm->page_table_lock);
+		spin_lock(ptl);
 		if (err == -ENOMEM)
 			return VM_FAULT_OOM;
 		else
@@ -2680,7 +2703,7 @@ retry_avoidcopy:
 		page_cache_release(new_page);
 		page_cache_release(old_page);
 		/* Caller expects lock to be held */
-		spin_lock(&mm->page_table_lock);
+		spin_lock(ptl);
 		return VM_FAULT_OOM;
 	}
 
@@ -2692,10 +2715,10 @@ retry_avoidcopy:
 	mmun_end = mmun_start + huge_page_size(h);
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	/*
-	 * Retake the page_table_lock to check for racing updates
+	 * Retake the page table lock to check for racing updates
 	 * before the page tables are altered
 	 */
-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
 		ClearPagePrivate(new_page);
@@ -2709,13 +2732,13 @@ retry_avoidcopy:
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 
 	/* Caller expects lock to be held */
-	spin_lock(&mm->page_table_lock);
+	spin_lock(ptl);
 	return 0;
 }
 
@@ -2763,6 +2786,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page;
 	struct address_space *mapping;
 	pte_t new_pte;
+	spinlock_t *ptl;
 
 	/*
 	 * Currently, we are forced to kill the process in the event the
@@ -2849,7 +2873,8 @@ retry:
 			goto backout_unlocked;
 		}
 
-	spin_lock(&mm->page_table_lock);
+	ptl = huge_pte_lockptr(h, mm, ptep);
+	spin_lock(ptl);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size)
 		goto backout;
@@ -2870,16 +2895,16 @@ retry:
 
 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
 	}
 
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 	unlock_page(page);
 out:
 	return ret;
 
 backout:
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 backout_unlocked:
 	unlock_page(page);
 	put_page(page);
@@ -2891,6 +2916,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	pte_t *ptep;
 	pte_t entry;
+	spinlock_t *ptl;
 	int ret;
 	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
@@ -2903,7 +2929,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
-			migration_entry_wait_huge(mm, ptep);
+			migration_entry_wait_huge(vma, mm, ptep);
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
@@ -2959,17 +2985,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (page != pagecache_page)
 		lock_page(page);
 
-	spin_lock(&mm->page_table_lock);
+	ptl = huge_pte_lockptr(h, mm, ptep);
+	spin_lock(ptl);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
-		goto out_page_table_lock;
+		goto out_ptl;
 
 
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!huge_pte_write(entry)) {
 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
-							pagecache_page);
-			goto out_page_table_lock;
+					pagecache_page, ptl);
+			goto out_ptl;
 		}
 		entry = huge_pte_mkdirty(entry);
 	}
@@ -2978,8 +3005,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 						flags & FAULT_FLAG_WRITE))
 		update_mmu_cache(vma, address, ptep);
 
-out_page_table_lock:
-	spin_unlock(&mm->page_table_lock);
+out_ptl:
+	spin_unlock(ptl);
 
 	if (pagecache_page) {
 		unlock_page(pagecache_page);
@@ -3005,9 +3032,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long remainder = *nr_pages;
 	struct hstate *h = hstate_vma(vma);
 
-	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
+		spinlock_t *ptl = NULL;
 		int absent;
 		struct page *page;
 
@@ -3015,8 +3042,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * Some archs (sparc64, sh*) have multiple pte_ts to
 		 * each hugepage.  We have to make sure we get the
 		 * first, for the page indexing below to work.
+		 *
+		 * Note that page table lock is not held when pte is null.
 		 */
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+		if (pte)
+			ptl = huge_pte_lock(h, mm, pte);
 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
 
 		/*
@@ -3028,6 +3059,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		if (absent && (flags & FOLL_DUMP) &&
 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+			if (pte)
+				spin_unlock(ptl);
 			remainder = 0;
 			break;
 		}
@@ -3047,10 +3080,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		      !huge_pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
-			spin_unlock(&mm->page_table_lock);
+			if (pte)
+				spin_unlock(ptl);
 			ret = hugetlb_fault(mm, vma, vaddr,
 				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
-			spin_lock(&mm->page_table_lock);
 			if (!(ret & VM_FAULT_ERROR))
 				continue;
 
@@ -3081,8 +3114,8 @@ same_page:
 			 */
 			goto same_page;
 		}
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	*nr_pages = remainder;
 	*position = vaddr;
 
@@ -3103,13 +3136,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, address, end);
 
 	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
-	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += huge_page_size(h)) {
+		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
+		ptl = huge_pte_lock(h, mm, ptep);
 		if (huge_pmd_unshare(mm, &address, ptep)) {
 			pages++;
+			spin_unlock(ptl);
 			continue;
 		}
 		if (!huge_pte_none(huge_ptep_get(ptep))) {
@@ -3119,8 +3154,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 			set_huge_pte_at(mm, address, ptep, pte);
 			pages++;
 		}
+		spin_unlock(ptl);
 	}
-	spin_unlock(&mm->page_table_lock);
 	/*
 	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
 	 * may have cleared our pud entry and done put_page on the page table:
@@ -3283,6 +3318,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	unsigned long saddr;
 	pte_t *spte = NULL;
 	pte_t *pte;
+	spinlock_t *ptl;
 
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3305,13 +3341,14 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!spte)
 		goto out;
 
-	spin_lock(&mm->page_table_lock);
+	ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
+	spin_lock(ptl);
 	if (pud_none(*pud))
 		pud_populate(mm, pud,
 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
 	else
 		put_page(virt_to_page(spte));
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
 	mutex_unlock(&mapping->i_mmap_mutex);
@@ -3325,7 +3362,7 @@ out:
  * indicated by page_count > 1, unmap is achieved by clearing pud and
  * decrementing the ref count. If count == 1, the pte page is not shared.
  *
- * called with vma->vm_mm->page_table_lock held.
+ * called with page table lock held.
 *
 * returns: 1 successfully unmapped a shared pte page
 *	    0 the underlying pte page is not shared, or it is the last user
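
Note: the huge_pte_lock()/huge_pte_lockptr() helpers used throughout the hunks above are not part of this diff, since the diffstat is limited to mm/hugetlb.c; they come from the companion include/linux/hugetlb.h change in the split page table lock series. A minimal sketch of what they look like, assuming the pmd_lockptr() helper introduced by that series and a kernel configured with split PMD locks:

/*
 * Sketch only; the authoritative definitions live in
 * include/linux/hugetlb.h, not in this mm/hugetlb.c diff.
 */
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
					struct mm_struct *mm, pte_t *pte)
{
	/* PMD-sized hugepages can use the per-page-table-page lock. */
	if (huge_page_size(h) == PMD_SIZE)
		return pmd_lockptr(mm, (pmd_t *) pte);
	/* Larger hugepages still serialize on the per-mm lock. */
	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
	return &mm->page_table_lock;
}

static inline spinlock_t *huge_pte_lock(struct hstate *h,
					struct mm_struct *mm, pte_t *pte)
{
	spinlock_t *ptl;

	ptl = huge_pte_lockptr(h, mm, pte);
	spin_lock(ptl);
	return ptl;
}

This is why callers such as __unmap_hugepage_range(), hugetlb_fault() and hugetlb_cow() now pass around a spinlock_t *ptl instead of taking mm->page_table_lock directly: on configurations without split PMD locks the helper resolves to the same per-mm lock, so the conversion only changes which lock is used where split locks are available.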