Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	466
1 file changed, 197 insertions, 269 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 23ef240ba48a..6be78e7d4f6e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1517,7 +1517,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,  	struct page *p;  	atomic_set(&folio->_entire_mapcount, 0); -	atomic_set(&folio->_nr_pages_mapped, 0); +	atomic_set(&folio->_large_mapcount, 0);  	atomic_set(&folio->_pincount, 0);  	for (i = 1; i < nr_pages; i++) { @@ -1619,19 +1619,11 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,  						unsigned int order) { }  #endif -static inline void __clear_hugetlb_destructor(struct hstate *h, -						struct folio *folio) -{ -	lockdep_assert_held(&hugetlb_lock); - -	folio_clear_hugetlb(folio); -} -  /*   * Remove hugetlb folio from lists. - * If vmemmap exists for the folio, update dtor so that the folio appears - * as just a compound page.  Otherwise, wait until after allocating vmemmap - * to update dtor. + * If vmemmap exists for the folio, clear the hugetlb flag so that the + * folio appears as just a compound page.  Otherwise, wait until after + * allocating vmemmap to clear the flag.   *   * A reference is held on the folio, except in the case of demote.   * @@ -1662,12 +1654,12 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,  	}  	/* -	 * We can only clear the hugetlb destructor after allocating vmemmap +	 * We can only clear the hugetlb flag after allocating vmemmap  	 * pages.  Otherwise, someone (memory error handling) may try to write  	 * to tail struct pages.  	 */  	if (!folio_test_hugetlb_vmemmap_optimized(folio)) -		__clear_hugetlb_destructor(h, folio); +		__folio_clear_hugetlb(folio);  	 /*  	  * In the case of demote we do not ref count the page as it will soon @@ -1711,7 +1703,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,  		h->surplus_huge_pages_node[nid]++;  	} -	folio_set_hugetlb(folio); +	__folio_set_hugetlb(folio);  	folio_change_private(folio, NULL);  	/*  	 * We have to set hugetlb_vmemmap_optimized again as above @@ -1734,14 +1726,14 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,  		 */  		return; -	arch_clear_hugepage_flags(&folio->page); +	arch_clear_hugetlb_flags(folio);  	enqueue_hugetlb_folio(h, folio);  }  static void __update_and_free_hugetlb_folio(struct hstate *h,  						struct folio *folio)  { -	bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio); +	bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);  	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())  		return; @@ -1754,11 +1746,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,  		return;  	/* -	 * If folio is not vmemmap optimized (!clear_dtor), then the folio +	 * If folio is not vmemmap optimized (!clear_flag), then the folio  	 * is no longer identified as a hugetlb page.  hugetlb_vmemmap_restore_folio  	 * can only be passed hugetlb pages and will BUG otherwise.  	 */ -	if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) { +	if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) {  		spin_lock_irq(&hugetlb_lock);  		/*  		 * If we cannot allocate vmemmap pages, just refuse to free the @@ -1779,11 +1771,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,  	/*  	 * If vmemmap pages were allocated above, then we need to clear the -	 * hugetlb destructor under the hugetlb lock. +	 * hugetlb flag under the hugetlb lock.  	 
*/ -	if (clear_dtor) { +	if (folio_test_hugetlb(folio)) {  		spin_lock_irq(&hugetlb_lock); -		__clear_hugetlb_destructor(h, folio); +		__folio_clear_hugetlb(folio);  		spin_unlock_irq(&hugetlb_lock);  	} @@ -1796,7 +1788,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,  		destroy_compound_gigantic_folio(folio, huge_page_order(h));  		free_gigantic_folio(folio, huge_page_order(h));  	} else { -		__free_pages(&folio->page, huge_page_order(h)); +		INIT_LIST_HEAD(&folio->_deferred_list); +		folio_put(folio);  	}  } @@ -1884,7 +1877,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h,  		list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {  			list_del(&folio->lru);  			spin_lock_irq(&hugetlb_lock); -			__clear_hugetlb_destructor(h, folio); +			__folio_clear_hugetlb(folio);  			spin_unlock_irq(&hugetlb_lock);  			update_and_free_hugetlb_folio(h, folio, false);  			cond_resched(); @@ -1909,7 +1902,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h,  			} else {  				list_del(&folio->lru);  				spin_lock_irq(&hugetlb_lock); -				__clear_hugetlb_destructor(h, folio); +				__folio_clear_hugetlb(folio);  				spin_unlock_irq(&hugetlb_lock);  				update_and_free_hugetlb_folio(h, folio, false);  				cond_resched(); @@ -1942,14 +1935,14 @@ retry:  	 * should only be pages on the non_hvo_folios list.  	 * Do note that the non_hvo_folios list could be empty.  	 * Without HVO enabled, ret will be 0 and there is no need to call -	 * __clear_hugetlb_destructor as this was done previously. +	 * __folio_clear_hugetlb as this was done previously.  	 */  	VM_WARN_ON(!list_empty(folio_list));  	VM_WARN_ON(ret < 0);  	if (!list_empty(&non_hvo_folios) && ret) {  		spin_lock_irq(&hugetlb_lock);  		list_for_each_entry(folio, &non_hvo_folios, lru) -			__clear_hugetlb_destructor(h, folio); +			__folio_clear_hugetlb(folio);  		spin_unlock_irq(&hugetlb_lock);  	} @@ -1974,7 +1967,7 @@ void free_huge_folio(struct folio *folio)  {  	/*  	 * Can't pass hstate in here because it is called from the -	 * compound page destructor. +	 * generic mm code.  	 
*/  	struct hstate *h = folio_hstate(folio);  	int nid = folio_nid(folio); @@ -2031,7 +2024,7 @@ void free_huge_folio(struct folio *folio)  		spin_unlock_irqrestore(&hugetlb_lock, flags);  		update_and_free_hugetlb_folio(h, folio, true);  	} else { -		arch_clear_hugepage_flags(&folio->page); +		arch_clear_hugetlb_flags(folio);  		enqueue_hugetlb_folio(h, folio);  		spin_unlock_irqrestore(&hugetlb_lock, flags);  	} @@ -2049,7 +2042,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)  static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio)  { -	folio_set_hugetlb(folio); +	__folio_set_hugetlb(folio);  	INIT_LIST_HEAD(&folio->lru);  	hugetlb_set_folio_subpool(folio, NULL);  	set_hugetlb_cgroup(folio, NULL); @@ -2124,10 +2117,10 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,  			set_compound_head(p, &folio->page);  	}  	__folio_set_head(folio); -	/* we rely on prep_new_hugetlb_folio to set the destructor */ +	/* we rely on prep_new_hugetlb_folio to set the hugetlb flag */  	folio_set_order(folio, order);  	atomic_set(&folio->_entire_mapcount, -1); -	atomic_set(&folio->_nr_pages_mapped, 0); +	atomic_set(&folio->_large_mapcount, -1);  	atomic_set(&folio->_pincount, 0);  	return true; @@ -2160,31 +2153,15 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,  }  /* - * PageHuge() only returns true for hugetlbfs pages, but not for normal or - * transparent huge pages.  See the PageTransHuge() documentation for more - * details. - */ -int PageHuge(const struct page *page) -{ -	const struct folio *folio; - -	if (!PageCompound(page)) -		return 0; -	folio = page_folio(page); -	return folio_test_hugetlb(folio); -} -EXPORT_SYMBOL_GPL(PageHuge); - -/*   * Find and lock address space (mapping) in write mode.   * - * Upon entry, the page is locked which means that page_mapping() is + * Upon entry, the folio is locked which means that folio_mapping() is   * stable.  Due to locking order, we can only trylock_write.  If we can   * not get the lock, simply return NULL to caller.   */ -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)  { -	struct address_space *mapping = page_mapping(hpage); +	struct address_space *mapping = folio_mapping(folio);  	if (!mapping)  		return mapping; @@ -2200,13 +2177,13 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,  		nodemask_t *node_alloc_noretry)  {  	int order = huge_page_order(h); -	struct page *page; +	struct folio *folio;  	bool alloc_try_hard = true;  	bool retry = true;  	/* -	 * By default we always try hard to allocate the page with -	 * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating pages in +	 * By default we always try hard to allocate the folio with +	 * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating folios in  	 * a loop (to adjust global huge page counts) and previous allocation  	 * failed, do not continue to try hard on the same node.  Use the  	 * node_alloc_noretry bitmap to manage this state information. 
@@ -2219,43 +2196,42 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,  	if (nid == NUMA_NO_NODE)  		nid = numa_mem_id();  retry: -	page = __alloc_pages(gfp_mask, order, nid, nmask); +	folio = __folio_alloc(gfp_mask, order, nid, nmask); -	/* Freeze head page */ -	if (page && !page_ref_freeze(page, 1)) { -		__free_pages(page, order); +	if (folio && !folio_ref_freeze(folio, 1)) { +		folio_put(folio);  		if (retry) {	/* retry once */  			retry = false;  			goto retry;  		}  		/* WOW!  twice in a row. */ -		pr_warn("HugeTLB head page unexpected inflated ref count\n"); -		page = NULL; +		pr_warn("HugeTLB unexpected inflated folio ref count\n"); +		folio = NULL;  	}  	/* -	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this -	 * indicates an overall state change.  Clear bit so that we resume -	 * normal 'try hard' allocations. +	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a +	 * folio this indicates an overall state change.  Clear bit so +	 * that we resume normal 'try hard' allocations.  	 */ -	if (node_alloc_noretry && page && !alloc_try_hard) +	if (node_alloc_noretry && folio && !alloc_try_hard)  		node_clear(nid, *node_alloc_noretry);  	/* -	 * If we tried hard to get a page but failed, set bit so that +	 * If we tried hard to get a folio but failed, set bit so that  	 * subsequent attempts will not try as hard until there is an  	 * overall state change.  	 */ -	if (node_alloc_noretry && !page && alloc_try_hard) +	if (node_alloc_noretry && !folio && alloc_try_hard)  		node_set(nid, *node_alloc_noretry); -	if (!page) { +	if (!folio) {  		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);  		return NULL;  	}  	__count_vm_event(HTLB_BUDDY_PGALLOC); -	return page_folio(page); +	return folio;  }  static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, @@ -2401,8 +2377,8 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h,  }  /* - * Dissolve a given free hugepage into free buddy pages. This function does - * nothing for in-use hugepages and non-hugepages. + * Dissolve a given free hugetlb folio into free buddy pages. This function + * does nothing for in-use hugetlb folios and non-hugetlb folios.   * This function returns values like below:   *   *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages @@ -2414,10 +2390,9 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h,   *       0:  successfully dissolved free hugepages or the page is not a   *           hugepage (considered as already dissolved)   */ -int dissolve_free_huge_page(struct page *page) +int dissolve_free_hugetlb_folio(struct folio *folio)  {  	int rc = -EBUSY; -	struct folio *folio = page_folio(page);  retry:  	/* Not to disrupt normal path by vainly holding hugetlb_lock */ @@ -2494,13 +2469,13 @@ out:   * make specified memory blocks removable from the system.   * Note that this will dissolve a free gigantic hugepage completely, if any   * part of it lies within the given range. - * Also note that if dissolve_free_huge_page() returns with an error, all - * free hugepages that were dissolved before that error are lost. + * Also note that if dissolve_free_hugetlb_folio() returns with an error, all + * free hugetlb folios that were dissolved before that error are lost.   
*/ -int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)  {  	unsigned long pfn; -	struct page *page; +	struct folio *folio;  	int rc = 0;  	unsigned int order;  	struct hstate *h; @@ -2513,8 +2488,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)  		order = min(order, huge_page_order(h));  	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { -		page = pfn_to_page(pfn); -		rc = dissolve_free_huge_page(page); +		folio = pfn_folio(pfn); +		rc = dissolve_free_hugetlb_folio(folio);  		if (rc)  			break;  	} @@ -2621,7 +2596,7 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,  /* folio migration callback function */  struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, -		nodemask_t *nmask, gfp_t gfp_mask) +		nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback)  {  	spin_lock_irq(&hugetlb_lock);  	if (available_huge_pages(h)) { @@ -2636,6 +2611,10 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,  	}  	spin_unlock_irq(&hugetlb_lock); +	/* We cannot fallback to other nodes, as we could break the per-node pool. */ +	if (!allow_alloc_fallback) +		gfp_mask |= __GFP_THISNODE; +  	return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);  } @@ -3268,9 +3247,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,  		rsv_adjust = hugepage_subpool_put_pages(spool, 1);  		hugetlb_acct_memory(h, -rsv_adjust); -		if (deferred_reserve) +		if (deferred_reserve) { +			spin_lock_irq(&hugetlb_lock);  			hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),  					pages_per_huge_page(h), folio); +			spin_unlock_irq(&hugetlb_lock); +		}  	}  	if (!memcg_charge_ret) @@ -5045,7 +5027,6 @@ static struct ctl_table hugetlb_table[] = {  		.mode		= 0644,  		.proc_handler	= hugetlb_overcommit_handler,  	}, -	{ }  };  static void hugetlb_sysctl_init(void) @@ -5936,19 +5917,18 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,   * cannot race with other handlers or page migration.   * Keep the pte_same checks anyway to make transition from the mutex easier.   */ -static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, -		       unsigned long address, pte_t *ptep, unsigned int flags, -		       struct folio *pagecache_folio, spinlock_t *ptl, +static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,  		       struct vm_fault *vmf)  { -	const bool unshare = flags & FAULT_FLAG_UNSHARE; -	pte_t pte = huge_ptep_get(ptep); +	struct vm_area_struct *vma = vmf->vma; +	struct mm_struct *mm = vma->vm_mm; +	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; +	pte_t pte = huge_ptep_get(vmf->pte);  	struct hstate *h = hstate_vma(vma);  	struct folio *old_folio;  	struct folio *new_folio;  	int outside_reserve = 0;  	vm_fault_t ret = 0; -	unsigned long haddr = address & huge_page_mask(h);  	struct mmu_notifier_range range;  	/* @@ -5971,7 +5951,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,  	/* Let's take out MAP_SHARED mappings first. */  	if (vma->vm_flags & VM_MAYSHARE) { -		set_huge_ptep_writable(vma, haddr, ptep); +		set_huge_ptep_writable(vma, vmf->address, vmf->pte);  		return 0;  	} @@ -5983,6 +5963,13 @@ retry_avoidcopy:  	/*  	 * If no-one else is actually using this page, we're the exclusive  	 * owner and can reuse this page. 
+	 * +	 * Note that we don't rely on the (safer) folio refcount here, because +	 * copying the hugetlb folio when there are unexpected (temporary) +	 * folio references could harm simple fork()+exit() users when +	 * we run out of free hugetlb folios: we would have to kill processes +	 * in scenarios that used to work. As a side effect, there can still +	 * be leaks between processes, for example, with FOLL_GET users.  	 */  	if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {  		if (!PageAnonExclusive(&old_folio->page)) { @@ -5990,7 +5977,7 @@ retry_avoidcopy:  			SetPageAnonExclusive(&old_folio->page);  		}  		if (likely(!unshare)) -			set_huge_ptep_writable(vma, haddr, ptep); +			set_huge_ptep_writable(vma, vmf->address, vmf->pte);  		delayacct_wpcopy_end();  		return 0; @@ -6017,8 +6004,8 @@ retry_avoidcopy:  	 * Drop page table lock as buddy allocator may be called. It will  	 * be acquired again before returning to the caller, as expected.  	 */ -	spin_unlock(ptl); -	new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); +	spin_unlock(vmf->ptl); +	new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve);  	if (IS_ERR(new_folio)) {  		/* @@ -6043,19 +6030,21 @@ retry_avoidcopy:  			 *  			 * Reacquire both after unmap operation.  			 */ -			idx = vma_hugecache_offset(h, vma, haddr); +			idx = vma_hugecache_offset(h, vma, vmf->address);  			hash = hugetlb_fault_mutex_hash(mapping, idx);  			hugetlb_vma_unlock_read(vma);  			mutex_unlock(&hugetlb_fault_mutex_table[hash]); -			unmap_ref_private(mm, vma, &old_folio->page, haddr); +			unmap_ref_private(mm, vma, &old_folio->page, +					vmf->address);  			mutex_lock(&hugetlb_fault_mutex_table[hash]);  			hugetlb_vma_lock_read(vma); -			spin_lock(ptl); -			ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); -			if (likely(ptep && -				   pte_same(huge_ptep_get(ptep), pte))) +			spin_lock(vmf->ptl); +			vmf->pte = hugetlb_walk(vma, vmf->address, +					huge_page_size(h)); +			if (likely(vmf->pte && +				   pte_same(huge_ptep_get(vmf->pte), pte)))  				goto retry_avoidcopy;  			/*  			 * race occurs while re-acquiring page table @@ -6077,37 +6066,38 @@ retry_avoidcopy:  	if (unlikely(ret))  		goto out_release_all; -	if (copy_user_large_folio(new_folio, old_folio, address, vma)) { -		ret = VM_FAULT_HWPOISON_LARGE; +	if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) { +		ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h));  		goto out_release_all;  	}  	__folio_mark_uptodate(new_folio); -	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, -				haddr + huge_page_size(h)); +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address, +				vmf->address + huge_page_size(h));  	mmu_notifier_invalidate_range_start(&range);  	/*  	 * Retake the page table lock to check for racing updates  	 * before the page tables are altered  	 */ -	spin_lock(ptl); -	ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); -	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { +	spin_lock(vmf->ptl); +	vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); +	if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) {  		pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);  		/* Break COW or unshare */ -		huge_ptep_clear_flush(vma, haddr, ptep); +		huge_ptep_clear_flush(vma, vmf->address, vmf->pte);  		hugetlb_remove_rmap(old_folio); -		hugetlb_add_new_anon_rmap(new_folio, vma, haddr); +		hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address);  		if 
(huge_pte_uffd_wp(pte))  			newpte = huge_pte_mkuffd_wp(newpte); -		set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h)); +		set_huge_pte_at(mm, vmf->address, vmf->pte, newpte, +				huge_page_size(h));  		folio_set_hugetlb_migratable(new_folio);  		/* Make the old page be freed below */  		new_folio = old_folio;  	} -	spin_unlock(ptl); +	spin_unlock(vmf->ptl);  	mmu_notifier_invalidate_range_end(&range);  out_release_all:  	/* @@ -6115,12 +6105,12 @@ out_release_all:  	 * unshare)  	 */  	if (new_folio != old_folio) -		restore_reserve_on_error(h, vma, haddr, new_folio); +		restore_reserve_on_error(h, vma, vmf->address, new_folio);  	folio_put(new_folio);  out_release_old:  	folio_put(old_folio); -	spin_lock(ptl); /* Caller expects lock to be held */ +	spin_lock(vmf->ptl); /* Caller expects lock to be held */  	delayacct_wpcopy_end();  	return ret; @@ -6129,8 +6119,8 @@ out_release_old:  /*   * Return whether there is a pagecache page to back given address within VMA.   */ -static bool hugetlbfs_pagecache_present(struct hstate *h, -			struct vm_area_struct *vma, unsigned long address) +bool hugetlbfs_pagecache_present(struct hstate *h, +				 struct vm_area_struct *vma, unsigned long address)  {  	struct address_space *mapping = vma->vm_file->f_mapping;  	pgoff_t idx = linear_page_index(vma, address); @@ -6206,23 +6196,19 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,  	return same;  } -static vm_fault_t hugetlb_no_page(struct mm_struct *mm, -			struct vm_area_struct *vma, -			struct address_space *mapping, pgoff_t idx, -			unsigned long address, pte_t *ptep, -			pte_t old_pte, unsigned int flags, +static vm_fault_t hugetlb_no_page(struct address_space *mapping,  			struct vm_fault *vmf)  { +	struct vm_area_struct *vma = vmf->vma; +	struct mm_struct *mm = vma->vm_mm;  	struct hstate *h = hstate_vma(vma);  	vm_fault_t ret = VM_FAULT_SIGBUS;  	int anon_rmap = 0;  	unsigned long size;  	struct folio *folio;  	pte_t new_pte; -	spinlock_t *ptl; -	unsigned long haddr = address & huge_page_mask(h);  	bool new_folio, new_pagecache_folio = false; -	u32 hash = hugetlb_fault_mutex_hash(mapping, idx); +	u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);  	/*  	 * Currently, we are forced to kill the process in the event the @@ -6241,10 +6227,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,  	 * before we get page_table_lock.  	 */  	new_folio = false; -	folio = filemap_lock_hugetlb_folio(h, mapping, idx); +	folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff);  	if (IS_ERR(folio)) {  		size = i_size_read(mapping->host) >> huge_page_shift(h); -		if (idx >= size) +		if (vmf->pgoff >= size)  			goto out;  		/* Check for page in userfault range */  		if (userfaultfd_missing(vma)) { @@ -6265,7 +6251,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,  			 * never happen on the page after UFFDIO_COPY has  			 * correctly installed the page and returned.  			 
*/ -			if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { +			if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) {  				ret = 0;  				goto out;  			} @@ -6274,7 +6260,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,  							VM_UFFD_MISSING);  		} -		folio = alloc_hugetlb_folio(vma, haddr, 0); +		if (!(vma->vm_flags & VM_MAYSHARE)) { +			ret = vmf_anon_prepare(vmf); +			if (unlikely(ret)) +				goto out; +		} + +		folio = alloc_hugetlb_folio(vma, vmf->address, 0);  		if (IS_ERR(folio)) {  			/*  			 * Returning error will result in faulting task being @@ -6288,18 +6280,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,  			 * here.  Before returning error, get ptl and make  			 * sure there really is no pte entry.  			 */ -			if (hugetlb_pte_stable(h, mm, ptep, old_pte)) +			if (hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte))  				ret = vmf_error(PTR_ERR(folio));  			else  				ret = 0;  			goto out;  		} -		clear_huge_page(&folio->page, address, pages_per_huge_page(h)); +		clear_huge_page(&folio->page, vmf->real_address, +				pages_per_huge_page(h));  		__folio_mark_uptodate(folio);  		new_folio = true;  		if (vma->vm_flags & VM_MAYSHARE) { -			int err = hugetlb_add_to_page_cache(folio, mapping, idx); +			int err = hugetlb_add_to_page_cache(folio, mapping, +							vmf->pgoff);  			if (err) {  				/*  				 * err can't be -EEXIST which implies someone @@ -6308,17 +6302,15 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,  				 * to the page cache. So it's safe to call  				 * restore_reserve_on_error() here.  				 */ -				restore_reserve_on_error(h, vma, haddr, folio); +				restore_reserve_on_error(h, vma, vmf->address, +							folio);  				folio_put(folio); +				ret = VM_FAULT_SIGBUS;  				goto out;  			}  			new_pagecache_folio = true;  		} else {  			folio_lock(folio); - -			ret = vmf_anon_prepare(vmf); -			if (unlikely(ret)) -				goto backout_unlocked;  			anon_rmap = 1;  		}  	} else { @@ -6338,7 +6330,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,  			folio_unlock(folio);  			folio_put(folio);  			/* See comment in userfaultfd_missing() block above */ -			if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { +			if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) {  				ret = 0;  				goto out;  			} @@ -6353,23 +6345,23 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,  	 * any allocations necessary to record that reservation occur outside  	 * the spinlock.  	 
*/ -	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { -		if (vma_needs_reservation(h, vma, haddr) < 0) { +	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { +		if (vma_needs_reservation(h, vma, vmf->address) < 0) {  			ret = VM_FAULT_OOM;  			goto backout_unlocked;  		}  		/* Just decrements count, does not deallocate */ -		vma_end_reservation(h, vma, haddr); +		vma_end_reservation(h, vma, vmf->address);  	} -	ptl = huge_pte_lock(h, mm, ptep); +	vmf->ptl = huge_pte_lock(h, mm, vmf->pte);  	ret = 0;  	/* If pte changed from under us, retry */ -	if (!pte_same(huge_ptep_get(ptep), old_pte)) +	if (!pte_same(huge_ptep_get(vmf->pte), vmf->orig_pte))  		goto backout;  	if (anon_rmap) -		hugetlb_add_new_anon_rmap(folio, vma, haddr); +		hugetlb_add_new_anon_rmap(folio, vma, vmf->address);  	else  		hugetlb_add_file_rmap(folio);  	new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) @@ -6378,17 +6370,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,  	 * If this pte was previously wr-protected, keep it wr-protected even  	 * if populated.  	 */ -	if (unlikely(pte_marker_uffd_wp(old_pte))) +	if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))  		new_pte = huge_pte_mkuffd_wp(new_pte); -	set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h)); +	set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));  	hugetlb_count_add(pages_per_huge_page(h), mm); -	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { +	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {  		/* Optimization, do the COW without a second fault */ -		ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl, vmf); +		ret = hugetlb_wp(folio, vmf);  	} -	spin_unlock(ptl); +	spin_unlock(vmf->ptl);  	/*  	 * Only set hugetlb_migratable in newly allocated pages.  Existing pages @@ -6405,10 +6397,10 @@ out:  	return ret;  backout: -	spin_unlock(ptl); +	spin_unlock(vmf->ptl);  backout_unlocked:  	if (new_folio && !new_pagecache_folio) -		restore_reserve_on_error(h, vma, haddr, folio); +		restore_reserve_on_error(h, vma, vmf->address, folio);  	folio_unlock(folio);  	folio_put(folio); @@ -6442,8 +6434,6 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,  			unsigned long address, unsigned int flags)  { -	pte_t *ptep, entry; -	spinlock_t *ptl;  	vm_fault_t ret;  	u32 hash;  	struct folio *folio = NULL; @@ -6451,13 +6441,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,  	struct hstate *h = hstate_vma(vma);  	struct address_space *mapping;  	int need_wait_lock = 0; -	unsigned long haddr = address & huge_page_mask(h);  	struct vm_fault vmf = {  		.vma = vma, -		.address = haddr, +		.address = address & huge_page_mask(h),  		.real_address = address,  		.flags = flags, -		.pgoff = vma_hugecache_offset(h, vma, haddr), +		.pgoff = vma_hugecache_offset(h, vma, +				address & huge_page_mask(h)),  		/* TODO: Track hugetlb faults using vm_fault */  		/* @@ -6477,25 +6467,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,  	/*  	 * Acquire vma lock before calling huge_pte_alloc and hold -	 * until finished with ptep.  This prevents huge_pmd_unshare from -	 * being called elsewhere and making the ptep no longer valid. +	 * until finished with vmf.pte.  This prevents huge_pmd_unshare from +	 * being called elsewhere and making the vmf.pte no longer valid.  	 
*/  	hugetlb_vma_lock_read(vma); -	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); -	if (!ptep) { +	vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h)); +	if (!vmf.pte) {  		hugetlb_vma_unlock_read(vma);  		mutex_unlock(&hugetlb_fault_mutex_table[hash]);  		return VM_FAULT_OOM;  	} -	entry = huge_ptep_get(ptep); -	if (huge_pte_none_mostly(entry)) { -		if (is_pte_marker(entry)) { +	vmf.orig_pte = huge_ptep_get(vmf.pte); +	if (huge_pte_none_mostly(vmf.orig_pte)) { +		if (is_pte_marker(vmf.orig_pte)) {  			pte_marker marker = -				pte_marker_get(pte_to_swp_entry(entry)); +				pte_marker_get(pte_to_swp_entry(vmf.orig_pte));  			if (marker & PTE_MARKER_POISONED) { -				ret = VM_FAULT_HWPOISON_LARGE; +				ret = VM_FAULT_HWPOISON_LARGE | +				      VM_FAULT_SET_HINDEX(hstate_index(h));  				goto out_mutex;  			}  		} @@ -6506,21 +6497,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,  		 * hugetlb_no_page will drop vma lock and hugetlb fault  		 * mutex internally, which make us return immediately.  		 */ -		return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address, -					ptep, entry, flags, &vmf); +		return hugetlb_no_page(mapping, &vmf);  	}  	ret = 0;  	/* -	 * entry could be a migration/hwpoison entry at this point, so this -	 * check prevents the kernel from going below assuming that we have -	 * an active hugepage in pagecache. This goto expects the 2nd page -	 * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will -	 * properly handle it. +	 * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this +	 * point, so this check prevents the kernel from going below assuming +	 * that we have an active hugepage in pagecache. This goto expects +	 * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned) +	 * check will properly handle it.  	 */ -	if (!pte_present(entry)) { -		if (unlikely(is_hugetlb_entry_migration(entry))) { +	if (!pte_present(vmf.orig_pte)) { +		if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {  			/*  			 * Release the hugetlb fault lock now, but retain  			 * the vma lock, because it is needed to guard the @@ -6529,9 +6519,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,  			 * be released there.  			 */  			mutex_unlock(&hugetlb_fault_mutex_table[hash]); -			migration_entry_wait_huge(vma, ptep); +			migration_entry_wait_huge(vma, vmf.pte);  			return 0; -		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) +		} else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))  			ret = VM_FAULT_HWPOISON_LARGE |  			    VM_FAULT_SET_HINDEX(hstate_index(h));  		goto out_mutex; @@ -6545,13 +6535,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,  	 * determine if a reservation has been consumed.  	 
*/  	if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && -	    !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { -		if (vma_needs_reservation(h, vma, haddr) < 0) { +	    !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { +		if (vma_needs_reservation(h, vma, vmf.address) < 0) {  			ret = VM_FAULT_OOM;  			goto out_mutex;  		}  		/* Just decrements count, does not deallocate */ -		vma_end_reservation(h, vma, haddr); +		vma_end_reservation(h, vma, vmf.address);  		pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,  							     vmf.pgoff); @@ -6559,17 +6549,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,  			pagecache_folio = NULL;  	} -	ptl = huge_pte_lock(h, mm, ptep); +	vmf.ptl = huge_pte_lock(h, mm, vmf.pte);  	/* Check for a racing update before calling hugetlb_wp() */ -	if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) +	if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte))))  		goto out_ptl;  	/* Handle userfault-wp first, before trying to lock more pages */ -	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && -	    (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { +	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) && +	    (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {  		if (!userfaultfd_wp_async(vma)) { -			spin_unlock(ptl); +			spin_unlock(vmf.ptl);  			if (pagecache_folio) {  				folio_unlock(pagecache_folio);  				folio_put(pagecache_folio); @@ -6579,18 +6569,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,  			return handle_userfault(&vmf, VM_UFFD_WP);  		} -		entry = huge_pte_clear_uffd_wp(entry); -		set_huge_pte_at(mm, haddr, ptep, entry, +		vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte); +		set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,  				huge_page_size(hstate_vma(vma)));  		/* Fallthrough to CoW */  	}  	/* -	 * hugetlb_wp() requires page locks of pte_page(entry) and +	 * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and  	 * pagecache_folio, so here we need take the former one  	 * when folio != pagecache_folio or !pagecache_folio.  	 
*/ -	folio = page_folio(pte_page(entry)); +	folio = page_folio(pte_page(vmf.orig_pte));  	if (folio != pagecache_folio)  		if (!folio_trylock(folio)) {  			need_wait_lock = 1; @@ -6600,24 +6590,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,  	folio_get(folio);  	if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { -		if (!huge_pte_write(entry)) { -			ret = hugetlb_wp(mm, vma, address, ptep, flags, -					 pagecache_folio, ptl, &vmf); +		if (!huge_pte_write(vmf.orig_pte)) { +			ret = hugetlb_wp(pagecache_folio, &vmf);  			goto out_put_page;  		} else if (likely(flags & FAULT_FLAG_WRITE)) { -			entry = huge_pte_mkdirty(entry); +			vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);  		}  	} -	entry = pte_mkyoung(entry); -	if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, +	vmf.orig_pte = pte_mkyoung(vmf.orig_pte); +	if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,  						flags & FAULT_FLAG_WRITE)) -		update_mmu_cache(vma, haddr, ptep); +		update_mmu_cache(vma, vmf.address, vmf.pte);  out_put_page:  	if (folio != pagecache_folio)  		folio_unlock(folio);  	folio_put(folio);  out_ptl: -	spin_unlock(ptl); +	spin_unlock(vmf.ptl);  	if (pagecache_folio) {  		folio_unlock(pagecache_folio); @@ -6653,7 +6642,13 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,  	gfp_mask = htlb_alloc_mask(h);  	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); -	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); +	/* +	 * This is used to allocate a temporary hugetlb to hold the copied +	 * content, which will then be copied again to the final hugetlb +	 * consuming a reservation. Set the alloc_fallback to false to indicate +	 * that breaking the per-node hugetlb pool is not allowed in this case. +	 */ +	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false);  	mpol_cond_put(mpol);  	return folio; @@ -6883,77 +6878,6 @@ out_release_nounlock:  }  #endif /* CONFIG_USERFAULTFD */ -struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, -				      unsigned long address, unsigned int flags, -				      unsigned int *page_mask) -{ -	struct hstate *h = hstate_vma(vma); -	struct mm_struct *mm = vma->vm_mm; -	unsigned long haddr = address & huge_page_mask(h); -	struct page *page = NULL; -	spinlock_t *ptl; -	pte_t *pte, entry; -	int ret; - -	hugetlb_vma_lock_read(vma); -	pte = hugetlb_walk(vma, haddr, huge_page_size(h)); -	if (!pte) -		goto out_unlock; - -	ptl = huge_pte_lock(h, mm, pte); -	entry = huge_ptep_get(pte); -	if (pte_present(entry)) { -		page = pte_page(entry); - -		if (!huge_pte_write(entry)) { -			if (flags & FOLL_WRITE) { -				page = NULL; -				goto out; -			} - -			if (gup_must_unshare(vma, flags, page)) { -				/* Tell the caller to do unsharing */ -				page = ERR_PTR(-EMLINK); -				goto out; -			} -		} - -		page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT)); - -		/* -		 * Note that page may be a sub-page, and with vmemmap -		 * optimizations the page struct may be read only. -		 * try_grab_page() will increase the ref count on the -		 * head page, so this will be OK. -		 * -		 * try_grab_page() should always be able to get the page here, -		 * because we hold the ptl lock and have verified pte_present(). 
-		 */ -		ret = try_grab_page(page, flags); - -		if (WARN_ON_ONCE(ret)) { -			page = ERR_PTR(ret); -			goto out; -		} - -		*page_mask = (1U << huge_page_order(h)) - 1; -	} -out: -	spin_unlock(ptl); -out_unlock: -	hugetlb_vma_unlock_read(vma); - -	/* -	 * Fixup retval for dump requests: if pagecache doesn't exist, -	 * don't try to allocate a new page but just skip it. -	 */ -	if (!page && (flags & FOLL_DUMP) && -	    !hugetlbfs_pagecache_present(h, vma, address)) -		page = ERR_PTR(-EFAULT); - -	return page; -} -  long hugetlb_change_protection(struct vm_area_struct *vma,  		unsigned long address, unsigned long end,  		pgprot_t newprot, unsigned long cp_flags) @@ -7044,9 +6968,13 @@ long hugetlb_change_protection(struct vm_area_struct *vma,  			if (!pte_same(pte, newpte))  				set_huge_pte_at(mm, address, ptep, newpte, psize);  		} else if (unlikely(is_pte_marker(pte))) { -			/* No other markers apply for now. */ -			WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); -			if (uffd_wp_resolve) +			/* +			 * Do nothing on a poison marker; page is +			 * corrupted, permissons do not apply.  Here +			 * pte_marker_uffd_wp()==true implies !poison +			 * because they're mutual exclusive. +			 */ +			if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)  				/* Safe to modify directly (non-present->none). */  				huge_pte_clear(mm, address, ptep, psize);  		} else if (!huge_pte_none(pte)) { @@ -7873,9 +7801,9 @@ void __init hugetlb_cma_reserve(int order)  		 * huge page demotion.  		 */  		res = cma_declare_contiguous_nid(0, size, 0, -						PAGE_SIZE << HUGETLB_PAGE_ORDER, -						 0, false, name, -						 &hugetlb_cma[nid], nid); +					PAGE_SIZE << order, +					HUGETLB_PAGE_ORDER, false, name, +					&hugetlb_cma[nid], nid);  		if (res) {  			pr_warn("hugetlb_cma: reservation failed: err %d, node %d",  				res, nid);  |
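
The retry logic in alloc_buddy_hugetlb_folio() above freezes the freshly allocated folio's reference count with folio_ref_freeze(folio, 1) and retries exactly once if a speculative reference has inflated it. The following is a minimal userspace sketch of that freeze-and-retry-once pattern using C11 atomics; struct obj, obj_alloc(), obj_put() and obj_ref_freeze() are invented stand-ins for illustration, not kernel APIs.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for a folio: just an atomic reference count. */
struct obj {
	atomic_int refcount;
};

/* Hypothetical allocator: returns an object with one reference held. */
static struct obj *obj_alloc(void)
{
	struct obj *o = malloc(sizeof(*o));
	if (o)
		atomic_init(&o->refcount, 1);
	return o;
}

static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		free(o);
}

/*
 * "Freeze" the refcount: atomically replace the expected count with 0,
 * which only succeeds if nobody holds a transient extra reference.
 * This mirrors the role folio_ref_freeze(folio, 1) plays in the patch.
 */
static bool obj_ref_freeze(struct obj *o, int expected)
{
	return atomic_compare_exchange_strong(&o->refcount, &expected, 0);
}

static struct obj *alloc_frozen_obj(void)
{
	bool retry = true;
	struct obj *o;

again:
	o = obj_alloc();
	if (o && !obj_ref_freeze(o, 1)) {
		/* Someone grabbed a speculative reference: back off. */
		obj_put(o);
		if (retry) {		/* retry once, as the patch does */
			retry = false;
			goto again;
		}
		fprintf(stderr, "unexpected inflated ref count\n");
		o = NULL;
	}
	return o;
}

int main(void)
{
	struct obj *o = alloc_frozen_obj();

	printf("frozen object: %p\n", (void *)o);
	free(o);	/* refcount was frozen to 0, so free directly (free(NULL) is a no-op) */
	return 0;
}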
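The same allocator keeps a node_alloc_noretry mask so that, after a hard (__GFP_RETRY_MAYFAIL) attempt fails on a node, later allocations on that node stop trying hard until an easy attempt succeeds again. Below is a small self-contained C sketch of that bookkeeping; the bitmap helpers and fake_alloc_on_node() are hypothetical stand-ins for node_set()/node_clear() and the page allocator.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* One 64-bit word of per-node state, i.e. up to 64 toy nodes. */
static void node_set_bit(uint64_t *mask, int nid)   { *mask |=  (UINT64_C(1) << nid); }
static void node_clear_bit(uint64_t *mask, int nid) { *mask &= ~(UINT64_C(1) << nid); }
static bool node_test_bit(uint64_t mask, int nid)   { return mask & (UINT64_C(1) << nid); }

/* Pretend allocator; 'try_hard' maps to __GFP_RETRY_MAYFAIL in the patch. */
static void *fake_alloc_on_node(int nid, bool try_hard)
{
	(void)nid;
	(void)try_hard;
	return NULL;	/* always fail, to exercise the bookkeeping */
}

static void *alloc_with_backoff(int nid, uint64_t *noretry)
{
	/* Try hard only if the last hard attempt on this node did not fail. */
	bool try_hard = !node_test_bit(*noretry, nid);
	void *p = fake_alloc_on_node(nid, try_hard);

	if (p && !try_hard)
		node_clear_bit(noretry, nid);	/* state changed: resume trying hard */
	if (!p && try_hard)
		node_set_bit(noretry, nid);	/* back off on this node next time */
	return p;
}

int main(void)
{
	uint64_t noretry = 0;

	alloc_with_backoff(3, &noretry);
	printf("node 3 marked noretry: %d\n", node_test_bit(noretry, 3));
	return 0;
}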
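A large part of the diff converts hugetlb_wp(), hugetlb_no_page() and hugetlb_fault() from long parameter lists (mm, vma, address, ptep, flags, ptl, ...) to a single struct vm_fault carried through the call chain. The toy example below shows the shape of that refactor under invented names (struct fault_ctx, fault_offset_old/new); it illustrates the pattern only and is not the kernel interface.

#include <stdio.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assume 2 MiB huge pages for the toy */

/* Hypothetical fault context, loosely modeled on struct vm_fault. */
struct fault_ctx {
	unsigned long address;		/* huge-page-aligned fault address */
	unsigned long real_address;	/* original faulting address */
	unsigned int flags;
	unsigned long pgoff;
};

/*
 * Before the refactor: every helper takes the same handful of values,
 * and each caller has to thread them through explicitly.
 */
static unsigned long fault_offset_old(unsigned long address,
				      unsigned long real_address)
{
	return real_address - address;
}

/*
 * After the refactor: one context pointer, the way hugetlb_wp() now
 * receives struct vm_fault *vmf in the patch.
 */
static unsigned long fault_offset_new(const struct fault_ctx *ctx)
{
	return ctx->real_address - ctx->address;
}

int main(void)
{
	unsigned long addr = 0x40000abcUL;
	struct fault_ctx ctx = {
		.address      = addr & ~(HPAGE_SIZE - 1),	/* like address & huge_page_mask(h) */
		.real_address = addr,
		.flags        = 0,
		.pgoff        = (addr & ~(HPAGE_SIZE - 1)) / HPAGE_SIZE,
	};

	printf("offset in huge page: old=%#lx new=%#lx\n",
	       fault_offset_old(ctx.address, ctx.real_address),
	       fault_offset_new(&ctx));
	return 0;
}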