Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 331
1 file changed, 178 insertions, 153 deletions
diff --git a/mm/memory.c b/mm/memory.c
index d2155ced45f8..0f47a533014e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -112,8 +112,10 @@ static bool vmf_pte_changed(struct vm_fault *vmf);
  * Return true if the original pte was a uffd-wp pte marker (so the pte was
  * wr-protected).
  */
-static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
+static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
 {
+        if (!userfaultfd_wp(vmf->vma))
+                return false;
         if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
                 return false;

@@ -989,7 +991,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
                         flags |= FPB_IGNORE_SOFT_DIRTY;

                 nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
-                                     &any_writable);
+                                     &any_writable, NULL, NULL);
                 folio_ref_add(folio, nr);
                 if (folio_test_anon(folio)) {
                         if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1502,10 +1504,15 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,

         if (!delay_rmap) {
                 folio_remove_rmap_ptes(folio, page, nr, vma);
-                /* Only sanity-check the first page in a batch. */
-                if (unlikely(page_mapcount(page) < 0))
+                if (unlikely(folio_mapcount(folio) < 0))
                         print_bad_pte(vma, addr, ptent, page);
         }
+
+        if (want_init_mlocked_on_free() && folio_test_mlocked(folio) &&
+            !delay_rmap && folio_test_anon(folio)) {
+                kernel_init_pages(page, folio_nr_pages(folio));
+        }
+
         if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
                 *force_flush = true;
                 *force_break = true;
@@ -1553,7 +1560,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
          */
         if (unlikely(folio_test_large(folio) && max_nr != 1)) {
                 nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
-                                     NULL);
+                                     NULL, NULL, NULL);

                 zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
                                        addr, details, rss, force_flush,
@@ -1631,12 +1638,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                 folio_remove_rmap_pte(folio, page, vma);
                         folio_put(folio);
                 } else if (!non_swap_entry(entry)) {
-                        /* Genuine swap entry, hence a private anon page */
+                        max_nr = (end - addr) / PAGE_SIZE;
+                        nr = swap_pte_batch(pte, max_nr, ptent);
+                        /* Genuine swap entries, hence a private anon pages */
                         if (!should_zap_cows(details))
                                 continue;
-                        rss[MM_SWAPENTS]--;
-                        if (unlikely(!free_swap_and_cache(entry)))
-                                print_bad_pte(vma, addr, ptent, NULL);
+                        rss[MM_SWAPENTS] -= nr;
+                        free_swap_and_cache_nr(entry, nr);
                 } else if (is_migration_entry(entry)) {
                         folio = pfn_swap_entry_folio(entry);
                         if (!should_zap_folio(details, folio))
@@ -1659,8 +1667,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                         pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
                         WARN_ON_ONCE(1);
                 }
-                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-                zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+                clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+                zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
         } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);

         add_mm_rss_vec(mm, rss);
@@ -2765,7 +2773,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
         unsigned long next;
         int err = 0;

-        BUG_ON(pud_huge(*pud));
+        BUG_ON(pud_leaf(*pud));

         if (create) {
                 pmd = pmd_alloc_track(mm, pud, addr, mask);
@@ -3206,19 +3214,39 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
         return VM_FAULT_RETRY;
 }

+/**
+ * vmf_anon_prepare - Prepare to handle an anonymous fault.
+ * @vmf: The vm_fault descriptor passed from the fault handler.
+ *
+ * When preparing to insert an anonymous page into a VMA from a
+ * fault handler, call this function rather than anon_vma_prepare().
+ * If this vma does not already have an associated anon_vma and we are
+ * only protected by the per-VMA lock, the caller must retry with the
+ * mmap_lock held.  __anon_vma_prepare() will look at adjacent VMAs to
+ * determine if this VMA can share its anon_vma, and that's not safe to
+ * do with only the per-VMA lock held for this VMA.
+ *
+ * Return: 0 if fault handling can proceed.  Any other value should be
+ * returned to the caller.
+ */
 vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
 {
         struct vm_area_struct *vma = vmf->vma;
+        vm_fault_t ret = 0;

         if (likely(vma->anon_vma))
                 return 0;
         if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-                vma_end_read(vma);
-                return VM_FAULT_RETRY;
+                if (!mmap_read_trylock(vma->vm_mm)) {
+                        vma_end_read(vma);
+                        return VM_FAULT_RETRY;
+                }
         }
         if (__anon_vma_prepare(vma))
-                return VM_FAULT_OOM;
-        return 0;
+                ret = VM_FAULT_OOM;
+        if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+                mmap_read_unlock(vma->vm_mm);
+        return ret;
 }

 /*
@@ -3329,13 +3357,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                 ptep_clear_flush(vma, vmf->address, vmf->pte);
                 folio_add_new_anon_rmap(new_folio, vma, vmf->address);
                 folio_add_lru_vma(new_folio, vma);
-                /*
-                 * We call the notify macro here because, when using secondary
-                 * mmu page tables (such as kvm shadow page tables), we want the
-                 * new page to be mapped directly into the secondary page table.
-                 */
                 BUG_ON(unshare && pte_write(entry));
-                set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+                set_pte_at(mm, vmf->address, vmf->pte, entry);
                 update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
                 if (old_folio) {
                         /*
@@ -4190,7 +4213,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
          * when reading from swap. This metadata may be indexed by swap entry
          * so this must be called before swap_free().
          */
-        arch_swap_restore(entry, folio);
+        arch_swap_restore(folio_swap(entry, folio), folio);

         /*
          * Remove the swap entry and conditionally try to free up the swapcache.
@@ -4326,8 +4349,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
          * for this vma. Then filter out the orders that can't be allocated over
          * the faulting address and still be fully contained in the vma.
          */
-        orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
-                                          BIT(PMD_ORDER) - 1);
+        orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+                        TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
         orders = thp_vma_suitable_orders(vma, vmf->address, orders);

         if (!orders)
@@ -4352,6 +4375,9 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)

         pte_unmap(pte);

+        if (!orders)
+                goto fallback;
+
         /* Try allocating the highest of the remaining orders. */
         gfp = vma_thp_gfp_mask(vma);
         while (orders) {
@@ -4359,6 +4385,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
                 folio = vma_alloc_folio(gfp, order, vma, addr, true);
                 if (folio) {
                         if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+                                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
                                 folio_put(folio);
                                 goto next;
                         }
@@ -4367,6 +4394,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
                         return folio;
                 }
 next:
+                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
                 order = next_order(&orders, order);
         }

@@ -4382,7 +4410,6 @@ fallback:
  */
 static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 {
-        bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
         struct vm_area_struct *vma = vmf->vma;
         unsigned long addr = vmf->address;
         struct folio *folio;
@@ -4427,8 +4454,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
         }

         /* Allocate our own private page. */
-        if (unlikely(anon_vma_prepare(vma)))
-                goto oom;
+        ret = vmf_anon_prepare(vmf);
+        if (ret)
+                return ret;
         /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
         folio = alloc_anon_folio(vmf);
         if (IS_ERR(folio))
@@ -4476,10 +4504,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)

         folio_ref_add(folio, nr_pages - 1);
         add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
+#endif
         folio_add_new_anon_rmap(folio, vma, addr);
         folio_add_lru_vma(folio, vma);
 setpte:
-        if (uffd_wp)
+        if (vmf_orig_pte_uffd_wp(vmf))
                 entry = pte_mkuffd_wp(entry);
         set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);

@@ -4654,7 +4685,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                 struct page *page, unsigned int nr, unsigned long addr)
 {
         struct vm_area_struct *vma = vmf->vma;
-        bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
         bool write = vmf->flags & FAULT_FLAG_WRITE;
         bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE);
         pte_t entry;
@@ -4669,16 +4699,14 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,

         if (write)
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-        if (unlikely(uffd_wp))
+        if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
                 entry = pte_mkuffd_wp(entry);
         /* copy-on-write page */
         if (write && !(vma->vm_flags & VM_SHARED)) {
-                add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
                 VM_BUG_ON_FOLIO(nr != 1, folio);
                 folio_add_new_anon_rmap(folio, vma, addr);
                 folio_add_lru_vma(folio, vma);
         } else {
-                add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr);
                 folio_add_file_rmap_ptes(folio, page, nr, vma);
         }
         set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
@@ -4715,9 +4743,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
         struct vm_area_struct *vma = vmf->vma;
         struct page *page;
         vm_fault_t ret;
+        bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
+                      !(vma->vm_flags & VM_SHARED);

         /* Did we COW the page? */
-        if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+        if (is_cow)
                 page = vmf->cow_page;
         else
                 page = vmf->page;
@@ -4753,8 +4783,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
         /* Re-check under ptl */
         if (likely(!vmf_pte_changed(vmf))) {
                 struct folio *folio = page_folio(page);
+                int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);

                 set_pte_range(vmf, folio, page, 1, vmf->address);
+                add_mm_counter(vma->vm_mm, type, 1);
                 ret = 0;
         } else {
                 update_mmu_tlb(vma, vmf->address, vmf->pte);
@@ -5035,9 +5067,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
         return ret;
 }

-int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
+int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
                       unsigned long addr, int page_nid, int *flags)
 {
+        struct vm_area_struct *vma = vmf->vma;
+
         folio_get(folio);

         /* Record the current PID acceesing VMA */
@@ -5049,7 +5083,55 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
                 *flags |= TNF_FAULT_LOCAL;
         }

-        return mpol_misplaced(folio, vma, addr);
+        return mpol_misplaced(folio, vmf, addr);
+}
+
+static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
+                                        unsigned long fault_addr, pte_t *fault_pte,
+                                        bool writable)
+{
+        pte_t pte, old_pte;
+
+        old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte);
+        pte = pte_modify(old_pte, vma->vm_page_prot);
+        pte = pte_mkyoung(pte);
+        if (writable)
+                pte = pte_mkwrite(pte, vma);
+        ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte);
+        update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
+}
+
+static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
+                                       struct folio *folio, pte_t fault_pte,
+                                       bool ignore_writable, bool pte_write_upgrade)
+{
+        int nr = pte_pfn(fault_pte) - folio_pfn(folio);
+        unsigned long start = max(vmf->address - nr * PAGE_SIZE, vma->vm_start);
+        unsigned long end = min(vmf->address + (folio_nr_pages(folio) - nr) * PAGE_SIZE, vma->vm_end);
+        pte_t *start_ptep = vmf->pte - (vmf->address - start) / PAGE_SIZE;
+        unsigned long addr;
+
+        /* Restore all PTEs' mapping of the large folio */
+        for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
+                pte_t ptent = ptep_get(start_ptep);
+                bool writable = false;
+
+                if (!pte_present(ptent) || !pte_protnone(ptent))
+                        continue;
+
+                if (pfn_folio(pte_pfn(ptent)) != folio)
+                        continue;
+
+                if (!ignore_writable) {
+                        ptent = pte_modify(ptent, vma->vm_page_prot);
+                        writable = pte_write(ptent);
+                        if (!writable && pte_write_upgrade &&
+                            can_change_pte_writable(vma, addr, ptent))
+                                writable = true;
+                }
+
+                numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable);
+        }
 }

 static vm_fault_t do_numa_page(struct vm_fault *vmf)
@@ -5057,11 +5139,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         struct vm_area_struct *vma = vmf->vma;
         struct folio *folio = NULL;
         int nid = NUMA_NO_NODE;
-        bool writable = false;
+        bool writable = false, ignore_writable = false;
+        bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
         int last_cpupid;
         int target_nid;
         pte_t pte, old_pte;
-        int flags = 0;
+        int flags = 0, nr_pages;

         /*
          * The pte cannot be used safely until we verify, while holding the page
@@ -5083,7 +5166,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
          * is only valid while holding the PT lock.
          */
         writable = pte_write(pte);
-        if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+        if (!writable && pte_write_upgrade &&
             can_change_pte_writable(vma, vmf->address, pte))
                 writable = true;

@@ -5091,10 +5174,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         if (!folio || folio_is_zone_device(folio))
                 goto out_map;

-        /* TODO: handle PTE-mapped THP */
-        if (folio_test_large(folio))
-                goto out_map;
-
         /*
          * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
          * much anyway since they can be in shared cache state. This misses
@@ -5110,10 +5189,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
          * Flag if the folio is shared between multiple address spaces. This
          * is later used when determining whether to group tasks together
          */
-        if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED))
+        if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
                 flags |= TNF_SHARED;

         nid = folio_nid(folio);
+        nr_pages = folio_nr_pages(folio);
         /*
          * For memory tiering mode, cpupid of slow memory page is used
          * to record page access time.  So use default value.
@@ -5123,13 +5203,14 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
                 last_cpupid = (-1 & LAST_CPUPID_MASK);
         else
                 last_cpupid = folio_last_cpupid(folio);
-        target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags);
+        target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
         if (target_nid == NUMA_NO_NODE) {
                 folio_put(folio);
                 goto out_map;
         }
         pte_unmap_unlock(vmf->pte, vmf->ptl);
         writable = false;
+        ignore_writable = true;

         /* Migrate to the requested node */
         if (migrate_misplaced_folio(folio, vma, target_nid)) {
@@ -5150,20 +5231,19 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)

 out:
         if (nid != NUMA_NO_NODE)
-                task_numa_fault(last_cpupid, nid, 1, flags);
+                task_numa_fault(last_cpupid, nid, nr_pages, flags);
         return 0;
 out_map:
         /*
          * Make it present again, depending on how arch implements
          * non-accessible ptes, some can allow access by kernel mode.
          */
-        old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
-        pte = pte_modify(old_pte, vma->vm_page_prot);
-        pte = pte_mkyoung(pte);
-        if (writable)
-                pte = pte_mkwrite(pte, vma);
-        ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
-        update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+        if (folio && folio_test_large(folio))
+                numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
+                                           pte_write_upgrade);
+        else
+                numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
+                                            writable);
         pte_unmap_unlock(vmf->pte, vmf->ptl);
         goto out;
 }
@@ -5374,7 +5454,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                 return VM_FAULT_OOM;
 retry_pud:
         if (pud_none(*vmf.pud) &&
-            thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) {
+            thp_vma_allowable_order(vma, vm_flags,
+                                TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
                 ret = create_huge_pud(&vmf);
                 if (!(ret & VM_FAULT_FALLBACK))
                         return ret;
@@ -5408,7 +5489,8 @@ retry_pud:
                 goto retry_pud;

         if (pmd_none(*vmf.pmd) &&
-            thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) {
+            thp_vma_allowable_order(vma, vm_flags,
+                                TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
                 ret = create_huge_pmd(&vmf);
                 if (!(ret & VM_FAULT_FALLBACK))
                         return ret;
@@ -5762,15 +5844,6 @@ retry:
         if (!vma_start_read(vma))
                 goto inval;

-        /*
-         * find_mergeable_anon_vma uses adjacent vmas which are not locked.
-         * This check must happen after vma_start_read(); otherwise, a
-         * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
-         * from its anon_vma.
-         */
-        if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
-                goto inval_end_read;
-
         /* Check since vm_start/vm_end might change before we lock the VMA */
         if (unlikely(address < vma->vm_start || address >= vma->vm_end))
                 goto inval_end_read;
@@ -5868,34 +5941,48 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)

 /**
  * follow_pte - look up PTE at a user virtual address
- * @mm: the mm_struct of the target address space
+ * @vma: the memory mapping
  * @address: user virtual address
  * @ptepp: location to store found PTE
  * @ptlp: location to store the lock for the PTE
  *
  * On a successful return, the pointer to the PTE is stored in @ptepp;
  * the corresponding lock is taken and its location is stored in @ptlp.
- * The contents of the PTE are only stable until @ptlp is released;
- * any further use, if any, must be protected against invalidation
- * with MMU notifiers.
+ *
+ * The contents of the PTE are only stable until @ptlp is released using
+ * pte_unmap_unlock(). This function will fail if the PTE is non-present.
+ * Present PTEs may include PTEs that map refcounted pages, such as
+ * anonymous folios in COW mappings.
+ *
+ * Callers must be careful when relying on PTE content after
+ * pte_unmap_unlock(). Especially if the PTE maps a refcounted page,
+ * callers must protect against invalidation with MMU notifiers; otherwise
+ * access to the PFN at a later point in time can trigger use-after-free.
  *
  * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
  * should be taken for read.
  *
- * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
- * it is not a good general-purpose API.
+ * This function must not be used to modify PTE content.
  *
  * Return: zero on success, -ve otherwise.
  */
-int follow_pte(struct mm_struct *mm, unsigned long address,
+int follow_pte(struct vm_area_struct *vma, unsigned long address,
                pte_t **ptepp, spinlock_t **ptlp)
 {
+        struct mm_struct *mm = vma->vm_mm;
         pgd_t *pgd;
         p4d_t *p4d;
         pud_t *pud;
         pmd_t *pmd;
         pte_t *ptep;

+        mmap_assert_locked(mm);
+        if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+                goto out;
+
+        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+                goto out;
+
         pgd = pgd_offset(mm, address);
         if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                 goto out;
@@ -5925,71 +6012,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(follow_pte);

-/**
- * follow_pfn - look up PFN at a user virtual address
- * @vma: memory mapping
- * @address: user virtual address
- * @pfn: location to store found PFN
- *
- * Only IO mappings and raw PFN mappings are allowed.
- *
- * This function does not allow the caller to read the permissions
- * of the PTE.  Do not use it.
- *
- * Return: zero and the pfn at @pfn on success, -ve otherwise.
- */
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
-        unsigned long *pfn)
-{
-        int ret = -EINVAL;
-        spinlock_t *ptl;
-        pte_t *ptep;
-
-        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-                return ret;
-
-        ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
-        if (ret)
-                return ret;
-        *pfn = pte_pfn(ptep_get(ptep));
-        pte_unmap_unlock(ptep, ptl);
-        return 0;
-}
-EXPORT_SYMBOL(follow_pfn);
-
 #ifdef CONFIG_HAVE_IOREMAP_PROT
-int follow_phys(struct vm_area_struct *vma,
-                unsigned long address, unsigned int flags,
-                unsigned long *prot, resource_size_t *phys)
-{
-        int ret = -EINVAL;
-        pte_t *ptep, pte;
-        spinlock_t *ptl;
-
-        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-                goto out;
-
-        if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
-                goto out;
-        pte = ptep_get(ptep);
-
-        /* Never return PFNs of anon folios in COW mappings. */
-        if (vm_normal_folio(vma, address, pte))
-                goto unlock;
-
-        if ((flags & FOLL_WRITE) && !pte_write(pte))
-                goto unlock;
-
-        *prot = pgprot_val(pte_pgprot(pte));
-        *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
-
-        ret = 0;
-unlock:
-        pte_unmap_unlock(ptep, ptl);
-out:
-        return ret;
-}
-
 /**
  * generic_access_phys - generic implementation for iomem mmap access
  * @vma: the vma to access
@@ -6013,11 +6036,8 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
         int offset = offset_in_page(addr);
         int ret = -EINVAL;

-        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-                return -EINVAL;
-
 retry:
-        if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+        if (follow_pte(vma, addr, &ptep, &ptl))
                 return -EINVAL;
         pte = ptep_get(ptep);
         pte_unmap_unlock(ptep, ptl);
@@ -6032,7 +6052,7 @@ retry:
         if (!maddr)
                 return -ENOMEM;

-        if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+        if (follow_pte(vma, addr, &ptep, &ptl))
                 goto out_unmap;

         if (!pte_same(pte, ptep_get(ptep))) {
@@ -6190,21 +6210,14 @@ void print_vma_addr(char *prefix, unsigned long ip)
         if (!mmap_read_trylock(mm))
                 return;

-        vma = find_vma(mm, ip);
+        vma = vma_lookup(mm, ip);
         if (vma && vma->vm_file) {
                 struct file *f = vma->vm_file;
-                char *buf = (char *)__get_free_page(GFP_NOWAIT);
-                if (buf) {
-                        char *p;
-
-                        p = file_path(f, buf, PAGE_SIZE);
-                        if (IS_ERR(p))
-                                p = "?";
-                        printk("%s%s[%lx+%lx]", prefix, kbasename(p),
-                                        vma->vm_start,
-                                        vma->vm_end - vma->vm_start);
-                        free_page((unsigned long)buf);
-                }
+                ip -= vma->vm_start;
+                ip += vma->vm_pgoff << PAGE_SHIFT;
+                printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip,
+                                vma->vm_start,
+                                vma->vm_end - vma->vm_start);
         }
         mmap_read_unlock(mm);
 }
@@ -6440,3 +6453,15 @@ void ptlock_free(struct ptdesc *ptdesc)
         kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
 }
 #endif
+
+void vma_pgtable_walk_begin(struct vm_area_struct *vma)
+{
+        if (is_vm_hugetlb_page(vma))
+                hugetlb_vma_lock_read(vma);
+}
+
+void vma_pgtable_walk_end(struct vm_area_struct *vma)
+{
+        if (is_vm_hugetlb_page(vma))
+                hugetlb_vma_unlock_read(vma);
+}
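For reference, a minimal caller sketch of the reworked follow_pte(), modelled on the removed follow_pfn() body and using a hypothetical helper name (read_pfn), under the new contract: the function now takes the VMA instead of the mm, only VM_IO/VM_PFNMAP mappings are accepted, the mmap lock must be held for read, and the PTE is only stable until pte_unmap_unlock().

/*
 * Illustrative sketch, not part of the patch above: read_pfn() is a
 * hypothetical helper showing the new follow_pte() calling convention.
 */
static int read_pfn(struct vm_area_struct *vma, unsigned long addr,
                    unsigned long *pfn)
{
        spinlock_t *ptl;
        pte_t *ptep;
        int ret;

        /* The caller must hold the mmap lock for read. */
        mmap_assert_locked(vma->vm_mm);

        /* follow_pte() now checks the range and VM_IO/VM_PFNMAP itself. */
        ret = follow_pte(vma, addr, &ptep, &ptl);
        if (ret)
                return ret;

        /* The PTE content is only stable until the lock is dropped. */
        *pfn = pte_pfn(ptep_get(ptep));
        pte_unmap_unlock(ptep, ptl);
        return 0;
}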