From 906632843d00a4a42072b19c423b30a9e7adef3c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 13:28:43 +0800 Subject: mm: remove MIGRATE_SYNC_NO_COPY mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 2916ecc0f9d4 ("mm/migrate: new migrate mode MIGRATE_SYNC_NO_COPY") introduce a new MIGRATE_SYNC_NO_COPY mode to allow to offload the copy to a device DMA engine, which is only used __migrate_device_pages() to decide whether or not copy the old page, and the MIGRATE_SYNC_NO_COPY mode only set in hmm, as the MIGRATE_SYNC_NO_COPY set is removed by previous cleanup, it seems that we could remove the unnecessary MIGRATE_SYNC_NO_COPY. Link: https://lkml.kernel.org/r/20240524052843.182275-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Tony Luck Cc: Vishal Moola (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/hugetlbfs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 412f295acebe..6df794ed4066 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1128,10 +1128,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, hugetlb_set_folio_subpool(src, NULL); } - if (mode != MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(dst, src); - else - folio_migrate_flags(dst, src); + folio_migrate_copy(dst, src); return MIGRATEPAGE_SUCCESS; } -- cgit From 78fefd04c123493bbf28434768fa577b2153c79b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 18 Jun 2024 17:12:39 +0800 Subject: mm: memory: convert clear_huge_page() to folio_zero_user() Patch series "mm: improve clear and copy user folio", v2. Some folio conversions. An improvement is to move address alignment into the caller as it is only needed if we don't know which address will be accessed when clearing/copying user folios. This patch (of 4): Replace clear_huge_page() with folio_zero_user(), and take a folio instead of a page. Directly get number of pages by folio_nr_pages() to remove pages_per_huge_page argument, furthermore, move the address alignment from folio_zero_user() to the callers since the alignment is only needed when we don't know which address will be accessed. Link: https://lkml.kernel.org/r/20240618091242.2140164-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240618091242.2140164-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/mm.h | 4 +--- mm/huge_memory.c | 4 ++-- mm/hugetlb.c | 3 +-- mm/memory.c | 34 ++++++++++++++++------------------ 5 files changed, 21 insertions(+), 26 deletions(-) (limited to 'fs/hugetlbfs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6df794ed4066..9456e1d55540 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -892,7 +892,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, error = PTR_ERR(folio); goto out; } - clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); + folio_zero_user(folio, ALIGN_DOWN(addr, hpage_size)); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 101945baffc7..e2140ea6ae98 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4067,9 +4067,7 @@ enum mf_action_page_type { }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) -extern void clear_huge_page(struct page *page, - unsigned long addr_hint, - unsigned int pages_per_huge_page); +void folio_zero_user(struct folio *folio, unsigned long addr_hint); int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 14a05c643806..be598e9a5f98 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -944,10 +944,10 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, goto release; } - clear_huge_page(page, vmf->address, HPAGE_PMD_NR); + folio_zero_user(folio, vmf->address); /* * The memory barrier inside __folio_mark_uptodate makes sure that - * clear_huge_page writes become visible before the set_pmd_at() + * folio_zero_user writes become visible before the set_pmd_at() * write. */ __folio_mark_uptodate(folio); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6a5ea898e4da..a47f8c6c37c2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6300,8 +6300,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, ret = 0; goto out; } - clear_huge_page(&folio->page, vmf->real_address, - pages_per_huge_page(h)); + folio_zero_user(folio, vmf->real_address); __folio_mark_uptodate(folio); new_folio = true; diff --git a/mm/memory.c b/mm/memory.c index 97ddba866e43..cb26f8713db4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4488,7 +4488,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) goto next; } folio_throttle_swaprate(folio, gfp); - clear_huge_page(&folio->page, vmf->address, 1 << order); + folio_zero_user(folio, vmf->address); return folio; } next: @@ -6441,41 +6441,39 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct page *page, - unsigned long addr, +static void clear_gigantic_page(struct folio *folio, unsigned long addr, unsigned int pages_per_huge_page) { int i; - struct page *p; might_sleep(); for (i = 0; i < pages_per_huge_page; i++) { - p = nth_page(page, i); cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); + clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE); } } static int clear_subpage(unsigned long addr, int idx, void *arg) { - struct page *page = arg; + struct folio *folio = arg; - clear_user_highpage(nth_page(page, idx), addr); + clear_user_highpage(folio_page(folio, idx), addr); return 0; } -void clear_huge_page(struct page *page, - unsigned long addr_hint, unsigned int pages_per_huge_page) +/** + * folio_zero_user - Zero a folio which will be mapped to userspace. + * @folio: The folio to zero. + * @addr_hint: The address will be accessed or the base address if uncelar. + */ +void folio_zero_user(struct folio *folio, unsigned long addr_hint) { - unsigned long addr = addr_hint & - ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + unsigned int nr_pages = folio_nr_pages(folio); - if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, pages_per_huge_page); - return; - } - - process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page); + if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) + clear_gigantic_page(folio, addr_hint, nr_pages); + else + process_huge_page(addr_hint, nr_pages, clear_subpage, folio); } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, -- cgit From f00b295b9b61bb332b4f5951f479ab3aaeada5b8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 26 Jun 2024 16:53:27 +0800 Subject: fs: hugetlbfs: support poisoned recover from hugetlbfs_migrate_folio() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is similar to __migrate_folio(), use folio_mc_copy() in HugeTLB folio migration to avoid panic when copy from poisoned folio. Link: https://lkml.kernel.org/r/20240626085328.608006-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jane Chu Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Tony Luck Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- mm/migrate.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/hugetlbfs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9456e1d55540..ecad73a4f713 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1128,7 +1128,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, hugetlb_set_folio_subpool(src, NULL); } - folio_migrate_copy(dst, src); + folio_migrate_flags(dst, src); return MIGRATEPAGE_SUCCESS; } diff --git a/mm/migrate.c b/mm/migrate.c index 9dd5eb846d38..da1e115cc403 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -550,10 +550,16 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) { XA_STATE(xas, &mapping->i_pages, folio_index(src)); - int expected_count; + int rc, expected_count = folio_expected_refs(mapping, src); + + if (folio_ref_count(src) != expected_count) + return -EAGAIN; + + rc = folio_mc_copy(dst, src); + if (unlikely(rc)) + return rc; xas_lock_irq(&xas); - expected_count = folio_expected_refs(mapping, src); if (!folio_ref_freeze(src, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; -- cgit From e6c0c03245b14d6c205814aee67128257d0bea84 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Jul 2024 15:51:20 +0200 Subject: mm: provide mm_struct and address to huge_ptep_get() On powerpc 8xx huge_ptep_get() will need to know whether the given ptep is a PTE entry or a PMD entry. This cannot be known with the PMD entry itself because there is no easy way to know it from the content of the entry. So huge_ptep_get() will need to know either the size of the page or get the pmd. In order to be consistent with huge_ptep_get_and_clear(), give mm and address to huge_ptep_get(). Link: https://lkml.kernel.org/r/cc00c70dd384298796a4e1b25d6c4eb306d3af85.1719928057.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Oscar Salvador Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Peter Xu Signed-off-by: Andrew Morton --- arch/arm/include/asm/hugetlb-3level.h | 4 ++-- arch/arm64/include/asm/hugetlb.h | 2 +- arch/arm64/mm/hugetlbpage.c | 2 +- arch/riscv/include/asm/hugetlb.h | 2 +- arch/riscv/mm/hugetlbpage.c | 2 +- arch/s390/include/asm/hugetlb.h | 4 ++-- arch/s390/mm/hugetlbpage.c | 4 ++-- fs/hugetlbfs/inode.c | 2 +- fs/proc/task_mmu.c | 10 ++++---- fs/userfaultfd.c | 2 +- include/asm-generic/hugetlb.h | 2 +- include/linux/swapops.h | 4 ++-- mm/damon/vaddr.c | 6 ++--- mm/gup.c | 2 +- mm/hmm.c | 2 +- mm/hugetlb.c | 44 +++++++++++++++++------------------ mm/memory-failure.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 4 ++-- mm/mincore.c | 2 +- mm/userfaultfd.c | 2 +- 21 files changed, 53 insertions(+), 53 deletions(-) (limited to 'fs/hugetlbfs') diff --git a/arch/arm/include/asm/hugetlb-3level.h b/arch/arm/include/asm/hugetlb-3level.h index a30be5505793..87d48e2d90ad 100644 --- a/arch/arm/include/asm/hugetlb-3level.h +++ b/arch/arm/include/asm/hugetlb-3level.h @@ -13,12 +13,12 @@ /* * If our huge pte is non-zero then mark the valid bit. - * This allows pte_present(huge_ptep_get(ptep)) to return true for non-zero + * This allows pte_present(huge_ptep_get(mm,addr,ptep)) to return true for non-zero * ptes. * (The valid bit is automatically cleared by set_pte_at for PROT_NONE ptes). */ #define __HAVE_ARCH_HUGE_PTEP_GET -static inline pte_t huge_ptep_get(pte_t *ptep) +static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t retval = *ptep; if (pte_val(retval)) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 3954cbd2ff56..293f880865e8 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -46,7 +46,7 @@ extern pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_GET -extern pte_t huge_ptep_get(pte_t *ptep); +extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void __init arm64_hugetlb_cma_reserve(void); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 3f09ac73cce3..5f1e2103888b 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -127,7 +127,7 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize) return contig_ptes; } -pte_t huge_ptep_get(pte_t *ptep) +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { int ncontig, i; size_t pgsize; diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index b1ce97a9dbfc..faf3624d8057 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -44,7 +44,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t pte, int dirty); #define __HAVE_ARCH_HUGE_PTEP_GET -pte_t huge_ptep_get(pte_t *ptep); +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags); #define arch_make_huge_pte arch_make_huge_pte diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 0ebd968b33c9..42314f093922 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -3,7 +3,7 @@ #include #ifdef CONFIG_RISCV_ISA_SVNAPOT -pte_t huge_ptep_get(pte_t *ptep) +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { unsigned long pte_num; int i; diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index ce5f4fe8be4d..cf1b5d6fb1a6 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -19,7 +19,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz); void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); -pte_t huge_ptep_get(pte_t *ptep); +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); @@ -64,7 +64,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - int changed = !pte_same(huge_ptep_get(ptep), pte); + int changed = !pte_same(huge_ptep_get(vma->vm_mm, addr, ptep), pte); if (changed) { huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 2675aab4acc7..1be481672f4a 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -169,7 +169,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, __set_huge_pte_at(mm, addr, ptep, pte); } -pte_t huge_ptep_get(pte_t *ptep) +pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return __rste_to_pte(pte_val(*ptep)); } @@ -177,7 +177,7 @@ pte_t huge_ptep_get(pte_t *ptep) pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte = huge_ptep_get(ptep); + pte_t pte = huge_ptep_get(mm, addr, ptep); pmd_t *pmdp = (pmd_t *) ptep; pud_t *pudp = (pud_t *) ptep; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ecad73a4f713..a84832bd06c2 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -422,7 +422,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, if (!ptep) return false; - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(vma->vm_mm, addr, ptep); if (huge_pte_none(pte) || !pte_present(pte)) return false; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 728693ed00e6..775a2e8d600c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1013,7 +1013,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; - pte_t ptent = huge_ptep_get(pte); + pte_t ptent = huge_ptep_get(walk->mm, addr, pte); struct folio *folio = NULL; bool present = false; @@ -1878,7 +1878,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(walk->mm, addr, ptep); if (pte_present(pte)) { struct folio *folio = page_folio(pte_page(pte)); @@ -2567,7 +2567,7 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, if (~p->arg.flags & PM_SCAN_WP_MATCHING) { /* Go the short route when not write-protecting pages. */ - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) @@ -2579,7 +2579,7 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, i_mmap_lock_write(vma->vm_file->f_mapping); ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) @@ -2975,7 +2975,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { - pte_t huge_pte = huge_ptep_get(pte); + pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte); struct numa_maps *md; struct page *page; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 17e409ceaa33..27a3e9285fbf 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -257,7 +257,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, goto out; ret = false; - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); /* * Lockless access: we're in a wait_event so it's ok if it diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 6dcf4d576970..594d5905f615 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -144,7 +144,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_HUGE_PTEP_GET -static inline pte_t huge_ptep_get(pte_t *ptep) +static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { return ptep_get(ptep); } diff --git a/include/linux/swapops.h b/include/linux/swapops.h index a5c560a2f8c2..cb468e418ea1 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -334,7 +334,7 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); +extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); #else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -359,7 +359,7 @@ static inline int is_migration_entry(swp_entry_t swp) static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, - pte_t *pte) { } + unsigned long addr, pte_t *pte) { } static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 381559e4a1fa..58829baf8b5d 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -339,7 +339,7 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr) { bool referenced = false; - pte_t entry = huge_ptep_get(pte); + pte_t entry = huge_ptep_get(mm, addr, pte); struct folio *folio = pfn_folio(pte_pfn(entry)); unsigned long psize = huge_page_size(hstate_vma(vma)); @@ -373,7 +373,7 @@ static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, pte_t entry; ptl = huge_pte_lock(h, walk->mm, pte); - entry = huge_ptep_get(pte); + entry = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(entry)) goto out; @@ -509,7 +509,7 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, pte_t entry; ptl = huge_pte_lock(h, walk->mm, pte); - entry = huge_ptep_get(pte); + entry = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(entry)) goto out; diff --git a/mm/gup.c b/mm/gup.c index 85d45ec57f7c..cdb81db7e366 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -604,7 +604,7 @@ static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz if (pte_end < end) end = pte_end; - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(vma->vm_mm, addr, ptep); if (!pte_access_permitted(pte, flags & FOLL_WRITE)) return 0; diff --git a/mm/hmm.c b/mm/hmm.c index 93aebd9cc130..7e0229ae4a5a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -480,7 +480,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, pte_t entry; ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); - entry = huge_ptep_get(pte); + entry = huge_ptep_get(walk->mm, addr, pte); i = (start - range->start) >> PAGE_SHIFT; pfn_req_flags = range->hmm_pfns[i]; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f88ff0333945..fd2050934b13 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5287,7 +5287,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, { pte_t entry; - entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); + entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(vma->vm_mm, address, ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); } @@ -5395,7 +5395,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - entry = huge_ptep_get(src_pte); + entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); again: if (huge_pte_none(entry)) { /* @@ -5433,7 +5433,7 @@ again: set_huge_pte_at(dst, addr, dst_pte, make_pte_marker(marker), sz); } else { - entry = huge_ptep_get(src_pte); + entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); pte_folio = page_folio(pte_page(entry)); folio_get(pte_folio); @@ -5474,7 +5474,7 @@ again: dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - entry = huge_ptep_get(src_pte); + entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, new_folio); @@ -5584,7 +5584,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, new_addr |= last_addr_mask; continue; } - if (huge_pte_none(huge_ptep_get(src_pte))) + if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) continue; if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { @@ -5657,7 +5657,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, continue; } - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(mm, address, ptep); if (huge_pte_none(pte)) { spin_unlock(ptl); continue; @@ -5906,7 +5906,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; - pte_t pte = huge_ptep_get(vmf->pte); + pte_t pte = huge_ptep_get(mm, vmf->address, vmf->pte); struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; @@ -6027,7 +6027,7 @@ retry_avoidcopy: vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && - pte_same(huge_ptep_get(vmf->pte), pte))) + pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table @@ -6065,7 +6065,7 @@ retry_avoidcopy: */ spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); - if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) { + if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) { pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); /* Break COW or unshare */ @@ -6166,14 +6166,14 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf, * Recheck pte with pgtable lock. Returns true if pte didn't change, or * false if pte changed or is changing. */ -static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, +static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t old_pte) { spinlock_t *ptl; bool same; ptl = huge_pte_lock(h, mm, ptep); - same = pte_same(huge_ptep_get(ptep), old_pte); + same = pte_same(huge_ptep_get(mm, addr, ptep), old_pte); spin_unlock(ptl); return same; @@ -6234,7 +6234,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. */ - if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } @@ -6263,7 +6263,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, * here. Before returning error, get ptl and make * sure there really is no pte entry. */ - if (hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) + if (hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) ret = vmf_error(PTR_ERR(folio)); else ret = 0; @@ -6312,7 +6312,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ - if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } @@ -6339,7 +6339,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, vmf->ptl = huge_pte_lock(h, mm, vmf->pte); ret = 0; /* If pte changed from under us, retry */ - if (!pte_same(huge_ptep_get(vmf->pte), vmf->orig_pte)) + if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte)) goto backout; if (anon_rmap) @@ -6460,7 +6460,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } - vmf.orig_pte = huge_ptep_get(vmf.pte); + vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte); if (huge_pte_none_mostly(vmf.orig_pte)) { if (is_pte_marker(vmf.orig_pte)) { pte_marker marker = @@ -6501,7 +6501,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * be released there. */ mutex_unlock(&hugetlb_fault_mutex_table[hash]); - migration_entry_wait_huge(vma, vmf.pte); + migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte))) ret = VM_FAULT_HWPOISON_LARGE | @@ -6534,11 +6534,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.ptl = huge_pte_lock(h, mm, vmf.pte); /* Check for a racing update before calling hugetlb_wp() */ - if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte)))) + if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(mm, vmf.address, vmf.pte)))) goto out_ptl; /* Handle userfault-wp first, before trying to lock more pages */ - if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) && + if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(mm, vmf.address, vmf.pte)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { if (!userfaultfd_wp_async(vma)) { spin_unlock(vmf.ptl); @@ -6666,7 +6666,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, ptl = huge_pte_lock(h, dst_mm, dst_pte); /* Don't overwrite any existing PTEs (even markers) */ - if (!huge_pte_none(huge_ptep_get(dst_pte))) { + if (!huge_pte_none(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { spin_unlock(ptl); return -EEXIST; } @@ -6802,7 +6802,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * page backing it, then access the page. */ ret = -EEXIST; - if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) + if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) goto out_release_unlock; if (folio_in_pagecache) @@ -6923,7 +6923,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, address |= last_addr_mask; continue; } - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(mm, address, ptep); if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { /* Nothing to do. */ } else if (unlikely(is_hugetlb_entry_migration(pte))) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0276cc299b03..16f8651436d5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -835,7 +835,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, struct mm_walk *walk) { struct hwpoison_walk *hwp = walk->private; - pte_t pte = huge_ptep_get(ptep); + pte_t pte = huge_ptep_get(walk->mm, addr, ptep); struct hstate *h = hstate_vma(walk->vma); return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f73acb01ad45..f8703feb68b7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -624,7 +624,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); - entry = huge_ptep_get(pte); + entry = huge_ptep_get(walk->mm, addr, pte); if (!pte_present(entry)) { if (unlikely(is_hugetlb_entry_migration(entry))) qp->nr_failed++; diff --git a/mm/migrate.c b/mm/migrate.c index abb3aa45bed9..ff512c43fecb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -338,14 +338,14 @@ out: * * This function will release the vma lock before returning. */ -void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *ptep) +void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep); pte_t pte; hugetlb_vma_assert_locked(vma); spin_lock(ptl); - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(vma->vm_mm, addr, ptep); if (unlikely(!is_hugetlb_entry_migration(pte))) { spin_unlock(ptl); diff --git a/mm/mincore.c b/mm/mincore.c index e31cf1bde614..d6bd19e520fc 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -33,7 +33,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, * Hugepages under user process are always in RAM and never * swapped out, but theoretically it needs to be checked. */ - present = pte && !huge_pte_none_mostly(huge_ptep_get(pte)); + present = pte && !huge_pte_none_mostly(huge_ptep_get(walk->mm, addr, pte)); for (; addr != end; vec++, addr += PAGE_SIZE) *vec = present; walk->private = vec; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 8dedaec00486..e54e5c8907fa 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -587,7 +587,7 @@ retry: } if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && - !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { + !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { err = -EEXIST; hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); -- cgit From dffe24e9587607c377d87d6c372653ae44b99ce7 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Wed, 10 Jul 2024 00:19:12 -0500 Subject: hugetlbfs: ensure generic_hugetlb_get_unmapped_area() returns higher address than mmap_min_addr generic_hugetlb_get_unmapped_area() was returning an address less than mmap_min_addr if the mmap argument addr, after alignment, was less than mmap_min_addr, causing mmap to fail. This is because current generic_hugetlb_get_unmapped_area() code does not take into account mmap_min_addr. This patch ensures that generic_hugetlb_get_unmapped_area() always returns an address that is greater than mmap_min_addr. Additionally, similar to generic_get_unmapped_area(), vm_end_gap() checks are included to maintain stack gap. How to reproduce ================ #include #include #include #include #define HUGEPAGE_SIZE (16 * 1024 * 1024) int main() { void *addr = mmap((void *)-1, HUGEPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); exit(EXIT_FAILURE); } snprintf((char *)addr, HUGEPAGE_SIZE, "Hello, Huge Pages!"); printf("%s\n", (char *)addr); if (munmap(addr, HUGEPAGE_SIZE) == -1) { perror("munmap"); exit(EXIT_FAILURE); } return 0; } Result without fix ================== # cat /proc/meminfo |grep -i HugePages_Free HugePages_Free: 20 # ./test mmap: Permission denied # Result with fix =============== # cat /proc/meminfo |grep -i HugePages_Free HugePages_Free: 20 # ./test Hello, Huge Pages! # Link: https://lkml.kernel.org/r/20240710051912.4681-1-donettom@linux.ibm.com Signed-off-by: Donet Tom Reported-by Pavithra Prakash Acked-by: Kirill A. Shutemov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nicholas Piggin Cc: Ritesh Harjani (IBM) Cc: Tony Battersby Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/hugetlbfs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a84832bd06c2..cc5e7e80d557 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -222,13 +222,13 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct hstate *h = hstate_file(file); const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len & ~huge_page_mask(h)) return -EINVAL; - if (len > TASK_SIZE) + if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) { @@ -239,9 +239,10 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (mmap_end - len >= addr && - (!vma || addr + len <= vm_start_gap(vma))) + vma = find_vma_prev(mm, addr, &prev); + if (mmap_end - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) return addr; } -- cgit