Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 1576
1 file changed, 883 insertions(+), 693 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e070b8593b37..db895230ee7e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -33,6 +33,7 @@ #include <linux/migrate.h> #include <linux/nospec.h> #include <linux/delayacct.h> +#include <linux/memory.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -53,13 +54,13 @@ struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -static bool hugetlb_cma_page(struct page *page, unsigned int order) +static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) { - return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page, + return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page, 1 << order); } #else -static bool hugetlb_cma_page(struct page *page, unsigned int order) +static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) { return false; } @@ -90,6 +91,9 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); +static void hugetlb_vma_lock_free(struct vm_area_struct *vma); +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -251,13 +255,159 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) return subpool_inode(file_inode(vma->vm_file)); } +/* + * hugetlb vma_lock helper routines + */ +static bool __vma_shareable_lock(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && + vma->vm_private_data; +} + +void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_write(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_write(&vma_lock->rw_sema); + } +} + +int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (!__vma_shareable_lock(vma)) + return 1; + + return down_write_trylock(&vma_lock->rw_sema); +} + +void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + lockdep_assert_held(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_release(struct kref *kref) +{ + struct hugetlb_vma_lock *vma_lock = container_of(kref, + struct hugetlb_vma_lock, refs); + + kfree(vma_lock); +} + +static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) +{ + struct vm_area_struct *vma = vma_lock->vma; + + /* + * vma_lock structure may or not be released as a result of put, + * it certainly will no longer be attached to vma so clear pointer. + * Semaphore synchronizes access to vma_lock->vma field. 
+ */ + vma_lock->vma = NULL; + vma->vm_private_data = NULL; + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); +} + +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + __hugetlb_vma_unlock_write_put(vma_lock); + } +} + +static void hugetlb_vma_lock_free(struct vm_area_struct *vma) +{ + /* + * Only present in sharable vmas. + */ + if (!vma || !__vma_shareable_lock(vma)) + return; + + if (vma->vm_private_data) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_write(&vma_lock->rw_sema); + __hugetlb_vma_unlock_write_put(vma_lock); + } +} + +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock; + + /* Only establish in (flags) sharable vmas */ + if (!vma || !(vma->vm_flags & VM_MAYSHARE)) + return; + + /* Should never get here with non-NULL vm_private_data */ + if (vma->vm_private_data) + return; + + vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); + if (!vma_lock) { + /* + * If we can not allocate structure, then vma can not + * participate in pmd sharing. This is only a possible + * performance enhancement and memory saving issue. + * However, the lock is also used to synchronize page + * faults with truncation. If the lock is not present, + * unlikely races could leave pages in a file past i_size + * until the file is removed. Warn in the unlikely case of + * allocation failure. + */ + pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); + return; + } + + kref_init(&vma_lock->refs); + init_rwsem(&vma_lock->rw_sema); + vma_lock->vma = vma; + vma->vm_private_data = vma_lock; +} + /* Helper that removes a struct file_region from the resv_map cache and returns * it for use. */ static struct file_region * get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) { - struct file_region *nrg = NULL; + struct file_region *nrg; VM_BUG_ON(resv->region_cache_count <= 0); @@ -339,7 +489,7 @@ static bool has_same_uncharge_info(struct file_region *rg, static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) { - struct file_region *nrg = NULL, *prg = NULL; + struct file_region *nrg, *prg; prg = list_prev_entry(rg, link); if (&prg->link != &resv->regions && prg->to == rg->from && @@ -456,14 +606,12 @@ static int allocate_file_region_entries(struct resv_map *resv, int regions_needed) __must_hold(&resv->lock) { - struct list_head allocated_regions; + LIST_HEAD(allocated_regions); int to_allocate = 0, i = 0; struct file_region *trg = NULL, *rg = NULL; VM_BUG_ON(regions_needed < 0); - INIT_LIST_HEAD(&allocated_regions); - /* * Check for sufficient descriptors in the cache to accommodate * the number of in progress add operations plus regions_needed. @@ -860,7 +1008,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) * faults in a MAP_PRIVATE mapping. Only the process that called mmap() * is guaranteed to have their future faults succeed. * - * With the exception of reset_vma_resv_huge_pages() which is called at fork(), + * With the exception of hugetlb_dup_vma_private() which is called at fork(), * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. 
@@ -1007,12 +1155,28 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ -void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +void hugetlb_dup_vma_private(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); - if (!(vma->vm_flags & VM_MAYSHARE)) - vma->vm_private_data = (void *)0; + /* + * Clear vm_private_data + * - For shared mappings this is a per-vma semaphore that may be + * allocated in a subsequent call to hugetlb_vm_op_open. + * Before clearing, make sure pointer is not associated with vma + * as this will leak the structure. This is the case when called + * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already + * been called to allocate a new structure. + * - For MAP_PRIVATE mappings, this is the reserve map which does + * not apply to children. Faults generated by the children are + * not guaranteed to succeed, even if read-only. + */ + if (vma->vm_flags & VM_MAYSHARE) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock && vma_lock->vma != vma) + vma->vm_private_data = NULL; + } else + vma->vm_private_data = NULL; } /* @@ -1043,7 +1207,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); } - reset_vma_resv_huge_pages(vma); + hugetlb_dup_vma_private(vma); } /* Returns true if the VMA has associated reserve pages */ @@ -1109,17 +1273,17 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) return false; } -static void enqueue_huge_page(struct hstate *h, struct page *page) +static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); lockdep_assert_held(&hugetlb_lock); - VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); - list_move(&page->lru, &h->hugepage_freelists[nid]); + list_move(&folio->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; - SetHPageFreed(page); + folio_set_hugetlb_freed(folio); } static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) @@ -1182,6 +1346,11 @@ retry_cpuset: return NULL; } +static unsigned long available_huge_pages(struct hstate *h) +{ + return h->free_huge_pages - h->resv_huge_pages; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, @@ -1198,12 +1367,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, * have no page reserves. This check ensures that reservations are * not "stolen". 
The child may still get SIGKILLed */ - if (!vma_has_reserves(vma, chg) && - h->free_huge_pages - h->resv_huge_pages == 0) + if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) + if (avoid_reserve && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -1303,75 +1471,76 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) nr_nodes--) /* used to demote non-gigantic_huge pages as well */ -static void __destroy_compound_gigantic_page(struct page *page, +static void __destroy_compound_gigantic_folio(struct folio *folio, unsigned int order, bool demote) { int i; int nr_pages = 1 << order; - struct page *p = page + 1; + struct page *p; - atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(folio_mapcount_ptr(folio), 0); + atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(folio_pincount_ptr(folio), 0); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + for (i = 1; i < nr_pages; i++) { + p = folio_page(folio, i); p->mapping = NULL; clear_compound_head(p); if (!demote) set_page_refcounted(p); } - set_compound_order(page, 0); -#ifdef CONFIG_64BIT - page[1].compound_nr = 0; -#endif - __ClearPageHead(page); + folio_set_compound_order(folio, 0); + __folio_clear_head(folio); } -static void destroy_compound_hugetlb_page_for_demote(struct page *page, +static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, unsigned int order) { - __destroy_compound_gigantic_page(page, order, true); + __destroy_compound_gigantic_folio(folio, order, true); } #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static void destroy_compound_gigantic_page(struct page *page, +static void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { - __destroy_compound_gigantic_page(page, order, false); + __destroy_compound_gigantic_folio(folio, order, false); } -static void free_gigantic_page(struct page *page, unsigned int order) +static void free_gigantic_folio(struct folio *folio, unsigned int order) { /* * If the page isn't allocated using the cma allocator, * cma_release() returns false. */ #ifdef CONFIG_CMA - if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) + int nid = folio_nid(folio); + + if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order)) return; #endif - free_contig_range(page_to_pfn(page), 1 << order); + free_contig_range(folio_pfn(folio), 1 << order); } #ifdef CONFIG_CONTIG_ALLOC -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { + struct page *page; unsigned long nr_pages = pages_per_huge_page(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); #ifdef CONFIG_CMA { - struct page *page; int node; if (hugetlb_cma[nid]) { page = cma_alloc(hugetlb_cma[nid], nr_pages, huge_page_order(h), true); if (page) - return page; + return page_folio(page); } if (!(gfp_mask & __GFP_THISNODE)) { @@ -1382,17 +1551,18 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, page = cma_alloc(hugetlb_cma[node], nr_pages, huge_page_order(h), true); if (page) - return page; + return page_folio(page); } } } #endif - return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); + page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); + return page ? 
page_folio(page) : NULL; } #else /* !CONFIG_CONTIG_ALLOC */ -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; @@ -1400,40 +1570,41 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, #endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } -static inline void free_gigantic_page(struct page *page, unsigned int order) { } -static inline void destroy_compound_gigantic_page(struct page *page, +static inline void free_gigantic_folio(struct folio *folio, + unsigned int order) { } +static inline void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { } #endif /* - * Remove hugetlb page from lists, and update dtor so that page appears + * Remove hugetlb folio from lists, and update dtor so that the folio appears * as just a compound page. * - * A reference is held on the page, except in the case of demote. + * A reference is held on the folio, except in the case of demote. * * Must be called with hugetlb lock held. */ -static void __remove_hugetlb_page(struct hstate *h, struct page *page, +static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus, bool demote) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); + VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - list_del(&page->lru); + list_del(&folio->lru); - if (HPageFreed(page)) { + if (folio_test_hugetlb_freed(folio)) { h->free_huge_pages--; h->free_huge_pages_node[nid]--; } @@ -1452,50 +1623,50 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page, * * For gigantic pages set the destructor to the null dtor. This * destructor will never be called. Before freeing the gigantic - * page destroy_compound_gigantic_page will turn the compound page - * into a simple group of pages. After this the destructor does not + * page destroy_compound_gigantic_folio will turn the folio into a + * simple group of pages. After this the destructor does not * apply. * * This handles the case where more than one ref is held when and - * after update_and_free_page is called. + * after update_and_free_hugetlb_folio is called. * * In the case of demote we do not ref count the page as it will soon * be turned into a page of smaller size. 
*/ if (!demote) - set_page_refcounted(page); + folio_ref_unfreeze(folio, 1); if (hstate_is_gigantic(h)) - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR); else - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); + folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } -static void remove_hugetlb_page(struct hstate *h, struct page *page, +static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { - __remove_hugetlb_page(h, page, adjust_surplus, false); + __remove_hugetlb_folio(h, folio, adjust_surplus, false); } -static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page, +static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio, bool adjust_surplus) { - __remove_hugetlb_page(h, page, adjust_surplus, true); + __remove_hugetlb_folio(h, folio, adjust_surplus, true); } -static void add_hugetlb_page(struct hstate *h, struct page *page, +static void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int zeroed; - int nid = page_to_nid(page); + int nid = folio_nid(folio); - VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page); + VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); lockdep_assert_held(&hugetlb_lock); - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&folio->lru); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; @@ -1504,17 +1675,21 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, h->surplus_huge_pages_node[nid]++; } - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - set_page_private(page, 0); - SetHPageVmemmapOptimized(page); + folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); + folio_change_private(folio, NULL); + /* + * We have to set hugetlb_vmemmap_optimized again as above + * folio_change_private(folio, NULL) cleared it. + */ + folio_set_hugetlb_vmemmap_optimized(folio); /* - * This page is about to be managed by the hugetlb allocator and + * This folio is about to be managed by the hugetlb allocator and * should have no users. Drop our reference, and check for others * just in case. */ - zeroed = put_page_testzero(page); - if (!zeroed) + zeroed = folio_put_testzero(folio); + if (unlikely(!zeroed)) /* * It is VERY unlikely soneone else has taken a ref on * the page. In this case, we simply return as the @@ -1523,14 +1698,15 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, */ return; - arch_clear_hugepage_flags(page); - enqueue_huge_page(h, page); + arch_clear_hugepage_flags(&folio->page); + enqueue_hugetlb_folio(h, folio); } static void __update_and_free_page(struct hstate *h, struct page *page) { int i; - struct page *subpage = page; + struct folio *folio = page_folio(page); + struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; @@ -1539,7 +1715,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * If we don't know which subpages are hwpoisoned, we can't free * the hugepage, so it's leaked intentionally. */ - if (HPageRawHwpUnreliable(page)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; if (hugetlb_vmemmap_restore(h, page)) { @@ -1549,7 +1725,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * page and put the page back on the hugetlb free list and treat * as a surplus page. 
*/ - add_hugetlb_page(h, page, true); + add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); return; } @@ -1558,11 +1734,11 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * Move PageHWPoison flag from head page to the raw error pages, * which makes any healthy subpages reusable. */ - if (unlikely(PageHWPoison(page))) - hugetlb_clear_page_hwpoison(page); + if (unlikely(folio_test_hwpoison(folio))) + hugetlb_clear_page_hwpoison(&folio->page); - for (i = 0; i < pages_per_huge_page(h); - i++, subpage = mem_map_next(subpage, page, i)) { + for (i = 0; i < pages_per_huge_page(h); i++) { + subpage = folio_page(folio, i); subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_private | @@ -1571,19 +1747,19 @@ static void __update_and_free_page(struct hstate *h, struct page *page) /* * Non-gigantic pages demoted from CMA allocated gigantic pages - * need to be given back to CMA in free_gigantic_page. + * need to be given back to CMA in free_gigantic_folio. */ if (hstate_is_gigantic(h) || - hugetlb_cma_page(page, huge_page_order(h))) { - destroy_compound_gigantic_page(page, huge_page_order(h)); - free_gigantic_page(page, huge_page_order(h)); + hugetlb_cma_folio(folio, huge_page_order(h))) { + destroy_compound_gigantic_folio(folio, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); } else { __free_pages(page, huge_page_order(h)); } } /* - * As update_and_free_page() can be called under any context, so we cannot + * As update_and_free_hugetlb_folio() can be called under any context, so we cannot * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate * the vmemmap pages. @@ -1612,8 +1788,9 @@ static void free_hpage_workfn(struct work_struct *work) /* * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() * is going to trigger because a previous call to - * remove_hugetlb_page() will set_compound_page_dtor(page, - * NULL_COMPOUND_DTOR), so do not use page_hstate() directly. + * remove_hugetlb_folio() will call folio_set_compound_dtor + * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate() + * directly. */ h = size_to_hstate(page_size(page)); @@ -1630,11 +1807,11 @@ static inline void flush_free_hpage_work(struct hstate *h) flush_work(&free_hpage_work); } -static void update_and_free_page(struct hstate *h, struct page *page, +static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { - if (!HPageVmemmapOptimized(page) || !atomic) { - __update_and_free_page(h, page); + if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { + __update_and_free_page(h, &folio->page); return; } @@ -1645,16 +1822,18 @@ static void update_and_free_page(struct hstate *h, struct page *page, * empty. Otherwise, schedule_work() had been called but the workfn * hasn't retrieved the list yet. 
*/ - if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist)) + if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist)) schedule_work(&free_hpage_work); } static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { struct page *page, *t_page; + struct folio *folio; list_for_each_entry_safe(page, t_page, list, lru) { - update_and_free_page(h, page, false); + folio = page_folio(page); + update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } @@ -1676,21 +1855,22 @@ void free_huge_page(struct page *page) * Can't pass hstate in here because it is called from the * compound page destructor. */ - struct hstate *h = page_hstate(page); - int nid = page_to_nid(page); - struct hugepage_subpool *spool = hugetlb_page_subpool(page); + struct folio *folio = page_folio(page); + struct hstate *h = folio_hstate(folio); + int nid = folio_nid(folio); + struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); bool restore_reserve; unsigned long flags; - VM_BUG_ON_PAGE(page_count(page), page); - VM_BUG_ON_PAGE(page_mapcount(page), page); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); + VM_BUG_ON_FOLIO(folio_mapcount(folio), folio); - hugetlb_set_page_subpool(page, NULL); - if (PageAnon(page)) - __ClearPageAnonExclusive(page); - page->mapping = NULL; - restore_reserve = HPageRestoreReserve(page); - ClearHPageRestoreReserve(page); + hugetlb_set_folio_subpool(folio, NULL); + if (folio_test_anon(folio)) + __ClearPageAnonExclusive(&folio->page); + folio->mapping = NULL; + restore_reserve = folio_test_hugetlb_restore_reserve(folio); + folio_clear_hugetlb_restore_reserve(folio); /* * If HPageRestoreReserve was set on page, page allocation consumed a @@ -1712,26 +1892,26 @@ void free_huge_page(struct page *page) } spin_lock_irqsave(&hugetlb_lock, flags); - ClearHPageMigratable(page); - hugetlb_cgroup_uncharge_page(hstate_index(h), - pages_per_huge_page(h), page); - hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), - pages_per_huge_page(h), page); + folio_clear_hugetlb_migratable(folio); + hugetlb_cgroup_uncharge_folio(hstate_index(h), + pages_per_huge_page(h), folio); + hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), + pages_per_huge_page(h), folio); if (restore_reserve) h->resv_huge_pages++; - if (HPageTemporary(page)) { - remove_hugetlb_page(h, page, false); + if (folio_test_hugetlb_temporary(folio)) { + remove_hugetlb_folio(h, folio, false); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page, true); + update_and_free_hugetlb_folio(h, folio, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ - remove_hugetlb_page(h, page, true); + remove_hugetlb_folio(h, folio, true); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page, true); + update_and_free_hugetlb_folio(h, folio, true); } else { arch_clear_hugepage_flags(page); - enqueue_huge_page(h, page); + enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } @@ -1746,36 +1926,38 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_huge_page(struct hstate *h, struct page *page) +static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { - hugetlb_vmemmap_optimize(h, page); - INIT_LIST_HEAD(&page->lru); - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - hugetlb_set_page_subpool(page, NULL); - set_hugetlb_cgroup(page, NULL); - set_hugetlb_cgroup_rsvd(page, NULL); + 
hugetlb_vmemmap_optimize(h, &folio->page); + INIT_LIST_HEAD(&folio->lru); + folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); + hugetlb_set_folio_subpool(folio, NULL); + set_hugetlb_cgroup(folio, NULL); + set_hugetlb_cgroup_rsvd(folio, NULL); } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) { - __prep_new_huge_page(h, page); + __prep_new_hugetlb_folio(h, folio); spin_lock_irq(&hugetlb_lock); __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); } -static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, - bool demote) +static bool __prep_compound_gigantic_folio(struct folio *folio, + unsigned int order, bool demote) { int i, j; int nr_pages = 1 << order; - struct page *p = page + 1; + struct page *p; + + __folio_clear_reserved(folio); + __folio_set_head(folio); + /* we rely on prep_new_hugetlb_folio to set the destructor */ + folio_set_compound_order(folio, order); + for (i = 0; i < nr_pages; i++) { + p = folio_page(folio, i); - /* we rely on prep_new_huge_page to set the destructor */ - set_compound_order(page, order); - __ClearPageReserved(page); - __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { /* * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic @@ -1788,7 +1970,8 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, * on the head page when they need know if put_page() is needed * after get_user_pages(). */ - __ClearPageReserved(p); + if (i != 0) /* head page cleared above */ + __ClearPageReserved(p); /* * Subtle and very unlikely * @@ -1814,39 +1997,42 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, } else { VM_BUG_ON_PAGE(page_count(p), p); } - set_compound_head(p, page); + if (i != 0) + set_compound_head(p, &folio->page); } - atomic_set(compound_mapcount_ptr(page), -1); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(folio_mapcount_ptr(folio), -1); + atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(folio_pincount_ptr(folio), 0); return true; out_error: - /* undo tail page modifications made above */ - p = page + 1; - for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) { - clear_compound_head(p); + /* undo page modifications made above */ + for (j = 0; j < i; j++) { + p = folio_page(folio, j); + if (j != 0) + clear_compound_head(p); set_page_refcounted(p); } /* need to clear PG_reserved on remaining tail pages */ - for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) + for (; j < nr_pages; j++) { + p = folio_page(folio, j); __ClearPageReserved(p); - set_compound_order(page, 0); -#ifdef CONFIG_64BIT - page[1].compound_nr = 0; -#endif - __ClearPageHead(page); + } + folio_set_compound_order(folio, 0); + __folio_clear_head(folio); return false; } -static bool prep_compound_gigantic_page(struct page *page, unsigned int order) +static bool prep_compound_gigantic_folio(struct folio *folio, + unsigned int order) { - return __prep_compound_gigantic_page(page, order, false); + return __prep_compound_gigantic_folio(folio, order, false); } -static bool prep_compound_gigantic_page_for_demote(struct page *page, +static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, unsigned int order) { - return __prep_compound_gigantic_page(page, order, true); + return __prep_compound_gigantic_folio(folio, order, true); } /* @@ -1911,13 +2097,14 @@ 
pgoff_t hugetlb_basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } -static struct page *alloc_buddy_huge_page(struct hstate *h, +static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); struct page *page; bool alloc_try_hard = true; + bool retry = true; /* * By default we always try hard to allocate the page with @@ -1933,11 +2120,20 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); +retry: page = __alloc_pages(gfp_mask, order, nid, nmask); - if (page) - __count_vm_event(HTLB_BUDDY_PGALLOC); - else - __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + /* Freeze head page */ + if (page && !page_ref_freeze(page, 1)) { + __free_pages(page, order); + if (retry) { /* retry once */ + retry = false; + goto retry; + } + /* WOW! twice in a row. */ + pr_warn("HugeTLB head page unexpected inflated ref count\n"); + page = NULL; + } /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this @@ -1955,36 +2151,44 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, if (node_alloc_noretry && !page && alloc_try_hard) node_set(nid, *node_alloc_noretry); - return page; + if (!page) { + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + return NULL; + } + + __count_vm_event(HTLB_BUDDY_PGALLOC); + return page_folio(page); } /* * Common helper to allocate a fresh hugetlb page. All specific allocators * should use this function to get new hugetlb pages + * + * Note that returned page is 'frozen': ref count of head page and all tail + * pages is zero. */ -static struct page *alloc_fresh_huge_page(struct hstate *h, +static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { - struct page *page; + struct folio *folio; bool retry = false; retry: if (hstate_is_gigantic(h)) - page = alloc_gigantic_page(h, gfp_mask, nid, nmask); + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); else - page = alloc_buddy_huge_page(h, gfp_mask, + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); - if (!page) + if (!folio) return NULL; - if (hstate_is_gigantic(h)) { - if (!prep_compound_gigantic_page(page, huge_page_order(h))) { + if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { /* * Rare failure to convert pages to compound page. * Free pages and try again - ONCE! 
*/ - free_gigantic_page(page, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); if (!retry) { retry = true; goto retry; @@ -1992,9 +2196,9 @@ retry: return NULL; } } - prep_new_huge_page(h, page, page_to_nid(page)); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); - return page; + return folio; } /* @@ -2004,23 +2208,20 @@ retry: static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, nodemask_t *node_alloc_noretry) { - struct page *page; + struct folio *folio; int nr_nodes, node; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, - node_alloc_noretry); - if (page) - break; + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, + nodes_allowed, node_alloc_noretry); + if (folio) { + free_huge_page(&folio->page); /* free it into the hugepage allocator */ + return 1; + } } - if (!page) - return 0; - - put_page(page); /* free it into the hugepage allocator */ - - return 1; + return 0; } /* @@ -2036,6 +2237,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, { int nr_nodes, node; struct page *page = NULL; + struct folio *folio; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { @@ -2047,7 +2249,8 @@ static struct page *remove_pool_huge_page(struct hstate *h, !list_empty(&h->hugepage_freelists[node])) { page = list_entry(h->hugepage_freelists[node].next, struct page, lru); - remove_hugetlb_page(h, page, acct_surplus); + folio = page_folio(page); + remove_hugetlb_folio(h, folio, acct_surplus); break; } } @@ -2072,29 +2275,29 @@ static struct page *remove_pool_huge_page(struct hstate *h, int dissolve_free_huge_page(struct page *page) { int rc = -EBUSY; + struct folio *folio = page_folio(page); retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ - if (!PageHuge(page)) + if (!folio_test_hugetlb(folio)) return 0; spin_lock_irq(&hugetlb_lock); - if (!PageHuge(page)) { + if (!folio_test_hugetlb(folio)) { rc = 0; goto out; } - if (!page_count(page)) { - struct page *head = compound_head(page); - struct hstate *h = page_hstate(head); - if (h->free_huge_pages - h->resv_huge_pages == 0) + if (!folio_ref_count(folio)) { + struct hstate *h = folio_hstate(folio); + if (!available_huge_pages(h)) goto out; /* * We should make sure that the page is already on the free list * when it is dissolved. */ - if (unlikely(!HPageFreed(head))) { + if (unlikely(!folio_test_hugetlb_freed(folio))) { spin_unlock_irq(&hugetlb_lock); cond_resched(); @@ -2109,24 +2312,24 @@ retry: goto retry; } - remove_hugetlb_page(h, head, false); + remove_hugetlb_folio(h, folio, false); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); /* - * Normally update_and_free_page will allocate required vmemmmap - * before freeing the page. update_and_free_page will fail to + * Normally update_and_free_hugtlb_folio will allocate required vmemmmap + * before freeing the page. update_and_free_hugtlb_folio will fail to * free the page if it can not allocate required vmemmap. We * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. 
*/ - rc = hugetlb_vmemmap_restore(h, head); + rc = hugetlb_vmemmap_restore(h, &folio->page); if (!rc) { - update_and_free_page(h, head, false); + update_and_free_hugetlb_folio(h, folio, false); } else { spin_lock_irq(&hugetlb_lock); - add_hugetlb_page(h, head, false); + add_hugetlb_folio(h, folio, false); h->max_huge_pages++; spin_unlock_irq(&hugetlb_lock); } @@ -2175,10 +2378,9 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) * Allocates a fresh surplus page from the page allocator. */ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask, bool zero_ref) + int nid, nodemask_t *nmask) { - struct page *page = NULL; - bool retry = false; + struct folio *folio = NULL; if (hstate_is_gigantic(h)) return NULL; @@ -2188,9 +2390,8 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock_irq(&hugetlb_lock); -retry: - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); - if (!page) + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (!folio) return NULL; spin_lock_irq(&hugetlb_lock); @@ -2202,64 +2403,42 @@ retry: * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { - SetHPageTemporary(page); + folio_set_hugetlb_temporary(folio); spin_unlock_irq(&hugetlb_lock); - put_page(page); + free_huge_page(&folio->page); return NULL; } - if (zero_ref) { - /* - * Caller requires a page with zero ref count. - * We will drop ref count here. If someone else is holding - * a ref, the page will be freed when they drop it. Abuse - * temporary page flag to accomplish this. - */ - SetHPageTemporary(page); - if (!put_page_testzero(page)) { - /* - * Unexpected inflated ref count on freshly allocated - * huge. Retry once. 
- */ - pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); - spin_unlock_irq(&hugetlb_lock); - if (retry) - return NULL; - - retry = true; - goto retry; - } - ClearHPageTemporary(page); - } - h->surplus_huge_pages++; - h->surplus_huge_pages_node[page_to_nid(page)]++; + h->surplus_huge_pages_node[folio_nid(folio)]++; out_unlock: spin_unlock_irq(&hugetlb_lock); - return page; + return &folio->page; } static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { - struct page *page; + struct folio *folio; if (hstate_is_gigantic(h)) return NULL; - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); - if (!page) + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (!folio) return NULL; + /* fresh huge pages are frozen */ + folio_ref_unfreeze(folio, 1); /* * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ - SetHPageTemporary(page); + folio_set_hugetlb_temporary(folio); - return page; + return &folio->page; } /* @@ -2280,14 +2459,14 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); + page = alloc_surplus_huge_page(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!page) - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return page; } @@ -2297,7 +2476,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { spin_lock_irq(&hugetlb_lock); - if (h->free_huge_pages - h->resv_huge_pages > 0) { + if (available_huge_pages(h)) { struct page *page; page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); @@ -2336,7 +2515,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { - struct list_head surplus_list; + LIST_HEAD(surplus_list); struct page *page, *tmp; int ret; long i; @@ -2351,14 +2530,13 @@ static int gather_surplus_pages(struct hstate *h, long delta) } allocated = 0; - INIT_LIST_HEAD(&surplus_list); ret = -ENOMEM; retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), - NUMA_NO_NODE, NULL, true); + NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; break; @@ -2402,7 +2580,7 @@ retry: if ((--needed) < 0) break; /* Add the page to the hugetlb allocator */ - enqueue_huge_page(h, page); + enqueue_hugetlb_folio(h, page_folio(page)); } free: spin_unlock_irq(&hugetlb_lock); @@ -2709,73 +2887,52 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, } /* - * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one + * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve + * the old one * @h: struct hstate old page belongs to - * @old_page: Old page to dissolve + * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. 
*/ -static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, - struct list_head *list) +static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, + struct folio *old_folio, struct list_head *list) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - int nid = page_to_nid(old_page); - bool alloc_retry = false; - struct page *new_page; + int nid = folio_nid(old_folio); + struct folio *new_folio; int ret = 0; /* - * Before dissolving the page, we need to allocate a new one for the - * pool to remain stable. Here, we allocate the page and 'prep' it + * Before dissolving the folio, we need to allocate a new one for the + * pool to remain stable. Here, we allocate the folio and 'prep' it * by doing everything but actually updating counters and adding to * the pool. This simplifies and let us do most of the processing * under the lock. */ -alloc_retry: - new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); - if (!new_page) + new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); + if (!new_folio) return -ENOMEM; - /* - * If all goes well, this page will be directly added to the free - * list in the pool. For this the ref count needs to be zero. - * Attempt to drop now, and retry once if needed. It is VERY - * unlikely there is another ref on the page. - * - * If someone else has a reference to the page, it will be freed - * when they drop their ref. Abuse temporary page flag to accomplish - * this. Retry once if there is an inflated ref count. - */ - SetHPageTemporary(new_page); - if (!put_page_testzero(new_page)) { - if (alloc_retry) - return -EBUSY; - - alloc_retry = true; - goto alloc_retry; - } - ClearHPageTemporary(new_page); - - __prep_new_huge_page(h, new_page); + __prep_new_hugetlb_folio(h, new_folio); retry: spin_lock_irq(&hugetlb_lock); - if (!PageHuge(old_page)) { + if (!folio_test_hugetlb(old_folio)) { /* - * Freed from under us. Drop new_page too. + * Freed from under us. Drop new_folio too. */ goto free_new; - } else if (page_count(old_page)) { + } else if (folio_ref_count(old_folio)) { /* - * Someone has grabbed the page, try to isolate it here. + * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(old_page, list); + ret = isolate_hugetlb(&old_folio->page, list); spin_lock_irq(&hugetlb_lock); goto free_new; - } else if (!HPageFreed(old_page)) { + } else if (!folio_test_hugetlb_freed(old_folio)) { /* - * Page's refcount is 0 but it has not been enqueued in the + * Folio's refcount is 0 but it has not been enqueued in the * freelist yet. Race window is small, so we can succeed here if * we retry. */ @@ -2784,35 +2941,35 @@ retry: goto retry; } else { /* - * Ok, old_page is still a genuine free hugepage. Remove it from + * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be * incremented again when calling __prep_account_new_huge_page() - * and enqueue_huge_page() for new_page. The counters will remain - * stable since this happens under the lock. + * and enqueue_hugetlb_folio() for new_folio. The counters will + * remain stable since this happens under the lock. */ - remove_hugetlb_page(h, old_page, false); + remove_hugetlb_folio(h, old_folio, false); /* - * Ref count on new page is already zero as it was dropped + * Ref count on new_folio is already zero as it was dropped * earlier. It can be directly added to the pool free list. 
*/ __prep_account_new_huge_page(h, nid); - enqueue_huge_page(h, new_page); + enqueue_hugetlb_folio(h, new_folio); /* - * Pages have been replaced, we can safely free the old one. + * Folio has been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); - update_and_free_page(h, old_page, false); + update_and_free_hugetlb_folio(h, old_folio, false); } return ret; free_new: spin_unlock_irq(&hugetlb_lock); - /* Page has a zero ref count, but needs a ref to be freed */ - set_page_refcounted(new_page); - update_and_free_page(h, new_page, false); + /* Folio has a zero ref count, but needs a ref to be freed */ + folio_ref_unfreeze(new_folio, 1); + update_and_free_hugetlb_folio(h, new_folio, false); return ret; } @@ -2820,7 +2977,7 @@ free_new: int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { struct hstate *h; - struct page *head; + struct folio *folio = page_folio(page); int ret = -EBUSY; /* @@ -2829,9 +2986,8 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) * Return success when racing as if we dissolved the page ourselves. */ spin_lock_irq(&hugetlb_lock); - if (PageHuge(page)) { - head = compound_head(page); - h = page_hstate(head); + if (folio_test_hugetlb(folio)) { + h = folio_hstate(folio); } else { spin_unlock_irq(&hugetlb_lock); return 0; @@ -2846,10 +3002,10 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (page_count(head) && !isolate_hugetlb(head, list)) + if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list)) ret = 0; - else if (!page_count(head)) - ret = alloc_and_dissolve_huge_page(h, head, list); + else if (!folio_ref_count(folio)) + ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); return ret; } @@ -2860,6 +3016,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; + struct folio *folio; long map_chg, map_commit; long gbl_chg; int ret, idx; @@ -2928,14 +3085,16 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; + spin_lock_irq(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; } - spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); + set_page_refcounted(page); /* Fall through */ } + folio = page_folio(page); hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. 
@@ -2965,8 +3124,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); if (deferred_reserve) - hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), - pages_per_huge_page(h), page); + hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), + pages_per_huge_page(h), folio); } return page; @@ -3031,17 +3190,18 @@ static void __init gather_bootmem_prealloc(void) list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); + struct folio *folio = page_folio(page); struct hstate *h = m->hstate; VM_BUG_ON(!hstate_is_gigantic(h)); - WARN_ON(page_count(page) != 1); - if (prep_compound_gigantic_page(page, huge_page_order(h))) { - WARN_ON(PageReserved(page)); - prep_new_huge_page(h, page, page_to_nid(page)); - put_page(page); /* add to the hugepage allocator */ + WARN_ON(folio_ref_count(folio) != 1); + if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { + WARN_ON(folio_test_reserved(folio)); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + free_huge_page(page); /* add to the hugepage allocator */ } else { /* VERY unlikely inflated ref count on a tail page */ - free_gigantic_page(page, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); } /* @@ -3063,14 +3223,14 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) if (!alloc_bootmem_huge_page(h, nid)) break; } else { - struct page *page; + struct folio *folio; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - page = alloc_fresh_huge_page(h, gfp_mask, nid, + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); - if (!page) + if (!folio) break; - put_page(page); /* free it into the hugepage allocator */ + free_huge_page(&folio->page); /* free it into the hugepage allocator */ } cond_resched(); } @@ -3215,7 +3375,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, goto out; if (PageHighMem(page)) continue; - remove_hugetlb_page(h, page, false); + remove_hugetlb_folio(h, page_folio(page), false); list_add(&page->lru, &page_list); } } @@ -3420,11 +3580,13 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) { int i, nid = page_to_nid(page); struct hstate *target_hstate; + struct folio *folio = page_folio(page); + struct page *subpage; int rc = 0; target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); - remove_hugetlb_page_for_demote(h, page, false); + remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); rc = hugetlb_vmemmap_restore(h, page); @@ -3432,15 +3594,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) /* Allocation of vmemmmap failed, we can not demote page */ spin_lock_irq(&hugetlb_lock); set_page_refcounted(page); - add_hugetlb_page(h, page, false); + add_hugetlb_folio(h, page_folio(page), false); return rc; } /* - * Use destroy_compound_hugetlb_page_for_demote for all huge page + * Use destroy_compound_hugetlb_folio_for_demote for all huge page * sizes as it will not ref count pages. */ - destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h)); + destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. 
@@ -3453,15 +3615,16 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) mutex_lock(&target_hstate->resize_lock); for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target_hstate)) { + subpage = nth_page(page, i); + folio = page_folio(subpage); if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_page_for_demote(page + i, + prep_compound_gigantic_folio_for_demote(folio, target_hstate->order); else - prep_compound_page(page + i, target_hstate->order); - set_page_private(page + i, 0); - set_page_refcounted(page + i); - prep_new_huge_page(target_hstate, page + i, nid); - put_page(page + i); + prep_compound_page(subpage, target_hstate->order); + set_page_private(subpage, 0); + prep_new_hugetlb_folio(target_hstate, folio, nid); + free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); @@ -3472,7 +3635,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) * based on pool changes for the demoted page. */ h->max_huge_pages--; - target_hstate->max_huge_pages += pages_per_huge_page(h); + target_hstate->max_huge_pages += + pages_per_huge_page(h) / pages_per_huge_page(target_hstate); return rc; } @@ -3714,7 +3878,7 @@ static ssize_t demote_store(struct kobject *kobj, unsigned long nr_available; nodemask_t nodes_allowed, *n_mask; struct hstate *h; - int err = 0; + int err; int nid; err = kstrtoul(buf, 10, &nr_demote); @@ -3765,8 +3929,7 @@ HSTATE_ATTR_WO(demote); static ssize_t demote_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int nid; - struct hstate *h = kobj_to_hstate(kobj, &nid); + struct hstate *h = kobj_to_hstate(kobj, NULL); unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; return sysfs_emit(buf, "%lukB\n", demote_size); @@ -3779,7 +3942,6 @@ static ssize_t demote_size_store(struct kobject *kobj, struct hstate *h, *demote_hstate; unsigned long demote_size; unsigned int demote_order; - int nid; demote_size = (unsigned long)memparse(buf, NULL); @@ -3791,7 +3953,7 @@ static ssize_t demote_size_store(struct kobject *kobj, return -EINVAL; /* demote order must be smaller than hstate order */ - h = kobj_to_hstate(kobj, &nid); + h = kobj_to_hstate(kobj, NULL); if (demote_order >= h->order) return -EINVAL; @@ -3845,35 +4007,26 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, if (retval) { kobject_put(hstate_kobjs[hi]); hstate_kobjs[hi] = NULL; + return retval; } if (h->demote_order) { - if (sysfs_create_group(hstate_kobjs[hi], - &hstate_demote_attr_group)) + retval = sysfs_create_group(hstate_kobjs[hi], + &hstate_demote_attr_group); + if (retval) { pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); + sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); + kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + return retval; + } } - return retval; -} - -static void __init hugetlb_sysfs_init(void) -{ - struct hstate *h; - int err; - - hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); - if (!hugepages_kobj) - return; - - for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, - hstate_kobjs, &hstate_attr_group); - if (err) - pr_err("HugeTLB: Unable to add hstate %s", h->name); - } + return 0; } #ifdef CONFIG_NUMA +static bool hugetlb_sysfs_initialized __ro_after_init; /* * node_hstate/s - associate per node hstate attributes, via their kobjects, @@ -3929,7 +4082,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) * Unregister hstate attributes from a single 
node device. * No-op if no hstate attributes attached. */ -static void hugetlb_unregister_node(struct node *node) +void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; @@ -3939,10 +4092,15 @@ static void hugetlb_unregister_node(struct node *node) for_each_hstate(h) { int idx = hstate_index(h); - if (nhs->hstate_kobjs[idx]) { - kobject_put(nhs->hstate_kobjs[idx]); - nhs->hstate_kobjs[idx] = NULL; - } + struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; + + if (!hstate_kobj) + continue; + if (h->demote_order) + sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); + sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); + kobject_put(hstate_kobj); + nhs->hstate_kobjs[idx] = NULL; } kobject_put(nhs->hugepages_kobj); @@ -3954,12 +4112,15 @@ static void hugetlb_unregister_node(struct node *node) * Register hstate attributes for a single node device. * No-op if attributes already registered. */ -static void hugetlb_register_node(struct node *node) +void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; + if (!hugetlb_sysfs_initialized) + return; + if (nhs->hugepages_kobj) return; /* already allocated */ @@ -3990,18 +4151,8 @@ static void __init hugetlb_register_all_nodes(void) { int nid; - for_each_node_state(nid, N_MEMORY) { - struct node *node = node_devices[nid]; - if (node->dev.id == nid) - hugetlb_register_node(node); - } - - /* - * Let the node device driver know we're here so it can - * [un]register hstate attributes on node hotplug. - */ - register_hugetlbfs_with_node(hugetlb_register_node, - hugetlb_unregister_node); + for_each_online_node(nid) + hugetlb_register_node(node_devices[nid]); } #else /* !CONFIG_NUMA */ @@ -4017,6 +4168,36 @@ static void hugetlb_register_all_nodes(void) { } #endif +#ifdef CONFIG_CMA +static void __init hugetlb_cma_check(void); +#else +static inline __init void hugetlb_cma_check(void) +{ +} +#endif + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); + if (err) + pr_err("HugeTLB: Unable to add hstate %s", h->name); + } + +#ifdef CONFIG_NUMA + hugetlb_sysfs_initialized = true; +#endif + hugetlb_register_all_nodes(); +} + static int __init hugetlb_init(void) { int i; @@ -4071,7 +4252,6 @@ static int __init hugetlb_init(void) report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); hugetlb_cgroup_file_init(); #ifdef CONFIG_SMP @@ -4116,7 +4296,7 @@ void __init hugetlb_add_hstate(unsigned int order) h->next_nid_to_alloc = first_memory_node; h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", - huge_page_size(h)/1024); + huge_page_size(h)/SZ_1K); parsed_hstate = h; } @@ -4131,11 +4311,11 @@ static void __init hugepages_clear_pages_in_node(void) if (!hugetlb_max_hstate) { default_hstate_max_huge_pages = 0; memset(default_hugepages_in_node, 0, - MAX_NUMNODES * sizeof(unsigned int)); + sizeof(default_hugepages_in_node)); } else { parsed_hstate->max_huge_pages = 0; memset(parsed_hstate->max_huge_pages_node, 0, - MAX_NUMNODES * sizeof(unsigned int)); + sizeof(parsed_hstate->max_huge_pages_node)); } } @@ -4330,18 +4510,34 @@ static int __init default_hugepagesz_setup(char *s) } __setup("default_hugepagesz=", 
default_hugepagesz_setup); +static nodemask_t *policy_mbind_nodemask(gfp_t gfp) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol = get_task_policy(current); + + /* + * Only enforce MPOL_BIND policy which overlaps with cpuset policy + * (from policy_nodemask) specifically for hugetlb case + */ + if (mpol->mode == MPOL_BIND && + (apply_policy_zone(mpol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) + return &mpol->nodes; +#endif + return NULL; +} + static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; - nodemask_t *mpol_allowed; + nodemask_t *mbind_nodemask; unsigned int *array = h->free_huge_pages_node; gfp_t gfp_mask = htlb_alloc_mask(h); - mpol_allowed = policy_nodemask_current(gfp_mask); - + mbind_nodemask = policy_mbind_nodemask(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mpol_allowed || node_isset(node, *mpol_allowed)) + if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) nr += array[node]; } @@ -4570,6 +4766,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) struct resv_map *resv = vma_resv_map(vma); /* + * HPAGE_RESV_OWNER indicates a private mapping. * This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA @@ -4581,16 +4778,38 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); } + + /* + * vma_lock structure for sharable mappings is vma specific. + * Clear old pointer (if copied via vm_area_dup) and allocate + * new structure. Before clearing, make sure vma_lock is not + * for this vma. + */ + if (vma->vm_flags & VM_MAYSHARE) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock) { + if (vma_lock->vma != vma) { + vma->vm_private_data = NULL; + hugetlb_vma_lock_alloc(vma); + } else + pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); + } else + hugetlb_vma_lock_alloc(vma); + } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); - struct resv_map *resv = vma_resv_map(vma); + struct resv_map *resv; struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; + hugetlb_vma_lock_free(vma); + + resv = vma_resv_map(vma); if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; @@ -4713,7 +4932,6 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr hugepage_add_new_anon_rmap(new_page, vma, addr); set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); - ClearHPageRestoreReserve(new_page); SetHPageMigratable(new_page); } @@ -4721,14 +4939,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { - pte_t *src_pte, *dst_pte, entry, dst_entry; + pte_t *src_pte, *dst_pte, entry; struct page *ptepage; unsigned long addr; bool cow = is_cow_mapping(src_vma->vm_flags); struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); - struct address_space *mapping = src_vma->vm_file->f_mapping; struct mmu_notifier_range range; unsigned long last_addr_mask; int ret = 0; @@ -4742,12 +4959,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 
raw_write_seqcount_begin(&src->write_protect_seq); } else { /* - * For shared mappings i_mmap_rwsem must be held to call - * huge_pte_alloc, otherwise the returned ptep could go - * away if part of a shared pmd and another thread calls - * huge_pmd_unshare. + * For shared mappings the vma lock must be held before + * calling huge_pte_offset in the src vma. Otherwise, the + * returned ptep could go away if part of a shared pmd and + * another thread calls huge_pmd_unshare. */ - i_mmap_lock_read(mapping); + hugetlb_vma_lock_read(src_vma); } last_addr_mask = hugetlb_mask_last_page(h); @@ -4766,15 +4983,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, /* * If the pagetables are shared don't copy or take references. - * dst_pte == src_pte is the common case of src/dest sharing. * + * dst_pte == src_pte is the common case of src/dest sharing. * However, src could have 'unshared' and dst shares with - * another vma. If dst_pte !none, this implies sharing. - * Check here before taking page table lock, and once again - * after taking the lock below. + * another vma. So page_count of ptep page is checked instead + * to reliably determine whether pte is shared. */ - dst_entry = huge_ptep_get(dst_pte); - if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) { + if (page_count(virt_to_page(dst_pte)) > 1) { addr |= last_addr_mask; continue; } @@ -4783,13 +4998,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); - dst_entry = huge_ptep_get(dst_pte); again: - if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { + if (huge_pte_none(entry)) { /* - * Skip if src entry none. Also, skip in the - * unlikely case dst entry !none as this implies - * sharing with another vma. + * Skip if src entry none. 
*/ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { @@ -4868,7 +5080,7 @@ again: restore_reserve_on_error(h, dst_vma, addr, new); put_page(new); - /* dst_entry won't change as in child */ + /* huge_ptep of dst_pte won't change as in child */ goto again; } hugetlb_install_page(dst_vma, dst_pte, addr, new); @@ -4900,7 +5112,7 @@ again: raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } else { - i_mmap_unlock_read(mapping); + hugetlb_vma_unlock_read(src_vma); } return ret; @@ -4959,6 +5171,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ + hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = huge_pte_offset(mm, old_addr, sz); @@ -4990,6 +5203,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, flush_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); + hugetlb_vma_unlock_write(vma); return len + old_addr - old_end; } @@ -5006,7 +5220,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - struct mmu_notifier_range range; unsigned long last_addr_mask; bool force_flush = false; @@ -5021,13 +5234,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); - /* - * If sharing possible, alert mmu notifiers of worst case. - */ - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, - end); - adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); - mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { @@ -5112,7 +5318,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct if (ref_page) break; } - mmu_notifier_invalidate_range_end(&range); tlb_end_vma(tlb, vma); /* @@ -5137,29 +5342,46 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { + hugetlb_vma_lock_write(vma); + i_mmap_lock_write(vma->vm_file->f_mapping); + + /* mmu notification performed in caller */ __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); - /* - * Clear this flag so that x86's huge_pmd_share page_table_shareable - * test will fail on a vma being torn down, and not grab a page table - * on its way out. We're lucky that the flag has such an appropriate - * name, and can in fact be safely cleared here. We could clear it - * before the __unmap_hugepage_range above, but all that's necessary - * is to clear it before releasing the i_mmap_rwsem. This works - * because in the context this is called, the VMA is about to be - * destroyed and the i_mmap_rwsem is held. - */ - vma->vm_flags &= ~VM_MAYSHARE; + if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ + /* + * Unlock and free the vma lock before releasing i_mmap_rwsem. + * When the vma_lock is freed, this makes the vma ineligible + * for pmd sharing. And, i_mmap_rwsem is required to set up + * pmd sharing. This is important as page tables for this + * unmapped range will be asynchronously deleted. If the page + * tables are shared, there will be issues when accessed by + * someone else.
+ */ + __hugetlb_vma_unlock_write_free(vma); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } else { + i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); + } } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { + struct mmu_notifier_range range; struct mmu_gather tlb; + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + start, end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); + __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); + + mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } @@ -5238,9 +5460,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; - VM_BUG_ON(unshare && (flags & FOLL_WRITE)); - VM_BUG_ON(!unshare && !(flags & FOLL_WRITE)); - /* * hugetlb does not support FOLL_FORCE-style write faults that keep the * PTE mapped R/O such as maybe_mkwrite() would do. @@ -5250,8 +5469,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { - if (unlikely(unshare)) - return 0; set_huge_ptep_writable(vma, haddr, ptep); return 0; } @@ -5314,11 +5531,10 @@ retry_avoidcopy: u32 hash; put_page(old_page); - BUG_ON(huge_pte_none(pte)); /* - * Drop hugetlb_fault_mutex and i_mmap_rwsem before - * unmapping. unmapping needs to hold i_mmap_rwsem - * in write mode. Dropping i_mmap_rwsem in read mode + * Drop hugetlb_fault_mutex and vma_lock before + * unmapping. unmapping needs to hold vma_lock + * in write mode. Dropping vma_lock in read mode * here is OK as COW mappings do not interact with * PMD sharing. * @@ -5326,13 +5542,13 @@ retry_avoidcopy: */ idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(mapping, idx); + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); unmap_ref_private(mm, vma, old_page, haddr); - i_mmap_lock_read(mapping); mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(vma); spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && @@ -5374,8 +5590,6 @@ retry_avoidcopy: spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { - ClearHPageRestoreReserve(new_page); - /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); @@ -5406,19 +5620,6 @@ out_release_old: return ret; } -/* Return the pagecache page at a given address within a VMA */ -static struct page *hugetlbfs_pagecache_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) -{ - struct address_space *mapping; - pgoff_t idx; - - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); - - return find_lock_page(mapping, idx); -} - /* * Return whether there is a pagecache page to back given address within VMA. * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 
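The ordering spelled out in the final-unmap comment above is the crux of that change, so here is a condensed, caller's-eye sketch of it. This is illustrative only and not part of the patch; the helper names are the ones added by this series, and the sharer-side check it refers to is the vm_private_data test visible in the want_pmd_share()/page_table_shareable() hunks further down.

/*
 * Illustrative sketch (not from the patch): the lock/teardown order that
 * __unmap_hugepage_range_final() relies on for a final unmap.  Freeing the
 * vma_lock while i_mmap_rwsem is still held for write makes the dying vma
 * invisible to new pmd sharers (they test vm_private_data under
 * i_mmap_rwsem), even though its page tables are deleted asynchronously.
 */
static void final_unmap_order_sketch(struct mmu_gather *tlb,
				     struct vm_area_struct *vma,
				     unsigned long start, unsigned long end)
{
	struct address_space *mapping = vma->vm_file->f_mapping;

	hugetlb_vma_lock_write(vma);	/* no faults or huge_pmd_unshare() now */
	i_mmap_lock_write(mapping);	/* no new pmd sharing can be set up */

	__unmap_hugepage_range(tlb, vma, start, end, NULL, ZAP_FLAG_UNMAP);

	__hugetlb_vma_unlock_write_free(vma);	/* vm_private_data cleared ... */
	i_mmap_unlock_write(mapping);		/* ... before sharers can look again */
}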
@@ -5439,7 +5640,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return page != NULL; } -int huge_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx) { struct folio *folio = page_folio(page); @@ -5476,7 +5677,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, unsigned long addr, unsigned long reason) { - vm_fault_t ret; u32 hash; struct vm_fault vmf = { .vma = vma, @@ -5494,18 +5694,31 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, }; /* - * hugetlb_fault_mutex and i_mmap_rwsem must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. + * vma_lock and hugetlb_fault_mutex must be dropped before handling + * userfault. Also mmap_lock could be dropped due to handling + * userfault; any vma operation should be careful from here. */ + hugetlb_vma_unlock_read(vma); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - ret = handle_userfault(&vmf, reason); - i_mmap_lock_read(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); + return handle_userfault(&vmf, reason); +} - return ret; +/* + * Recheck pte with pgtable lock. Returns true if pte didn't change, or + * false if pte changed or is changing. + */ +static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, + pte_t *ptep, pte_t old_pte) +{ + spinlock_t *ptl; + bool same; + + ptl = huge_pte_lock(h, mm, ptep); + same = pte_same(huge_ptep_get(ptep), old_pte); + spin_unlock(ptl); + + return same; } static vm_fault_t hugetlb_no_page(struct mm_struct *mm, @@ -5523,6 +5736,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); bool new_page, new_pagecache_page = false; + u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* * Currently, we are forced to kill the process in the event the @@ -5533,28 +5747,46 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); - return ret; + goto out; } /* - * We can not race with truncation due to holding i_mmap_rwsem. - * i_size is modified when holding i_mmap_rwsem, so check here - * once for faults beyond end of file. + * Use page lock to guard against racing truncation + * before we get page_table_lock. */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto out; - -retry: new_page = false; page = find_lock_page(mapping, idx); if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { - ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MISSING); - goto out; + /* + * Since hugetlb_no_page() was examining pte + * without pgtable lock, we need to re-test under + * lock because the pte may not be stable and could + * have changed from under us. Try to detect + * either changed or still-changing ptes and retry + * properly when needed. + * + * Note that userfaultfd is actually fine with + * false positives (e.g. caused by pte changed), + * but not wrong logical events (e.g. caused by + * reading a pte during changing).
The latter can + * confuse userspace, so the strictness is very + * much preferred. E.g., MISSING event should + * never happen on the page after UFFDIO_COPY has + * correctly installed the page and returned. + */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MISSING); } page = alloc_huge_page(vma, haddr, 0); @@ -5571,11 +5803,10 @@ retry: * here. Before returning error, get ptl and make * sure there really is no pte entry. */ - ptl = huge_pte_lock(h, mm, ptep); - ret = 0; - if (huge_pte_none(huge_ptep_get(ptep))) + if (hugetlb_pte_stable(h, mm, ptep, old_pte)) ret = vmf_error(PTR_ERR(page)); - spin_unlock(ptl); + else + ret = 0; goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); @@ -5583,11 +5814,17 @@ retry: new_page = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = huge_add_to_page_cache(page, mapping, idx); + int err = hugetlb_add_to_page_cache(page, mapping, idx); if (err) { + /* + * err can't be -EEXIST which implies someone + * else consumed the reservation since hugetlb + * fault mutex is held when adding a hugetlb page + * to the page cache. So it's safe to call + * restore_reserve_on_error() here. + */ + restore_reserve_on_error(h, vma, haddr, page); put_page(page); - if (err == -EEXIST) - goto retry; goto out; } new_pagecache_page = true; @@ -5615,10 +5852,14 @@ retry: if (userfaultfd_minor(vma)) { unlock_page(page); put_page(page); - ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MINOR); - goto out; + /* See comment in userfaultfd_missing() block above */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MINOR); } } @@ -5643,10 +5884,9 @@ retry: if (!pte_same(huge_ptep_get(ptep), old_pte)) goto backout; - if (anon_rmap) { - ClearHPageRestoreReserve(page); + if (anon_rmap) hugepage_add_new_anon_rmap(page, vma, haddr); - } else + else page_dup_file_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); @@ -5676,15 +5916,17 @@ retry: unlock_page(page); out: + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: spin_unlock(ptl); backout_unlocked: - unlock_page(page); - /* restore reserve for newly allocated pages not in page cache */ if (new_page && !new_pagecache_page) restore_reserve_on_error(h, vma, haddr, page); + + unlock_page(page); put_page(page); goto out; } @@ -5745,40 +5987,41 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This serves two purposes: - * 1) It prevents huge_pmd_unshare from being called elsewhere - * and making the ptep no longer valid. - * 2) It synchronizes us with i_size modifications during truncation. + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate + * the same page in the page cache. + */ + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + /* + * Acquire vma lock before calling huge_pte_alloc and hold + * until finished with ptep.
This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. * * ptep could have already been assigned via huge_pte_offset. That * is OK, as huge_pte_alloc will return the same value unless * something has changed. */ - mapping = vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); + hugetlb_vma_lock_read(vma); ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); if (!ptep) { - i_mmap_unlock_read(mapping); + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } - /* - * Serialize hugepage allocation and instantiation, so that we don't - * get spurious allocation failures if two CPUs race to instantiate - * the same page in the page cache. - */ - idx = vma_hugecache_offset(h, vma, haddr); - hash = hugetlb_fault_mutex_hash(mapping, idx); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - entry = huge_ptep_get(ptep); /* PTE markers should be handled the same way as none pte */ - if (huge_pte_none_mostly(entry)) { - ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, + if (huge_pte_none_mostly(entry)) + /* + * hugetlb_no_page will drop vma lock and hugetlb fault + * mutex internally, which makes us return immediately. + */ + return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, entry, flags); - goto out_mutex; - } ret = 0; @@ -5808,7 +6051,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr); + pagecache_page = find_lock_page(mapping, idx); } ptl = huge_pte_lock(h, mm, ptep); @@ -5832,8 +6075,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(pagecache_page); put_page(pagecache_page); } + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); return handle_userfault(&vmf, VM_UFFD_WP); } @@ -5876,8 +6119,8 @@ out_ptl: put_page(pagecache_page); } out_mutex: + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and @@ -6005,48 +6248,35 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* * Serialization between remove_inode_hugepages() and - * huge_add_to_page_cache() below happens through the + * hugetlb_add_to_page_cache() below happens through the * hugetlb_fault_mutex_table that here must be held by * the caller. */ - ret = huge_add_to_page_cache(page, mapping, idx); + ret = hugetlb_add_to_page_cache(page, mapping, idx); if (ret) goto out_release_nounlock; page_in_pagecache = true; } - ptl = huge_pte_lockptr(h, dst_mm, dst_pte); - spin_lock(ptl); + ptl = huge_pte_lock(h, dst_mm, dst_pte); - /* - * Recheck the i_size after holding PT lock to make sure not - * to leave any page mapped (as page_mapped()) beyond the end - * of the i_size (remove_inode_hugepages() is strict about - * enforcing that). If we bail out here, we'll also leave a - * page in the radix tree in the vm_shared case beyond the end - * of the i_size, but remove_inode_hugepages() will take care - * of it as soon as we drop the hugetlb_fault_mutex_table.
- */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - ret = -EFAULT; - if (idx >= size) + ret = -EIO; + if (PageHWPoison(page)) goto out_release_unlock; - ret = -EEXIST; /* * We allow to overwrite a pte marker: consider when both MISSING|WP * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ + ret = -EEXIST; if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (page_in_pagecache) { + if (page_in_pagecache) page_dup_file_rmap(page, true); - } else { - ClearHPageRestoreReserve(page); + else hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); - } /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY @@ -6105,13 +6335,14 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, for (nr = 0; nr < refs; nr++) { if (likely(pages)) - pages[nr] = mem_map_offset(page, nr); + pages[nr] = nth_page(page, nr); if (vmas) vmas[nr] = vma; } } -static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, +static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma, + unsigned int flags, pte_t *pte, bool *unshare) { pte_t pteval = huge_ptep_get(pte); @@ -6123,13 +6354,69 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, return false; if (flags & FOLL_WRITE) return true; - if (gup_must_unshare(flags, pte_page(pteval))) { + if (gup_must_unshare(vma, flags, pte_page(pteval))) { *unshare = true; return true; } return false; } +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & huge_page_mask(h); + struct page *page = NULL; + spinlock_t *ptl; + pte_t *pte, entry; + + /* + * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via + * follow_hugetlb_page(). + */ + if (WARN_ON_ONCE(flags & FOLL_PIN)) + return NULL; + +retry: + pte = huge_pte_offset(mm, haddr, huge_page_size(h)); + if (!pte) + return NULL; + + ptl = huge_pte_lock(h, mm, pte); + entry = huge_ptep_get(pte); + if (pte_present(entry)) { + page = pte_page(entry) + + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + /* + * Note that page may be a sub-page, and with vmemmap + * optimizations the page struct may be read only. + * try_grab_page() will increase the ref count on the + * head page, so this will be OK. + * + * try_grab_page() should always be able to get the page here, + * because we hold the ptl lock and have verified pte_present(). + */ + if (try_grab_page(page, flags)) { + page = NULL; + goto out; + } + } else { + if (is_hugetlb_entry_migration(entry)) { + spin_unlock(ptl); + __migration_entry_wait_huge(pte, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). + */ + } +out: + spin_unlock(ptl); + return page; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, @@ -6196,7 +6483,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * directly from any kind of swap entries. 
*/ if (absent || - __follow_hugetlb_must_fault(flags, pte, &unshare)) { + __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) { vm_fault_t ret; unsigned int fault_flags = 0; @@ -6206,9 +6493,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; else if (unshare) fault_flags |= FAULT_FLAG_UNSHARE; - if (locked) + if (locked) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + if (flags & FOLL_INTERRUPTIBLE) + fault_flags |= FAULT_FLAG_INTERRUPTIBLE; + } if (flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; @@ -6269,7 +6559,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); if (pages || vmas) - record_subpages_vmas(mem_map_offset(page, pfn_offset), + record_subpages_vmas(nth_page(page, pfn_offset), vma, refs, likely(pages) ? pages + i : NULL, vmas ? vmas + i : NULL); @@ -6282,8 +6572,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * tables. If the huge page is present, then the tail * pages must also be present. The ptl prevents the * head page and tail pages from being rearranged in - * any way. So this page must be available at this - * point, unless the page refcount overflowed: + * any way. As this is hugetlb, the pages will never + * be p2pdma or not longterm pinable. So this page + * must be available at this point, unless the page + * refcount overflowed: */ if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, flags))) { @@ -6340,8 +6632,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); - last_addr_mask = hugetlb_mask_last_page(h); + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); + last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { spinlock_t *ptl; ptep = huge_pte_offset(mm, address, psize); @@ -6440,6 +6733,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); return pages << h->order; @@ -6465,6 +6759,12 @@ bool hugetlb_reserve_pages(struct inode *inode, } /* + * vma specific semaphore used for pmd sharing and fault/truncation + * synchronization + */ + hugetlb_vma_lock_alloc(vma); + + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page * without using reserves @@ -6487,12 +6787,11 @@ bool hugetlb_reserve_pages(struct inode *inode, resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to, ®ions_needed); - } else { /* Private mapping. */ resv_map = resv_map_alloc(); if (!resv_map) - return false; + goto out_err; chg = to - from; @@ -6587,6 +6886,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: + hugetlb_vma_lock_free(vma); if (!vma || vma->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. @@ -6656,35 +6956,37 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, /* * match the virtual addresses, permission and the alignment of the * page table page. + * + * Also, vma_lock (vm_private_data) is required for sharing. 
*/ if (pmd_index(addr) != pmd_index(saddr) || vm_flags != svm_flags || - !range_in_vma(svma, sbase, s_end)) + !range_in_vma(svma, sbase, s_end) || + !svma->vm_private_data) return 0; return saddr; } -static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) -{ - unsigned long base = addr & PUD_MASK; - unsigned long end = base + PUD_SIZE; - - /* - * check on proper vm_flags and page table alignment - */ - if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) - return true; - return false; -} - bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { + unsigned long start = addr & PUD_MASK; + unsigned long end = start + PUD_SIZE; + #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; #endif - return vma_shareable(vma, addr); + /* + * check on proper vm_flags and page table alignment + */ + if (!(vma->vm_flags & VM_MAYSHARE)) + return false; + if (!vma->vm_private_data) /* vma lock required for sharing */ + return false; + if (!range_in_vma(vma, start, end)) + return false; + return true; } /* @@ -6718,12 +7020,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. - * - * This routine must be called with i_mmap_rwsem held in at least read mode if - * sharing is possible. For hugetlbfs, this prevents removal of any page - * table entries associated with the address space. This is important as we - * are setting up sharing based on existing page table entries (mappings). + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_rwsem section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -6737,7 +7037,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *pte; spinlock_t *ptl; - i_mmap_assert_locked(mapping); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -6767,6 +7067,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_unlock_read(mapping); return pte; } @@ -6777,7 +7078,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * Called with page table lock held and i_mmap_rwsem held in write mode. + * Called with page table lock held. 
* * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user @@ -6790,6 +7091,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); + hugetlb_vma_assert_locked(vma); BUG_ON(page_count(virt_to_page(ptep)) == 0); if (page_count(virt_to_page(ptep)) == 1) return 0; @@ -6801,6 +7103,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, } #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ + pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { @@ -6928,123 +7231,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. */ -struct page * __weak -follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - -struct page * __weak -follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, int flags, int pdshift) -{ - WARN(1, "hugepd follow called with no support for hugepage directory format\n"); - return NULL; -} - -struct page * __weak -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int flags) -{ - struct page *page = NULL; - spinlock_t *ptl; - pte_t pte; - - /* - * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via - * follow_hugetlb_page(). - */ - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; - -retry: - ptl = pmd_lockptr(mm, pmd); - spin_lock(ptl); - /* - * make sure that the address range covered by this pmd is not - * unmapped from other threads. - */ - if (!pmd_huge(*pmd)) - goto out; - pte = huge_ptep_get((pte_t *)pmd); - if (pte_present(pte)) { - page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); - /* - * try_grab_page() should always succeed here, because: a) we - * hold the pmd (ptl) lock, and b) we've just checked that the - * huge pmd (head) page is present in the page tables. The ptl - * prevents the head page and tail pages from being rearranged - * in any way. So this page must be available at this point, - * unless the page refcount overflowed: - */ - if (WARN_ON_ONCE(!try_grab_page(page, flags))) { - page = NULL; - goto out; - } - } else { - if (is_hugetlb_entry_migration(pte)) { - spin_unlock(ptl); - __migration_entry_wait_huge((pte_t *)pmd, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). - */ - } -out: - spin_unlock(ptl); - return page; -} - -struct page * __weak -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) -{ - struct page *page = NULL; - spinlock_t *ptl; - pte_t pte; - - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; - -retry: - ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud); - if (!pud_huge(*pud)) - goto out; - pte = huge_ptep_get((pte_t *)pud); - if (pte_present(pte)) { - page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(!try_grab_page(page, flags))) { - page = NULL; - goto out; - } - } else { - if (is_hugetlb_entry_migration(pte)) { - spin_unlock(ptl); - __migration_entry_wait(mm, (pte_t *)pud, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). 
- */ - } -out: - spin_unlock(ptl); - return page; -} - -struct page * __weak -follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) -{ - if (flags & (FOLL_GET | FOLL_PIN)) - return NULL; - - return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); -} - int isolate_hugetlb(struct page *page, struct list_head *list) { int ret = 0; @@ -7063,7 +7249,7 @@ unlock: return ret; } -int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) { int ret = 0; @@ -7073,7 +7259,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) *hugetlb = true; if (HPageFreed(page)) ret = 0; - else if (HPageMigratable(page)) + else if (HPageMigratable(page) || unpoison) ret = get_page_unless_zero(page); else ret = -EBUSY; @@ -7082,12 +7268,13 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) return ret; } -int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { int ret; spin_lock_irq(&hugetlb_lock); - ret = __get_huge_page_for_hwpoison(pfn, flags); + ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); spin_unlock_irq(&hugetlb_lock); return ret; } @@ -7101,15 +7288,15 @@ void putback_active_hugepage(struct page *page) put_page(page); } -void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) +void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { - struct hstate *h = page_hstate(oldpage); + struct hstate *h = folio_hstate(old_folio); - hugetlb_cgroup_migrate(oldpage, newpage); - set_page_owner_migrate_reason(newpage, reason); + hugetlb_cgroup_migrate(old_folio, new_folio); + set_page_owner_migrate_reason(&new_folio->page, reason); /* - * transfer temporary state of the new huge page. This is + * transfer temporary state of the new hugetlb folio. This is * reverse to other transitions because the newpage is going to * be final while the old one will be freed so it takes over * the temporary status. @@ -7118,12 +7305,13 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) * here as well otherwise the global surplus count will not match * the per-node's. */ - if (HPageTemporary(newpage)) { - int old_nid = page_to_nid(oldpage); - int new_nid = page_to_nid(newpage); + if (folio_test_hugetlb_temporary(new_folio)) { + int old_nid = folio_nid(old_folio); + int new_nid = folio_nid(new_folio); + + folio_set_hugetlb_temporary(old_folio); + folio_clear_hugetlb_temporary(new_folio); - SetHPageTemporary(oldpage); - ClearHPageTemporary(newpage); /* * There is no need to transfer the per-node surplus state @@ -7171,6 +7359,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, start, end); mmu_notifier_invalidate_range_start(&range); + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { ptep = huge_pte_offset(mm, address, sz); @@ -7182,6 +7371,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) } flush_hugetlb_tlb_range(vma, start, end); i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); /* * No need to call mmu_notifier_invalidate_range(), see * Documentation/mm/mmu_notifier.rst. 
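The large block of follow_huge_addr/pd/pmd/pud/pgd removals above is not replaced one-for-one; lookup is consolidated into the hugetlb_follow_page_mask() helper added earlier in this patch. The caller lives in mm/gup.c and is not part of this diff, so the fragment below is only an assumption about its shape (follow_page_sketch() is a made-up name), included to make the consolidation concrete.

/*
 * Hypothetical caller-side sketch (not part of this diff): generic GUP is
 * assumed to hand hugetlb VMAs straight to hugetlb_follow_page_mask()
 * instead of dispatching through the removed follow_huge_*() hooks.
 */
static struct page *follow_page_sketch(struct vm_area_struct *vma,
				       unsigned long address,
				       unsigned int flags)
{
	if (is_vm_hugetlb_page(vma))
		return hugetlb_follow_page_mask(vma, address, flags);

	/* normal page table walk for non-hugetlb VMAs elided */
	return NULL;
}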
@@ -7332,7 +7522,7 @@ void __init hugetlb_cma_reserve(int order) hugetlb_cma_size = 0; } -void __init hugetlb_cma_check(void) +static void __init hugetlb_cma_check(void) { if (!hugetlb_cma_size || cma_reserve_called) return;
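Finally, a stand-alone illustration of the demote accounting fix near the top of this section: when demote_free_huge_page() splits one huge page, the target pool now grows by the ratio of the two huge page sizes rather than by pages_per_huge_page(h). The program below is ordinary userspace C, not kernel code; the 1 GiB source hstate, 2 MiB target hstate and 4 KiB base page are assumed example values.

#include <stdio.h>

int main(void)
{
	unsigned long base_page = 4096;		/* assumed 4 KiB base page */
	unsigned long src_size  = 1UL << 30;	/* 1 GiB source hstate */
	unsigned long dst_size  = 2UL << 20;	/* 2 MiB target hstate */

	unsigned long src_pages = src_size / base_page;	/* pages_per_huge_page(h) */
	unsigned long dst_pages = dst_size / base_page;	/* pages_per_huge_page(target) */

	unsigned long src_pool = 8;	/* h->max_huge_pages before the demote */
	unsigned long dst_pool = 0;	/* target_hstate->max_huge_pages */

	/* one demoted page leaves the source pool ... */
	src_pool -= 1;
	/* ... and adds size-ratio pages to the target pool: 262144 / 512 = 512 */
	dst_pool += src_pages / dst_pages;

	printf("source pool: %lu, target pool: %lu\n", src_pool, dst_pool);
	return 0;
}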