Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  206
1 file changed, 115 insertions, 91 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f154019e6b84..6da626bfb52e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1489,7 +1489,6 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
 			set_page_refcounted(p);
 	}

-	folio_set_order(folio, 0);
 	__folio_clear_head(folio);
 }

@@ -1580,9 +1579,37 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,
 						unsigned int order) { }
 #endif

+static inline void __clear_hugetlb_destructor(struct hstate *h,
+						struct folio *folio)
+{
+	lockdep_assert_held(&hugetlb_lock);
+
+	/*
+	 * Very subtle
+	 *
+	 * For non-gigantic pages set the destructor to the normal compound
+	 * page dtor.  This is needed in case someone takes an additional
+	 * temporary ref to the page, and freeing is delayed until they drop
+	 * their reference.
+	 *
+	 * For gigantic pages set the destructor to the null dtor.  This
+	 * destructor will never be called.  Before freeing the gigantic
+	 * page destroy_compound_gigantic_folio will turn the folio into a
+	 * simple group of pages.  After this the destructor does not
+	 * apply.
+	 *
+	 */
+	if (hstate_is_gigantic(h))
+		folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
+	else
+		folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
+}
+
 /*
- * Remove hugetlb folio from lists, and update dtor so that the folio appears
- * as just a compound page.
+ * Remove hugetlb folio from lists.
+ * If vmemmap exists for the folio, update dtor so that the folio appears
+ * as just a compound page.  Otherwise, wait until after allocating vmemmap
+ * to update dtor.
  *
  * A reference is held on the folio, except in the case of demote.
  *
@@ -1613,31 +1640,19 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
 	}

 	/*
-	 * Very subtle
-	 *
-	 * For non-gigantic pages set the destructor to the normal compound
-	 * page dtor.  This is needed in case someone takes an additional
-	 * temporary ref to the page, and freeing is delayed until they drop
-	 * their reference.
-	 *
-	 * For gigantic pages set the destructor to the null dtor.  This
-	 * destructor will never be called.  Before freeing the gigantic
-	 * page destroy_compound_gigantic_folio will turn the folio into a
-	 * simple group of pages.  After this the destructor does not
-	 * apply.
-	 *
-	 * This handles the case where more than one ref is held when and
-	 * after update_and_free_hugetlb_folio is called.
-	 *
-	 * In the case of demote we do not ref count the page as it will soon
-	 * be turned into a page of smaller size.
+	 * We can only clear the hugetlb destructor after allocating vmemmap
+	 * pages.  Otherwise, someone (memory error handling) may try to write
+	 * to tail struct pages.
+	 */
+	if (!folio_test_hugetlb_vmemmap_optimized(folio))
+		__clear_hugetlb_destructor(h, folio);
+
+	/*
+	 * In the case of demote we do not ref count the page as it will soon
+	 * be turned into a page of smaller size.
 	 */
 	if (!demote)
 		folio_ref_unfreeze(folio, 1);
-	if (hstate_is_gigantic(h))
-		folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
-	else
-		folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);

 	h->nr_huge_pages--;
 	h->nr_huge_pages_node[nid]--;
@@ -1706,6 +1721,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
 {
 	int i;
 	struct page *subpage;
+	bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);

 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;
@@ -1736,6 +1752,16 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
 	if (unlikely(folio_test_hwpoison(folio)))
 		folio_clear_hugetlb_hwpoison(folio);

+	/*
+	 * If vmemmap pages were allocated above, then we need to clear the
+	 * hugetlb destructor under the hugetlb lock.
+	 */
+	if (clear_dtor) {
+		spin_lock_irq(&hugetlb_lock);
+		__clear_hugetlb_destructor(h, folio);
+		spin_unlock_irq(&hugetlb_lock);
+	}
+
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		subpage = folio_page(folio, i);
 		subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1951,9 +1977,6 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
 	struct page *p;

 	__folio_clear_reserved(folio);
-	__folio_set_head(folio);
-	/* we rely on prep_new_hugetlb_folio to set the destructor */
-	folio_set_order(folio, order);
 	for (i = 0; i < nr_pages; i++) {
 		p = folio_page(folio, i);

@@ -1999,6 +2022,9 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
 		if (i != 0)
 			set_compound_head(p, &folio->page);
 	}
+	__folio_set_head(folio);
+	/* we rely on prep_new_hugetlb_folio to set the destructor */
+	folio_set_order(folio, order);
 	atomic_set(&folio->_entire_mapcount, -1);
 	atomic_set(&folio->_nr_pages_mapped, 0);
 	atomic_set(&folio->_pincount, 0);
@@ -2017,8 +2043,6 @@ out_error:
 		p = folio_page(folio, j);
 		__ClearPageReserved(p);
 	}
-	folio_set_order(folio, 0);
-	__folio_clear_head(folio);
 	return false;
 }

@@ -5016,7 +5040,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *src_vma)
 {
 	pte_t *src_pte, *dst_pte, entry;
-	struct page *ptepage;
+	struct folio *pte_folio;
 	unsigned long addr;
 	bool cow = is_cow_mapping(src_vma->vm_flags);
 	struct hstate *h = hstate_vma(src_vma);
@@ -5115,8 +5139,8 @@ again:
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		} else {
 			entry = huge_ptep_get(src_pte);
-			ptepage = pte_page(entry);
-			get_page(ptepage);
+			pte_folio = page_folio(pte_page(entry));
+			folio_get(pte_folio);

 			/*
 			 * Failing to duplicate the anon rmap is a rare case
@@ -5128,10 +5152,10 @@ again:
 			 * need to be without the pgtable locks since we could
 			 * sleep during the process.
 			 */
-			if (!PageAnon(ptepage)) {
-				page_dup_file_rmap(ptepage, true);
-			} else if (page_try_dup_anon_rmap(ptepage, true,
-							  src_vma)) {
+			if (!folio_test_anon(pte_folio)) {
+				page_dup_file_rmap(&pte_folio->page, true);
+			} else if (page_try_dup_anon_rmap(&pte_folio->page,
+							  true, src_vma)) {
 				pte_t src_pte_old = entry;
 				struct folio *new_folio;

@@ -5140,14 +5164,14 @@ again:
 				/* Do not use reserve as it's private owned */
 				new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
 				if (IS_ERR(new_folio)) {
-					put_page(ptepage);
+					folio_put(pte_folio);
 					ret = PTR_ERR(new_folio);
 					break;
 				}
 				ret = copy_user_large_folio(new_folio,
-							    page_folio(ptepage),
-							    addr, dst_vma);
-				put_page(ptepage);
+							    pte_folio,
+							    addr, dst_vma);
+				folio_put(pte_folio);
 				if (ret) {
 					folio_put(new_folio);
 					break;
@@ -5540,7 +5564,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 	const bool unshare = flags & FAULT_FLAG_UNSHARE;
 	pte_t pte = huge_ptep_get(ptep);
 	struct hstate *h = hstate_vma(vma);
-	struct page *old_page;
+	struct folio *old_folio;
 	struct folio *new_folio;
 	int outside_reserve = 0;
 	vm_fault_t ret = 0;
@@ -5571,7 +5595,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 	}

-	old_page = pte_page(pte);
+	old_folio = page_folio(pte_page(pte));

 	delayacct_wpcopy_start();

@@ -5580,17 +5604,17 @@ retry_avoidcopy:
 	 * If no-one else is actually using this page, we're the exclusive
 	 * owner and can reuse this page.
 	 */
-	if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
-		if (!PageAnonExclusive(old_page))
-			page_move_anon_rmap(old_page, vma);
+	if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
+		if (!PageAnonExclusive(&old_folio->page))
+			page_move_anon_rmap(&old_folio->page, vma);
 		if (likely(!unshare))
 			set_huge_ptep_writable(vma, haddr, ptep);

 		delayacct_wpcopy_end();
 		return 0;
 	}
-	VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
-		       old_page);
+	VM_BUG_ON_PAGE(folio_test_anon(old_folio) &&
+		       PageAnonExclusive(&old_folio->page), &old_folio->page);

 	/*
 	 * If the process that created a MAP_PRIVATE mapping is about to
@@ -5602,10 +5626,10 @@ retry_avoidcopy:
 	 * of the full address range.
 	 */
 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
-			page_folio(old_page) != pagecache_folio)
+			old_folio != pagecache_folio)
 		outside_reserve = 1;

-	get_page(old_page);
+	folio_get(old_folio);

 	/*
 	 * Drop page table lock as buddy allocator may be called. It will
@@ -5627,7 +5651,7 @@ retry_avoidcopy:
 			pgoff_t idx;
 			u32 hash;

-			put_page(old_page);
+			folio_put(old_folio);
 			/*
 			 * Drop hugetlb_fault_mutex and vma_lock before
 			 * unmapping.  unmapping needs to hold vma_lock
@@ -5642,7 +5666,7 @@ retry_avoidcopy:
 			hugetlb_vma_unlock_read(vma);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);

-			unmap_ref_private(mm, vma, old_page, haddr);
+			unmap_ref_private(mm, vma, &old_folio->page, haddr);

 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 			hugetlb_vma_lock_read(vma);
@@ -5672,7 +5696,7 @@ retry_avoidcopy:
 		goto out_release_all;
 	}

-	if (copy_user_large_folio(new_folio, page_folio(old_page), address, vma)) {
+	if (copy_user_large_folio(new_folio, old_folio, address, vma)) {
 		ret = VM_FAULT_HWPOISON_LARGE;
 		goto out_release_all;
 	}
@@ -5694,14 +5718,14 @@ retry_avoidcopy:
 		/* Break COW or unshare */
 		huge_ptep_clear_flush(vma, haddr, ptep);
 		mmu_notifier_invalidate_range(mm, range.start, range.end);
-		page_remove_rmap(old_page, vma, true);
+		page_remove_rmap(&old_folio->page, vma, true);
 		hugepage_add_new_anon_rmap(new_folio, vma, haddr);
 		if (huge_pte_uffd_wp(pte))
 			newpte = huge_pte_mkuffd_wp(newpte);
 		set_huge_pte_at(mm, haddr, ptep, newpte);
 		folio_set_hugetlb_migratable(new_folio);
 		/* Make the old page be freed below */
-		new_folio = page_folio(old_page);
+		new_folio = old_folio;
 	}
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(&range);
@@ -5710,11 +5734,11 @@ out_release_all:
 	 * No restore in case of successful pagetable update (Break COW or
 	 * unshare)
 	 */
-	if (new_folio != page_folio(old_page))
+	if (new_folio != old_folio)
 		restore_reserve_on_error(h, vma, haddr, new_folio);
 	folio_put(new_folio);
out_release_old:
-	put_page(old_page);
+	folio_put(old_folio);

 	spin_lock(ptl); /* Caller expects lock to be held */

@@ -5731,13 +5755,13 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	pgoff_t idx = vma_hugecache_offset(h, vma, address);
-	bool present;
-
-	rcu_read_lock();
-	present = page_cache_next_miss(mapping, idx, 1) != idx;
-	rcu_read_unlock();
+	struct folio *folio;

-	return present;
+	folio = filemap_get_folio(mapping, idx);
+	if (IS_ERR(folio))
+		return false;
+	folio_put(folio);
+	return true;
 }

 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
@@ -6062,7 +6086,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	vm_fault_t ret;
 	u32 hash;
 	pgoff_t idx;
-	struct page *page = NULL;
+	struct folio *folio = NULL;
 	struct folio *pagecache_folio = NULL;
 	struct hstate *h = hstate_vma(vma);
 	struct address_space *mapping;
@@ -6179,16 +6203,16 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	/*
 	 * hugetlb_wp() requires page locks of pte_page(entry) and
 	 * pagecache_folio, so here we need take the former one
-	 * when page != pagecache_folio or !pagecache_folio.
+	 * when folio != pagecache_folio or !pagecache_folio.
 	 */
-	page = pte_page(entry);
-	if (page_folio(page) != pagecache_folio)
-		if (!trylock_page(page)) {
+	folio = page_folio(pte_page(entry));
+	if (folio != pagecache_folio)
+		if (!folio_trylock(folio)) {
 			need_wait_lock = 1;
 			goto out_ptl;
 		}

-	get_page(page);
+	folio_get(folio);

 	if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
 		if (!huge_pte_write(entry)) {
@@ -6204,9 +6228,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 						flags & FAULT_FLAG_WRITE))
 		update_mmu_cache(vma, haddr, ptep);
out_put_page:
-	if (page_folio(page) != pagecache_folio)
-		unlock_page(page);
-	put_page(page);
+	if (folio != pagecache_folio)
+		folio_unlock(folio);
+	folio_put(folio);
out_ptl:
 	spin_unlock(ptl);

@@ -6225,7 +6249,7 @@ out_mutex:
 	 * here without taking refcount.
 	 */
 	if (need_wait_lock)
-		wait_on_page_locked(page);
+		folio_wait_locked(folio);
 	return ret;
 }

@@ -6425,17 +6449,14 @@ out_release_nounlock:
 }
 #endif /* CONFIG_USERFAULTFD */

-static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
-				 int refs, struct page **pages,
-				 struct vm_area_struct **vmas)
+static void record_subpages(struct page *page, struct vm_area_struct *vma,
+			    int refs, struct page **pages)
 {
 	int nr;

 	for (nr = 0; nr < refs; nr++) {
 		if (likely(pages))
 			pages[nr] = nth_page(page, nr);
-		if (vmas)
-			vmas[nr] = vma;
 	}
 }

@@ -6508,9 +6529,9 @@ out_unlock:
 }

 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			 struct page **pages, struct vm_area_struct **vmas,
-			 unsigned long *position, unsigned long *nr_pages,
-			 long i, unsigned int flags, int *locked)
+			 struct page **pages, unsigned long *position,
+			 unsigned long *nr_pages, long i, unsigned int flags,
+			 int *locked)
 {
 	unsigned long pfn_offset;
 	unsigned long vaddr = *position;
@@ -6638,7 +6659,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * If subpage information not requested, update counters
 		 * and skip the same_page loop below.
 		 */
-		if (!pages && !vmas && !pfn_offset &&
+		if (!pages && !pfn_offset &&
 		    (vaddr + huge_page_size(h) < vma->vm_end) &&
 		    (remainder >= pages_per_huge_page(h))) {
 			vaddr += huge_page_size(h);
@@ -6653,11 +6674,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
 			    (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);

-		if (pages || vmas)
-			record_subpages_vmas(nth_page(page, pfn_offset),
-					     vma, refs,
-					     likely(pages) ? pages + i : NULL,
-					     vmas ? vmas + i : NULL);
+		if (pages)
+			record_subpages(nth_page(page, pfn_offset),
+					vma, refs,
+					likely(pages) ? pages + i : NULL);

 		if (pages) {
 			/*
@@ -7137,7 +7157,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long saddr;
 	pte_t *spte = NULL;
 	pte_t *pte;
-	spinlock_t *ptl;

 	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
@@ -7158,7 +7177,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!spte)
 		goto out;

-	ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
+	spin_lock(&mm->page_table_lock);
 	if (pud_none(*pud)) {
 		pud_populate(mm, pud,
 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
@@ -7166,7 +7185,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else {
 		put_page(virt_to_page(spte));
 	}
-	spin_unlock(ptl);
+	spin_unlock(&mm->page_table_lock);
out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
 	i_mmap_unlock_read(mapping);
@@ -7254,7 +7273,12 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			pte = (pte_t *)pmd_alloc(mm, pud, addr);
 		}
 	}
-	BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
+
+	if (pte) {
+		pte_t pteval = ptep_get_lockless(pte);
+
+		BUG_ON(pte_present(pteval) && !pte_huge(pteval));
+	}

 	return pte;
 }