mm/gup: handle hugepd for follow_page()

Hugepd is only used in PowerPC so far on 4K page size kernels where hash
mmu is used.  follow_page_mask() used to leverage hugetlb APIs to access
hugepd entries.  Teach follow_page_mask() itself on hugepd.

With previous refactors on fast-gup gup_huge_pd(), most of the code can be
leveraged.  There's something not needed for follow page, for example,
gup_hugepte() tries to detect pgtable entry change which will never happen
with slow gup (which has the pgtable lock held), but that's not a problem
to check.

Since follow_page() always only fetch one page, set the end to "address +
PAGE_SIZE" should suffice.  We will still do the pgtable walk once for
each hugetlb page by setting ctx->page_mask properly.

One thing worth mentioning is that some level of pgtable's _bad() helper
will report is_hugepd() entries as TRUE on Power8 hash MMUs.  I think it
at least applies to PUD on Power8 with 4K pgsize.  It means feeding a
hugepd entry to pud_bad() will report a false positive.  Let's leave that
for now because it can be arch-specific where I am a bit declined to
touch.  In this patch it's not a problem as long as hugepd is detected
before any bad pgtable entries.

To allow slow gup like follow_*_page() to access hugepd helpers, hugepd
codes are moved to the top.  Besides that, the helper record_subpages()
will be used by either hugepd or fast-gup now.  To avoid "unused function"
warnings we must provide a "#ifdef" for it, unfortunately.

Link: https://lkml.kernel.org/r/20240327152332.950956-13-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Tested-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Jones <andrew.jones@linux.dev>
Cc: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Mike Rapoport (IBM)" <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Peter Xu 2024-03-27 11:23:31 -04:00 committed by Andrew Morton
parent 4418c522f6
commit a12083d721

269
mm/gup.c
View file

@ -500,6 +500,149 @@ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
}
#ifdef CONFIG_MMU
#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_FAST_GUP)
static int record_subpages(struct page *page, unsigned long sz,
unsigned long addr, unsigned long end,
struct page **pages)
{
struct page *start_page;
int nr;
start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
pages[nr] = nth_page(start_page, nr);
return nr;
}
#endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_FAST_GUP */
#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
{
unsigned long __boundary = (addr + sz) & ~(sz-1);
return (__boundary - 1 < end - 1) ? __boundary : end;
}
static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
unsigned long pte_end;
struct page *page;
struct folio *folio;
pte_t pte;
int refs;
pte_end = (addr + sz) & ~(sz-1);
if (pte_end < end)
end = pte_end;
pte = huge_ptep_get(ptep);
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
return 0;
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
refs = record_subpages(page, sz, addr, end, pages + *nr);
folio = try_grab_folio(page, refs, flags);
if (!folio)
return 0;
if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
gup_put_folio(folio, refs, flags);
return 0;
}
if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
}
*nr += refs;
folio_set_referenced(folio);
return 1;
}
/*
* NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
* systems on Power, which does not have issue with folio writeback against
* GUP updates. When hugepd will be extended to support non-hugetlbfs or
* even anonymous memory, we need to do extra check as what we do with most
* of the other folios. See writable_file_mapping_allowed() and
* gup_fast_folio_allowed() for more information.
*/
static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
pte_t *ptep;
unsigned long sz = 1UL << hugepd_shift(hugepd);
unsigned long next;
ptep = hugepte_offset(hugepd, addr, pdshift);
do {
next = hugepte_addr_end(addr, end, sz);
if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
return 0;
} while (ptep++, addr = next, addr != end);
return 1;
}
static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
unsigned long addr, unsigned int pdshift,
unsigned int flags,
struct follow_page_context *ctx)
{
struct page *page;
struct hstate *h;
spinlock_t *ptl;
int nr = 0, ret;
pte_t *ptep;
/* Only hugetlb supports hugepd */
if (WARN_ON_ONCE(!is_vm_hugetlb_page(vma)))
return ERR_PTR(-EFAULT);
h = hstate_vma(vma);
ptep = hugepte_offset(hugepd, addr, pdshift);
ptl = huge_pte_lock(h, vma->vm_mm, ptep);
ret = gup_huge_pd(hugepd, addr, pdshift, addr + PAGE_SIZE,
flags, &page, &nr);
spin_unlock(ptl);
if (ret) {
WARN_ON_ONCE(nr != 1);
ctx->page_mask = (1U << huge_page_order(h)) - 1;
return page;
}
return NULL;
}
#else /* CONFIG_ARCH_HAS_HUGEPD */
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
return 0;
}
static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
unsigned long addr, unsigned int pdshift,
unsigned int flags,
struct follow_page_context *ctx)
{
return NULL;
}
#endif /* CONFIG_ARCH_HAS_HUGEPD */
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags, unsigned long address)
{
@ -868,6 +1011,9 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
return no_page_table(vma, flags, address);
if (!pmd_present(pmdval))
return no_page_table(vma, flags, address);
if (unlikely(is_hugepd(__hugepd(pmd_val(pmdval)))))
return follow_hugepd(vma, __hugepd(pmd_val(pmdval)),
address, PMD_SHIFT, flags, ctx);
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
@ -918,6 +1064,9 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
pud = READ_ONCE(*pudp);
if (!pud_present(pud))
return no_page_table(vma, flags, address);
if (unlikely(is_hugepd(__hugepd(pud_val(pud)))))
return follow_hugepd(vma, __hugepd(pud_val(pud)),
address, PUD_SHIFT, flags, ctx);
if (pud_leaf(pud)) {
ptl = pud_lock(mm, pudp);
page = follow_huge_pud(vma, address, pudp, flags, ctx);
@ -941,10 +1090,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
p4dp = p4d_offset(pgdp, address);
p4d = READ_ONCE(*p4dp);
if (!p4d_present(p4d))
return no_page_table(vma, flags, address);
BUILD_BUG_ON(p4d_leaf(p4d));
if (unlikely(p4d_bad(p4d)))
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d)))))
return follow_hugepd(vma, __hugepd(p4d_val(p4d)),
address, P4D_SHIFT, flags, ctx);
if (!p4d_present(p4d) || p4d_bad(p4d))
return no_page_table(vma, flags, address);
return follow_pud_mask(vma, address, p4dp, flags, ctx);
@ -994,10 +1146,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return no_page_table(vma, flags, address);
if (unlikely(is_hugepd(__hugepd(pgd_val(*pgd)))))
page = follow_hugepd(vma, __hugepd(pgd_val(*pgd)),
address, PGDIR_SHIFT, flags, ctx);
else if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
page = no_page_table(vma, flags, address);
else
page = follow_p4d_mask(vma, address, pgd, flags, ctx);
return follow_p4d_mask(vma, address, pgd, flags, ctx);
return page;
}
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
@ -2954,106 +3111,6 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
}
#endif
static int record_subpages(struct page *page, unsigned long sz,
unsigned long addr, unsigned long end,
struct page **pages)
{
struct page *start_page;
int nr;
start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
pages[nr] = nth_page(start_page, nr);
return nr;
}
#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
{
unsigned long __boundary = (addr + sz) & ~(sz-1);
return (__boundary - 1 < end - 1) ? __boundary : end;
}
static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
unsigned long pte_end;
struct page *page;
struct folio *folio;
pte_t pte;
int refs;
pte_end = (addr + sz) & ~(sz-1);
if (pte_end < end)
end = pte_end;
pte = huge_ptep_get(ptep);
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
return 0;
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
refs = record_subpages(page, sz, addr, end, pages + *nr);
folio = try_grab_folio(page, refs, flags);
if (!folio)
return 0;
if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
gup_put_folio(folio, refs, flags);
return 0;
}
if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
}
*nr += refs;
folio_set_referenced(folio);
return 1;
}
/*
* NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
* systems on Power, which does not have issue with folio writeback against
* GUP updates. When hugepd will be extended to support non-hugetlbfs or
* even anonymous memory, we need to do extra check as what we do with most
* of the other folios. See writable_file_mapping_allowed() and
* gup_fast_folio_allowed() for more information.
*/
static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
pte_t *ptep;
unsigned long sz = 1UL << hugepd_shift(hugepd);
unsigned long next;
ptep = hugepte_offset(hugepd, addr, pdshift);
do {
next = hugepte_addr_end(addr, end, sz);
if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
return 0;
} while (ptep++, addr = next, addr != end);
return 1;
}
#else
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
return 0;
}
#endif /* CONFIG_ARCH_HAS_HUGEPD */
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)