aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Hubbard <[email protected]>2020-01-30 22:12:17 -0800
committerLinus Torvalds <[email protected]>2020-01-31 10:30:37 -0800
commita43e982082c24c2f5c0b139daac9657ac352eed3 (patch)
tree01996bef4be9798b0ca2b467cdfed1a3770eea0b
parentbe9d30458913f70bed1801abb5ee6f7b7d6f4b19 (diff)
mm/gup: factor out duplicate code from four routines
Patch series "mm/gup: prereqs to track dma-pinned pages: FOLL_PIN", v12. Overview: This is a prerequisite to solving the problem of proper interactions between file-backed pages, and [R]DMA activities, as discussed in [1], [2], [3], and in a remarkable number of email threads since about 2017. :) A new internal gup flag, FOLL_PIN is introduced, and thoroughly documented in the last patch's Documentation/vm/pin_user_pages.rst. I believe that this will provide a good starting point for doing the layout lease work that Ira Weiny has been working on. That's because these new wrapper functions provide a clean, constrained, systematically named set of functionality that, again, is required in order to even know if a page is "dma-pinned". In contrast to earlier approaches, the page tracking can be incrementally applied to the kernel call sites that, until now, have been simply calling get_user_pages() ("gup"). In other words, opt-in by changing from this: get_user_pages() (sets FOLL_GET) put_page() to this: pin_user_pages() (sets FOLL_PIN) unpin_user_page() Testing: * I've done some overall kernel testing (LTP, and a few other goodies), and some directed testing to exercise some of the changes. And as you can see, gup_benchmark is enhanced to exercise this. Basically, I've been able to runtime test the core get_user_pages() and pin_user_pages() and related routines, but not so much on several of the call sites--but those are generally just a couple of lines changed, each. Not much of the kernel is actually using this, which on one hand reduces risk quite a lot. But on the other hand, testing coverage is low. So I'd love it if, in particular, the Infiniband and PowerPC folks could do a smoke test of this series for me. Runtime testing for the call sites so far is pretty light: * io_uring: Some directed tests from liburing exercise this, and they pass. * process_vm_access.c: A small directed test passes. 
* gup_benchmark: the enhanced version hits the new gup.c code, and passes. * infiniband: Ran rdma-core tests: rdma-core/build/bin/run_tests.py * VFIO: compiles (I'm vowing to set up a run time test soon, but it's not ready just yet) * powerpc: it compiles... * drm/via: compiles... * goldfish: compiles... * net/xdp: compiles... * media/v4l2: compiles... [1] Some slow progress on get_user_pages() (Apr 2, 2019): https://lwn.net/Articles/784574/ [2] DMA and get_user_pages() (LPC: Dec 12, 2018): https://lwn.net/Articles/774411/ [3] The trouble with get_user_pages() (Apr 30, 2018): https://lwn.net/Articles/753027/ This patch (of 22): There are four locations in gup.c that have a fair amount of code duplication. This means that changing one requires making the same changes in four places, not to mention reading the same code four times, and wondering if there are subtle differences. Factor out the common code into static functions, thus reducing the overall line count and the code's complexity. Also, take the opportunity to slightly improve the efficiency of the error cases, by doing a mass subtraction of the refcount, surrounded by get_page()/put_page(). Also, further simplify (slightly), by waiting until the successful end of each routine, to increment *nr. Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: John Hubbard <[email protected]> Reviewed-by: Christoph Hellwig <[email protected]> Reviewed-by: Jérôme Glisse <[email protected]> Reviewed-by: Jan Kara <[email protected]> Cc: Kirill A. 
Shutemov <[email protected]> Cc: Ira Weiny <[email protected]> Cc: Christoph Hellwig <[email protected]> Cc: Aneesh Kumar K.V <[email protected]> Cc: Alex Williamson <[email protected]> Cc: Björn Töpel <[email protected]> Cc: Daniel Vetter <[email protected]> Cc: Dan Williams <[email protected]> Cc: Hans Verkuil <[email protected]> Cc: Jason Gunthorpe <[email protected]> Cc: Jason Gunthorpe <[email protected]> Cc: Jens Axboe <[email protected]> Cc: Jonathan Corbet <[email protected]> Cc: Leon Romanovsky <[email protected]> Cc: Mauro Carvalho Chehab <[email protected]> Cc: Mike Rapoport <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
-rw-r--r--mm/gup.c95
1 files changed, 40 insertions, 55 deletions
diff --git a/mm/gup.c b/mm/gup.c
index 6cb71800fb5c..706f85b84f47 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1978,6 +1978,29 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
}
#endif
+static int record_subpages(struct page *page, unsigned long addr,
+ unsigned long end, struct page **pages)
+{
+ int nr;
+
+ for (nr = 0; addr != end; addr += PAGE_SIZE)
+ pages[nr++] = page++;
+
+ return nr;
+}
+
+static void put_compound_head(struct page *page, int refs)
+{
+ VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
+ /*
+ * Calling put_page() for each ref is unnecessarily slow. Only the last
+ * ref needs a put_page().
+ */
+ if (refs > 1)
+ page_ref_sub(page, refs - 1);
+ put_page(page);
+}
+
#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
@@ -2007,32 +2030,20 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- refs = 0;
head = pte_page(pte);
-
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
- do {
- VM_BUG_ON(compound_head(page) != head);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(head, refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- /* Could be optimized better */
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2079,28 +2090,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
}
- refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pmd_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2120,28 +2122,19 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
}
- refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pud_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2157,28 +2150,20 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
- refs = 0;
+
page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pgd_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}