diff options
Diffstat (limited to 'mm/pagewalk.c')
-rw-r--r-- | mm/pagewalk.c | 262 |
1 files changed, 191 insertions, 71 deletions
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 461ea3bbd8d9..e478777c86e1 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,9 +3,14 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/hugetlb.h> +#include <linux/mmu_context.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <asm/tlbflush.h> + +#include "internal.h" + /* * We want to know the real level where a entry is located ignoring any * folding of levels which may be happening. For example if p4d is folded then @@ -29,9 +34,23 @@ static int walk_pte_range_inner(pte_t *pte, unsigned long addr, int err = 0; for (;;) { - err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); - if (err) - break; + if (ops->install_pte && pte_none(ptep_get(pte))) { + pte_t new_pte; + + err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte, + walk); + if (err) + break; + + set_pte_at(walk->mm, addr, pte, new_pte); + /* Non-present before, so for arches that need it. */ + if (!WARN_ON_ONCE(walk->no_vma)) + update_mmu_cache(walk->vma, addr, pte); + } else { + err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); + if (err) + break; + } if (addr >= end - PAGE_SIZE) break; addr += PAGE_SIZE; @@ -81,6 +100,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pmd_t *pmd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; + bool has_handler = ops->pte_entry; + bool has_install = ops->install_pte; int err = 0; int depth = real_depth(3); @@ -89,11 +110,14 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, again: next = pmd_addr_end(addr, end); if (pmd_none(*pmd)) { - if (ops->pte_hole) + if (has_install) + err = __pte_alloc(walk->mm, pmd); + else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; - continue; + if (!has_install) + continue; } walk->action = ACTION_SUBTREE; @@ -109,18 +133,25 @@ again: if (walk->action == ACTION_AGAIN) goto again; - - /* - * Check this here so we only break down trans_huge - * pages when we _need_ to - */ - if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) || - walk->action == ACTION_CONTINUE || - !(ops->pte_entry)) + if (walk->action == ACTION_CONTINUE) continue; + if (!has_handler) { /* No handlers for lower page tables. */ + if (!has_install) + continue; /* Nothing to do. */ + /* + * We are ONLY installing, so avoid unnecessarily + * splitting a present huge page. + */ + if (pmd_present(*pmd) && + (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + continue; + } + if (walk->vma) split_huge_pmd(walk->vma, pmd, addr); + else if (pmd_leaf(*pmd) || !pmd_present(*pmd)) + continue; /* Nothing to do. */ err = walk_pte_range(pmd, addr, next, walk); if (err) @@ -140,6 +171,8 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pud_t *pud; unsigned long next; const struct mm_walk_ops *ops = walk->ops; + bool has_handler = ops->pmd_entry || ops->pte_entry; + bool has_install = ops->install_pte; int err = 0; int depth = real_depth(2); @@ -148,11 +181,14 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, again: next = pud_addr_end(addr, end); if (pud_none(*pud)) { - if (ops->pte_hole) + if (has_install) + err = __pmd_alloc(walk->mm, pud, addr); + else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; - continue; + if (!has_install) + continue; } walk->action = ACTION_SUBTREE; @@ -164,14 +200,26 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, if (walk->action == ACTION_AGAIN) goto again; - - if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) || - walk->action == ACTION_CONTINUE || - !(ops->pmd_entry || ops->pte_entry)) + if (walk->action == ACTION_CONTINUE) continue; + if (!has_handler) { /* No handlers for lower page tables. */ + if (!has_install) + continue; /* Nothing to do. */ + /* + * We are ONLY installing, so avoid unnecessarily + * splitting a present huge page. + */ + if (pud_present(*pud) && + (pud_trans_huge(*pud) || pud_devmap(*pud))) + continue; + } + if (walk->vma) split_huge_pud(walk->vma, pud, addr); + else if (pud_leaf(*pud) || !pud_present(*pud)) + continue; /* Nothing to do. */ + if (pud_none(*pud)) goto again; @@ -189,6 +237,8 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, p4d_t *p4d; unsigned long next; const struct mm_walk_ops *ops = walk->ops; + bool has_handler = ops->pud_entry || ops->pmd_entry || ops->pte_entry; + bool has_install = ops->install_pte; int err = 0; int depth = real_depth(1); @@ -196,18 +246,21 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) { - if (ops->pte_hole) + if (has_install) + err = __pud_alloc(walk->mm, p4d, addr); + else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; - continue; + if (!has_install) + continue; } if (ops->p4d_entry) { err = ops->p4d_entry(p4d, addr, next, walk); if (err) break; } - if (ops->pud_entry || ops->pmd_entry || ops->pte_entry) + if (has_handler || has_install) err = walk_pud_range(p4d, addr, next, walk); if (err) break; @@ -222,6 +275,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, pgd_t *pgd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; + bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry || + ops->pte_entry; + bool has_install = ops->install_pte; int err = 0; if (walk->pgd) @@ -231,18 +287,21 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { - if (ops->pte_hole) + if (has_install) + err = __p4d_alloc(walk->mm, pgd, addr); + else if (ops->pte_hole) err = ops->pte_hole(addr, next, 0, walk); if (err) break; - continue; + if (!has_install) + continue; } if (ops->pgd_entry) { err = ops->pgd_entry(pgd, addr, next, walk); if (err) break; } - if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry) + if (has_handler || has_install) err = walk_p4d_range(pgd, addr, next, walk); if (err) break; @@ -334,6 +393,11 @@ static int __walk_page_range(unsigned long start, unsigned long end, int err = 0; struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; + bool is_hugetlb = is_vm_hugetlb_page(vma); + + /* We do not support hugetlb PTE installation. */ + if (ops->install_pte && is_hugetlb) + return -EINVAL; if (ops->pre_vma) { err = ops->pre_vma(start, end, walk); @@ -341,7 +405,7 @@ static int __walk_page_range(unsigned long start, unsigned long end, return err; } - if (is_vm_hugetlb_page(vma)) { + if (is_hugetlb) { if (ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else @@ -380,47 +444,14 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma, #endif } -/** - * walk_page_range - walk page table with caller specific callbacks - * @mm: mm_struct representing the target process of page table walk - * @start: start address of the virtual address range - * @end: end address of the virtual address range - * @ops: operation to call during the walk - * @private: private data for callbacks' usage - * - * Recursively walk the page table tree of the process represented by @mm - * within the virtual address range [@start, @end). During walking, we can do - * some caller-specific works for each entry, by setting up pmd_entry(), - * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these - * callbacks, the associated entries/pages are just ignored. - * The return values of these callbacks are commonly defined like below: - * - * - 0 : succeeded to handle the current entry, and if you don't reach the - * end address yet, continue to walk. - * - >0 : succeeded to handle the current entry, and return to the caller - * with caller specific value. - * - <0 : failed to handle the current entry, and return to the caller - * with error code. - * - * Before starting to walk page table, some callers want to check whether - * they really want to walk over the current vma, typically by checking - * its vm_flags. walk_page_test() and @ops->test_walk() are used for this - * purpose. - * - * If operations need to be staged before and committed after a vma is walked, - * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), - * since it is intended to handle commit-type operations, can't return any - * errors. - * - * struct mm_walk keeps current values of some common data like vma and pmd, - * which are useful for the access from callbacks. If you want to pass some - * caller-specific data to callbacks, @private should be helpful. +/* + * See the comment for walk_page_range(), this performs the heavy lifting of the + * operation, only sets no restrictions on how the walk proceeds. * - * Locking: - * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock, - * because these function traverse vma list and/or access to vma's data. + * We usually restrict the ability to install PTEs, but this functionality is + * available to internal memory management code and provided in mm/internal.h. */ -int walk_page_range(struct mm_struct *mm, unsigned long start, +int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { @@ -479,6 +510,80 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, return err; } +/* + * Determine if the walk operations specified are permitted to be used for a + * page table walk. + * + * This check is performed on all functions which are parameterised by walk + * operations and exposed in include/linux/pagewalk.h. + * + * Internal memory management code can use the walk_page_range_mm() function to + * be able to use all page walking operations. + */ +static bool check_ops_valid(const struct mm_walk_ops *ops) +{ + /* + * The installation of PTEs is solely under the control of memory + * management logic and subject to many subtle locking, security and + * cache considerations so we cannot permit other users to do so, and + * certainly not for exported symbols. + */ + if (ops->install_pte) + return false; + + return true; +} + +/** + * walk_page_range - walk page table with caller specific callbacks + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @private: private data for callbacks' usage + * + * Recursively walk the page table tree of the process represented by @mm + * within the virtual address range [@start, @end). During walking, we can do + * some caller-specific works for each entry, by setting up pmd_entry(), + * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these + * callbacks, the associated entries/pages are just ignored. + * The return values of these callbacks are commonly defined like below: + * + * - 0 : succeeded to handle the current entry, and if you don't reach the + * end address yet, continue to walk. + * - >0 : succeeded to handle the current entry, and return to the caller + * with caller specific value. + * - <0 : failed to handle the current entry, and return to the caller + * with error code. + * + * Before starting to walk page table, some callers want to check whether + * they really want to walk over the current vma, typically by checking + * its vm_flags. walk_page_test() and @ops->test_walk() are used for this + * purpose. + * + * If operations need to be staged before and committed after a vma is walked, + * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), + * since it is intended to handle commit-type operations, can't return any + * errors. + * + * struct mm_walk keeps current values of some common data like vma and pmd, + * which are useful for the access from callbacks. If you want to pass some + * caller-specific data to callbacks, @private should be helpful. + * + * Locking: + * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock, + * because these function traverse vma list and/or access to vma's data. + */ +int walk_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) +{ + if (!check_ops_valid(ops)) + return -EINVAL; + + return walk_page_range_mm(mm, start, end, ops, private); +} + /** * walk_page_range_novma - walk a range of pagetables not backed by a vma * @mm: mm_struct representing the target process of page table walk @@ -494,7 +599,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, * walking the kernel pages tables or page tables for firmware. * * Note: Be careful to walk the kernel pages tables, the caller may be need to - * take other effective approache (mmap lock may be insufficient) to prevent + * take other effective approaches (mmap lock may be insufficient) to prevent * the intermediate kernel page tables belonging to the specified address range * from being freed (e.g. memory hot-remove). */ @@ -513,6 +618,8 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, if (start >= end || !walk.mm) return -EINVAL; + if (!check_ops_valid(ops)) + return -EINVAL; /* * 1) For walking the user virtual address space: @@ -556,6 +663,8 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, return -EINVAL; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; + if (!check_ops_valid(ops)) + return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); @@ -574,6 +683,8 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, if (!walk.mm) return -EINVAL; + if (!check_ops_valid(ops)) + return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); @@ -623,6 +734,9 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, unsigned long start_addr, end_addr; int err = 0; + if (!check_ops_valid(ops)) + return -EINVAL; + lockdep_assert_held(&mapping->i_mmap_rwsem); vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, first_index + nr - 1) { @@ -744,7 +858,8 @@ struct folio *folio_walk_start(struct folio_walk *fw, pud = pudp_get(pudp); if (pud_none(pud)) goto not_found; - if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) { + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && + (!pud_present(pud) || pud_leaf(pud))) { ptl = pud_lock(vma->vm_mm, pudp); pud = pudp_get(pudp); @@ -753,6 +868,10 @@ struct folio *folio_walk_start(struct folio_walk *fw, fw->pudp = pudp; fw->pud = pud; + /* + * TODO: FW_MIGRATION support for PUD migration entries + * once there are relevant users. + */ if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) { spin_unlock(ptl); goto not_found; @@ -769,12 +888,13 @@ struct folio *folio_walk_start(struct folio_walk *fw, } pmd_table: - VM_WARN_ON_ONCE(pud_leaf(*pudp)); + VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud)); pmdp = pmd_offset(pudp, addr); pmd = pmdp_get_lockless(pmdp); if (pmd_none(pmd)) goto not_found; - if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) { + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && + (!pmd_present(pmd) || pmd_leaf(pmd))) { ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_get(pmdp); @@ -786,7 +906,7 @@ pmd_table: if (pmd_none(pmd)) { spin_unlock(ptl); goto not_found; - } else if (!pmd_leaf(pmd)) { + } else if (pmd_present(pmd) && !pmd_leaf(pmd)) { spin_unlock(ptl); goto pte_table; } else if (pmd_present(pmd)) { @@ -812,7 +932,7 @@ pmd_table: } pte_table: - VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp))); + VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd)); ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); if (!ptep) goto not_found; |