diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/gup.c | 54 | ||||
-rw-r--r-- | mm/huge_memory.c | 6 | ||||
-rw-r--r-- | mm/hugetlb.c | 10 | ||||
-rw-r--r-- | mm/internal.h | 10 | ||||
-rw-r--r-- | mm/madvise.c | 17 | ||||
-rw-r--r-- | mm/memory-failure.c | 18 | ||||
-rw-r--r-- | mm/memory.c | 4 | ||||
-rw-r--r-- | mm/page_owner.c | 190 | ||||
-rw-r--r-- | mm/shmem.c | 6 | ||||
-rw-r--r-- | mm/vmalloc.c | 76 |
10 files changed, 225 insertions, 166 deletions
@@ -1206,6 +1206,22 @@ static long __get_user_pages(struct mm_struct *mm, /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { + /* + * MADV_POPULATE_(READ|WRITE) wants to handle VMA + * lookups+error reporting differently. + */ + if (gup_flags & FOLL_MADV_POPULATE) { + vma = vma_lookup(mm, start); + if (!vma) { + ret = -ENOMEM; + goto out; + } + if (check_vma_flags(vma, gup_flags)) { + ret = -EINVAL; + goto out; + } + goto retry; + } vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, @@ -1685,35 +1701,35 @@ long populate_vma_page_range(struct vm_area_struct *vma, } /* - * faultin_vma_page_range() - populate (prefault) page tables inside the - * given VMA range readable/writable + * faultin_page_range() - populate (prefault) page tables inside the + * given range readable/writable * * This takes care of mlocking the pages, too, if VM_LOCKED is set. * - * @vma: target vma + * @mm: the mm to populate page tables in * @start: start address * @end: end address * @write: whether to prefault readable or writable * @locked: whether the mmap_lock is still held * - * Returns either number of processed pages in the vma, or a negative error - * code on error (see __get_user_pages()). + * Returns either number of processed pages in the MM, or a negative error + * code on error (see __get_user_pages()). Note that this function reports + * errors related to VMAs, such as incompatible mappings, as expected by + * MADV_POPULATE_(READ|WRITE). * - * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and - * covered by the VMA. If it's released, *@locked will be set to 0. + * The range must be page-aligned. + * + * mm->mmap_lock must be held. If it's released, *@locked will be set to 0. 
*/ -long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, bool write, int *locked) +long faultin_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool write, int *locked) { - struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* @@ -1725,19 +1741,13 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, * a poisoned page. * !FOLL_FORCE: Require proper access permissions. */ - gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE; + gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE | + FOLL_MADV_POPULATE; if (write) gup_flags |= FOLL_WRITE; - /* - * We want to report -EINVAL instead of -EFAULT for any permission - * problems or incompatible mappings. - */ - if (check_vma_flags(vma, gup_flags)) - return -EINVAL; - - ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, locked); + ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked, + gup_flags); lru_add_drain(); return ret; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9859aa4f7553..89f58c7603b2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2259,9 +2259,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm goto unlock_ptls; } - folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); - src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); /* Folio got pinned from under us. Put it back and fail the move. 
*/ if (folio_maybe_dma_pinned(src_folio)) { @@ -2270,6 +2267,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm goto unlock_ptls; } + folio_move_anon_rmap(src_folio, dst_vma); + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 23ef240ba48a..31d00eee028f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7044,9 +7044,13 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(is_pte_marker(pte))) { - /* No other markers apply for now. */ - WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); - if (uffd_wp_resolve) + /* + * Do nothing on a poison marker; page is + * corrupted, permissions do not apply. Here + * pte_marker_uffd_wp()==true implies !poison + * because they're mutually exclusive. + */ + if (pte_marker_uffd_wp(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). 
*/ huge_pte_clear(mm, address, ptep, psize); } else if (!huge_pte_none(pte)) { diff --git a/mm/internal.h b/mm/internal.h index 7e486f2c502c..07ad2675a88b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -686,9 +686,8 @@ struct anon_vma *folio_anon_vma(struct folio *folio); void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); -extern long faultin_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - bool write, int *locked); +extern long faultin_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); @@ -1127,10 +1126,13 @@ enum { FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, + /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ + FOLL_MADV_POPULATE = 1 << 22, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ - FOLL_FAST_ONLY | FOLL_UNLOCKABLE) + FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ + FOLL_MADV_POPULATE) /* * Indicates for which pages that are write-protected in the page table, diff --git a/mm/madvise.c b/mm/madvise.c index 44a498c94158..1a073fcc4c0c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -908,27 +908,14 @@ static long madvise_populate(struct vm_area_struct *vma, { const bool write = behavior == MADV_POPULATE_WRITE; struct mm_struct *mm = vma->vm_mm; - unsigned long tmp_end; int locked = 1; long pages; *prev = vma; while (start < end) { - /* - * We might have temporarily dropped the lock. For example, - * our VMA might have been split. - */ - if (!vma || start >= vma->vm_end) { - vma = vma_lookup(mm, start); - if (!vma) - return -ENOMEM; - } - - tmp_end = min_t(unsigned long, end, vma->vm_end); /* Populate (prefault) page tables readable/writable. 
*/ - pages = faultin_vma_page_range(vma, start, tmp_end, write, - &locked); + pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; @@ -949,7 +936,7 @@ static long madvise_populate(struct vm_area_struct *vma, pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; - case -ENOMEM: + case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9349948f1abf..9e62a00b46dd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -154,11 +154,23 @@ static int __page_handle_poison(struct page *page) { int ret; - zone_pcp_disable(page_zone(page)); + /* + * zone_pcp_disable() can't be used here. It will + * hold pcp_batch_high_lock and dissolve_free_huge_page() might hold + * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap + * optimization is enabled. This will break current lock dependency + * chain and leads to deadlock. + * Disabling pcp before dissolving the page was a deterministic + * approach because we made sure that those pages cannot end up in any + * PCP list. Draining PCP lists expels those pages to the buddy system, + * but nothing guarantees that those pages do not get back to a PCP + * queue if we need to refill those. + */ ret = dissolve_free_huge_page(page); - if (!ret) + if (!ret) { + drain_all_pages(page_zone(page)); ret = take_page_off_buddy(page); - zone_pcp_enable(page_zone(page)); + } return ret; } diff --git a/mm/memory.c b/mm/memory.c index 904f70b99498..d2155ced45f8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5973,6 +5973,10 @@ int follow_phys(struct vm_area_struct *vma, goto out; pte = ptep_get(ptep); + /* Never return PFNs of anon folios in COW mappings. 
*/ + if (vm_normal_folio(vma, address, pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; diff --git a/mm/page_owner.c b/mm/page_owner.c index d17d1351ec84..742f432e5bf0 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -118,7 +118,6 @@ static __init void init_page_owner(void) register_dummy_stack(); register_failure_stack(); register_early_stack(); - static_branch_enable(&page_owner_inited); init_early_allocated_pages(); /* Initialize dummy and failure stacks and link them to stack_list */ dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle); @@ -129,6 +128,7 @@ static __init void init_page_owner(void) refcount_set(&failure_stack.stack_record->count, 1); dummy_stack.next = &failure_stack; stack_list = &dummy_stack; + static_branch_enable(&page_owner_inited); } struct page_ext_operations page_owner_ops = { @@ -196,7 +196,8 @@ static void add_stack_record_to_list(struct stack_record *stack_record, spin_unlock_irqrestore(&stack_list_lock, flags); } -static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) +static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask, + int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); @@ -217,20 +218,74 @@ static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) /* Add the new stack_record to our list */ add_stack_record_to_list(stack_record, gfp_mask); } - refcount_inc(&stack_record->count); + refcount_add(nr_base_pages, &stack_record->count); } -static void dec_stack_record_count(depot_stack_handle_t handle) +static void dec_stack_record_count(depot_stack_handle_t handle, + int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); - if (stack_record) - refcount_dec(&stack_record->count); + if (!stack_record) + return; + + if (refcount_sub_and_test(nr_base_pages, &stack_record->count)) + pr_warn("%s: refcount went to 0 for %u 
handle\n", __func__, + handle); } -void __reset_page_owner(struct page *page, unsigned short order) +static inline void __update_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned short order, + gfp_t gfp_mask, + short last_migrate_reason, u64 ts_nsec, + pid_t pid, pid_t tgid, char *comm) { int i; + struct page_owner *page_owner; + + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + page_owner->handle = handle; + page_owner->order = order; + page_owner->gfp_mask = gfp_mask; + page_owner->last_migrate_reason = last_migrate_reason; + page_owner->pid = pid; + page_owner->tgid = tgid; + page_owner->ts_nsec = ts_nsec; + strscpy(page_owner->comm, comm, + sizeof(page_owner->comm)); + __set_bit(PAGE_EXT_OWNER, &page_ext->flags); + __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_ext = page_ext_next(page_ext); + } +} + +static inline void __update_page_owner_free_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned short order, + pid_t pid, pid_t tgid, + u64 free_ts_nsec) +{ + int i; + struct page_owner *page_owner; + + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + /* Only __reset_page_owner() wants to clear the bit */ + if (handle) { + __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_owner->free_handle = handle; + } + page_owner->free_ts_nsec = free_ts_nsec; + page_owner->free_pid = current->pid; + page_owner->free_tgid = current->tgid; + page_ext = page_ext_next(page_ext); + } +} + +void __reset_page_owner(struct page *page, unsigned short order) +{ struct page_ext *page_ext; depot_stack_handle_t handle; depot_stack_handle_t alloc_handle; @@ -245,16 +300,10 @@ void __reset_page_owner(struct page *page, unsigned short order) alloc_handle = page_owner->handle; handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); - for (i = 0; i < (1 << order); i++) { - __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - 
page_owner->free_handle = handle; - page_owner->free_ts_nsec = free_ts_nsec; - page_owner->free_pid = current->pid; - page_owner->free_tgid = current->tgid; - page_ext = page_ext_next(page_ext); - page_owner = get_page_owner(page_ext); - } + __update_page_owner_free_handle(page_ext, handle, order, current->pid, + current->tgid, free_ts_nsec); page_ext_put(page_ext); + if (alloc_handle != early_handle) /* * early_handle is being set as a handle for all those @@ -263,39 +312,14 @@ void __reset_page_owner(struct page *page, unsigned short order) * the machinery is not ready yet, we cannot decrement * their refcount either. */ - dec_stack_record_count(alloc_handle); -} - -static inline void __set_page_owner_handle(struct page_ext *page_ext, - depot_stack_handle_t handle, - unsigned short order, gfp_t gfp_mask) -{ - struct page_owner *page_owner; - int i; - u64 ts_nsec = local_clock(); - - for (i = 0; i < (1 << order); i++) { - page_owner = get_page_owner(page_ext); - page_owner->handle = handle; - page_owner->order = order; - page_owner->gfp_mask = gfp_mask; - page_owner->last_migrate_reason = -1; - page_owner->pid = current->pid; - page_owner->tgid = current->tgid; - page_owner->ts_nsec = ts_nsec; - strscpy(page_owner->comm, current->comm, - sizeof(page_owner->comm)); - __set_bit(PAGE_EXT_OWNER, &page_ext->flags); - __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - - page_ext = page_ext_next(page_ext); - } + dec_stack_record_count(alloc_handle, 1 << order); } noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { struct page_ext *page_ext; + u64 ts_nsec = local_clock(); depot_stack_handle_t handle; handle = save_stack(gfp_mask); @@ -303,9 +327,11 @@ noinline void __set_page_owner(struct page *page, unsigned short order, page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; - __set_page_owner_handle(page_ext, handle, order, gfp_mask); + __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1, + current->pid, 
current->tgid, ts_nsec, + current->comm); page_ext_put(page_ext); - inc_stack_record_count(handle, gfp_mask); + inc_stack_record_count(handle, gfp_mask, 1 << order); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -340,9 +366,12 @@ void __split_page_owner(struct page *page, int old_order, int new_order) void __folio_copy_owner(struct folio *newfolio, struct folio *old) { + int i; struct page_ext *old_ext; struct page_ext *new_ext; - struct page_owner *old_page_owner, *new_page_owner; + struct page_owner *old_page_owner; + struct page_owner *new_page_owner; + depot_stack_handle_t migrate_handle; old_ext = page_ext_get(&old->page); if (unlikely(!old_ext)) @@ -356,30 +385,32 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) old_page_owner = get_page_owner(old_ext); new_page_owner = get_page_owner(new_ext); - new_page_owner->order = old_page_owner->order; - new_page_owner->gfp_mask = old_page_owner->gfp_mask; - new_page_owner->last_migrate_reason = - old_page_owner->last_migrate_reason; - new_page_owner->handle = old_page_owner->handle; - new_page_owner->pid = old_page_owner->pid; - new_page_owner->tgid = old_page_owner->tgid; - new_page_owner->free_pid = old_page_owner->free_pid; - new_page_owner->free_tgid = old_page_owner->free_tgid; - new_page_owner->ts_nsec = old_page_owner->ts_nsec; - new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; - strcpy(new_page_owner->comm, old_page_owner->comm); - + migrate_handle = new_page_owner->handle; + __update_page_owner_handle(new_ext, old_page_owner->handle, + old_page_owner->order, old_page_owner->gfp_mask, + old_page_owner->last_migrate_reason, + old_page_owner->ts_nsec, old_page_owner->pid, + old_page_owner->tgid, old_page_owner->comm); + /* + * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio + * will be freed after migration. Keep them until then as they may be + * useful. 
+ */ + __update_page_owner_free_handle(new_ext, 0, old_page_owner->order, + old_page_owner->free_pid, + old_page_owner->free_tgid, + old_page_owner->free_ts_nsec); /* - * We don't clear the bit on the old folio as it's going to be freed - * after migration. Until then, the info can be useful in case of - * a bug, and the overall stats will be off a bit only temporarily. - * Also, migrate_misplaced_transhuge_page() can still fail the - * migration and then we want the old folio to retain the info. But - * in that case we also don't need to explicitly clear the info from - * the new page, which will be freed. + * We linked the original stack to the new folio, we need to do the same + * for the new one and the old folio otherwise there will be an imbalance + * when subtracting those pages from the stack. */ - __set_bit(PAGE_EXT_OWNER, &new_ext->flags); - __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + for (i = 0; i < (1 << new_page_owner->order); i++) { + old_page_owner->handle = migrate_handle; + old_ext = page_ext_next(old_ext); + old_page_owner = get_page_owner(old_ext); + } + page_ext_put(new_ext); page_ext_put(old_ext); } @@ -787,8 +818,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) goto ext_put_continue; /* Found early allocated page */ - __set_page_owner_handle(page_ext, early_handle, - 0, 0); + __update_page_owner_handle(page_ext, early_handle, 0, 0, + -1, local_clock(), current->pid, + current->tgid, current->comm); count++; ext_put_continue: page_ext_put(page_ext); @@ -840,13 +872,11 @@ static void *stack_start(struct seq_file *m, loff_t *ppos) * value of stack_list. 
*/ stack = smp_load_acquire(&stack_list); + m->private = stack; } else { stack = m->private; - stack = stack->next; } - m->private = stack; - return stack; } @@ -861,11 +891,11 @@ static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) return stack; } -static unsigned long page_owner_stack_threshold; +static unsigned long page_owner_pages_threshold; static int stack_print(struct seq_file *m, void *v) { - int i, stack_count; + int i, nr_base_pages; struct stack *stack = v; unsigned long *entries; unsigned long nr_entries; @@ -876,14 +906,14 @@ static int stack_print(struct seq_file *m, void *v) nr_entries = stack_record->size; entries = stack_record->entries; - stack_count = refcount_read(&stack_record->count) - 1; + nr_base_pages = refcount_read(&stack_record->count) - 1; - if (stack_count < 1 || stack_count < page_owner_stack_threshold) + if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold) return 0; for (i = 0; i < nr_entries; i++) seq_printf(m, " %pS\n", (void *)entries[i]); - seq_printf(m, "stack_count: %d\n\n", stack_count); + seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); return 0; } @@ -913,13 +943,13 @@ static const struct file_operations page_owner_stack_operations = { static int page_owner_threshold_get(void *data, u64 *val) { - *val = READ_ONCE(page_owner_stack_threshold); + *val = READ_ONCE(page_owner_pages_threshold); return 0; } static int page_owner_threshold_set(void *data, u64 val) { - WRITE_ONCE(page_owner_stack_threshold, val); + WRITE_ONCE(page_owner_pages_threshold, val); return 0; } diff --git a/mm/shmem.c b/mm/shmem.c index 0aad0d9a621b..94ab99b6b574 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -748,12 +748,6 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) -{ - return false; -} - static unsigned long shmem_unused_huge_shrink(struct 
shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 22aa63f4ef63..68fa001648cc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -989,6 +989,27 @@ unsigned long vmalloc_nr_pages(void) return atomic_long_read(&nr_vmalloc_pages); } +static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + + addr = (unsigned long)kasan_reset_tag((void *)addr); + + while (n) { + struct vmap_area *va; + + va = rb_entry(n, struct vmap_area, rb_node); + if (addr < va->va_start) + n = n->rb_left; + else if (addr >= va->va_end) + n = n->rb_right; + else + return va; + } + + return NULL; +} + /* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area * __find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) @@ -1025,47 +1046,39 @@ __find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root) static struct vmap_node * find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va) { - struct vmap_node *vn, *va_node = NULL; - struct vmap_area *va_lowest; + unsigned long va_start_lowest; + struct vmap_node *vn; int i; - for (i = 0; i < nr_vmap_nodes; i++) { +repeat: + for (i = 0, va_start_lowest = 0; i < nr_vmap_nodes; i++) { vn = &vmap_nodes[i]; spin_lock(&vn->busy.lock); - va_lowest = __find_vmap_area_exceed_addr(addr, &vn->busy.root); - if (va_lowest) { - if (!va_node || va_lowest->va_start < (*va)->va_start) { - if (va_node) - spin_unlock(&va_node->busy.lock); - - *va = va_lowest; - va_node = vn; - continue; - } - } + *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root); + + if (*va) + if (!va_start_lowest || (*va)->va_start < va_start_lowest) + va_start_lowest = (*va)->va_start; spin_unlock(&vn->busy.lock); } - return va_node; -} - -static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) -{ - struct rb_node *n = root->rb_node; + /* + * Check if 
found VA exists, it might have gone away. In this case we + * repeat the search because a VA has been removed concurrently and we + * need to proceed to the next one, which is a rare case. + */ + if (va_start_lowest) { + vn = addr_to_node(va_start_lowest); - addr = (unsigned long)kasan_reset_tag((void *)addr); + spin_lock(&vn->busy.lock); + *va = __find_vmap_area(va_start_lowest, &vn->busy.root); - while (n) { - struct vmap_area *va; + if (*va) + return vn; - va = rb_entry(n, struct vmap_area, rb_node); - if (addr < va->va_start) - n = n->rb_left; - else if (addr >= va->va_end) - n = n->rb_right; - else - return va; + spin_unlock(&vn->busy.lock); + goto repeat; } return NULL; @@ -2343,6 +2356,9 @@ struct vmap_area *find_vmap_area(unsigned long addr) struct vmap_area *va; int i, j; + if (unlikely(!vmap_initialized)) + return NULL; + /* * An addr_to_node_id(addr) converts an address to a node index * where a VA is located. If VA spans several zones and passed |