author     Linus Torvalds <[email protected]>  2024-11-23 09:58:07 -0800
committer  Linus Torvalds <[email protected]>  2024-11-23 09:58:07 -0800
commit     5c00ff742bf5caf85f60e1c73999f99376fb865d (patch)
tree       fa484e83c27af79f1c0511e7e0673507461c9379 /mm/mmap.c
parent     228a1157fb9fec47eb135b51c0202b574e079ebf (diff)
parent     2532e6c74a67e65b95f310946e0c0e0a41b3a34b (diff)
Merge tag 'mm-stable-2024-11-18-19-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:

 - The series "zram: optimal post-processing target selection" from Sergey Senozhatsky improves zram's post-processing selection algorithm. This leads to improved memory savings.

 - Wei Yang has gone to town on the mapletree code, contributing several series which clean up the implementation:
	- "refine mas_mab_cp()"
	- "Reduce the space to be cleared for maple_big_node"
	- "maple_tree: simplify mas_push_node()"
	- "Following cleanup after introduce mas_wr_store_type()"
	- "refine storing null"

 - The series "selftests/mm: hugetlb_fault_after_madv improvements" from David Hildenbrand fixes this selftest for s390.

 - The series "introduce pte_offset_map_{ro|rw}_nolock()" from Qi Zheng implements some rationalizations and cleanups in the page mapping code.

 - The series "mm: optimize shadow entries removal" from Shakeel Butt optimizes the file truncation code by speeding up the handling of shadow entries.

 - The series "Remove PageKsm()" from Matthew Wilcox completes the migration of this flag over to being a folio-based flag.

 - The series "Unify hugetlb into arch_get_unmapped_area functions" from Oscar Salvador implements a bunch of consolidations and cleanups in the hugetlb code.

 - The series "Do not shatter hugezeropage on wp-fault" from Dev Jain takes away the wp-fault time practice of turning a huge zero page into small pages. Instead we replace the whole thing with a THP. More consistent, cleaner, and it potentially saves a large number of pagefaults.

 - The series "percpu: Add a test case and fix for clang" from Andy Shevchenko enhances and fixes the kernel's built-in percpu test code.

 - The series "mm/mremap: Remove extra vma tree walk" from Liam Howlett optimizes mremap() by avoiding doing things which we didn't need to do.

 - The series "Improve the tmpfs large folio read performance" from Baolin Wang teaches tmpfs to copy data into userspace at the folio size rather than as individual pages. A 20% speedup was observed.

 - The series "mm/damon/vaddr: Fix issue in damon_va_evenly_split_region()" from Zheng Yejian fixes DAMON splitting.

 - The series "memcg-v1: fully deprecate charge moving" from Shakeel Butt removes the long-deprecated memcg v1 charge moving feature.

 - The series "fix error handling in mmap_region() and refactor" from Lorenzo Stoakes cleans up some of the mmap() error handling and addresses some potential performance issues.

 - The series "x86/module: use large ROX pages for text allocations" from Mike Rapoport teaches x86 to use large pages for read-only-execute module text.

 - The series "page allocation tag compression" from Suren Baghdasaryan is follow-on maintenance work for the new page allocation profiling feature.

 - The series "page->index removals in mm" from Matthew Wilcox removes most references to page->index in mm/. A slow march towards shrinking struct page.

 - The series "damon/{self,kunit}tests: minor fixups for DAMON debugfs interface tests" from Andrew Paniakin performs maintenance work for DAMON's self testing code.

 - The series "mm: zswap swap-out of large folios" from Kanchana Sridhar improves zswap's batching of compression and decompression. It is a step along the way towards using Intel IAA hardware acceleration for this zswap operation.

 - The series "kasan: migrate the last module test to kunit" from Sabyrzhan Tasbolatov completes the migration of the KASAN built-in tests over to the KUnit framework.
- The series "implement lightweight guard pages" from Lorenzo Stoakes permits userapace to place fault-generating guard pages within a single VMA, rather than requiring that multiple VMAs be created for this. Improved efficiencies for userspace memory allocators are expected. - The series "memcg: tracepoint for flushing stats" from JP Kobryn uses tracepoints to provide increased visibility into memcg stats flushing activity. - The series "zram: IDLE flag handling fixes" from Sergey Senozhatsky fixes a zram buglet which potentially affected performance. - The series "mm: add more kernel parameters to control mTHP" from MaĆ­ra Canal enhances our ability to control/configuremultisize THP from the kernel boot command line. - The series "kasan: few improvements on kunit tests" from Sabyrzhan Tasbolatov has a couple of fixups for the KASAN KUnit tests. - The series "mm/list_lru: Split list_lru lock into per-cgroup scope" from Kairui Song optimizes list_lru memory utilization when lockdep is enabled. * tag 'mm-stable-2024-11-18-19-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (215 commits) cma: enforce non-zero pageblock_order during cma_init_reserved_mem() mm/kfence: add a new kunit test test_use_after_free_read_nofault() zram: fix NULL pointer in comp_algorithm_show() memcg/hugetlb: add hugeTLB counters to memcg vmstat: call fold_vm_zone_numa_events() before show per zone NUMA event mm: mmap_lock: check trace_mmap_lock_$type_enabled() instead of regcount zram: ZRAM_DEF_COMP should depend on ZRAM MAINTAINERS/MEMORY MANAGEMENT: add document files for mm Docs/mm/damon: recommend academic papers to read and/or cite mm: define general function pXd_init() kmemleak: iommu/iova: fix transient kmemleak false positive mm/list_lru: simplify the list_lru walk callback function mm/list_lru: split the lock to per-cgroup scope mm/list_lru: simplify reparenting and initial allocation mm/list_lru: code clean up for reparenting mm/list_lru: don't export list_lru_add mm/list_lru: don't pass unnecessary key parameters kasan: add kunit tests for kmalloc_track_caller, kmalloc_node_track_caller kasan: change kasan_atomics kunit test as KUNIT_CASE_SLOW kasan: use EXPORT_SYMBOL_IF_KUNIT to export symbols ...
Diffstat (limited to 'mm/mmap.c')
-rw-r--r--  mm/mmap.c  276
1 file changed, 10 insertions(+), 266 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 79d541f1502b..386429f7db5a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -577,22 +577,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
-/*
- * We account for memory if it's a private writeable mapping,
- * not hugepages and VM_NORESERVE wasn't set.
- */
-static inline bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
-{
- /*
- * hugetlb has its own accounting separate from the core VM
- * VM_HUGETLB may not be set yet so we cannot check for that flag.
- */
- if (file && is_file_hugepages(file))
- return false;
-
- return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
-}
-
/**
* unmapped_area() - Find an area between the low_limit and the high_limit with
* the correct alignment and offset, all from @info. Note: current->mm is used
@@ -776,6 +760,8 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr,
info.low_limit = mm->mmap_base;
info.high_limit = mmap_end;
info.start_gap = stack_guard_placement(vm_flags);
+ if (filp && is_file_hugepages(filp))
+ info.align_mask = huge_page_mask_align(filp);
return vm_unmapped_area(&info);
}
@@ -826,6 +812,8 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
info.low_limit = PAGE_SIZE;
info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
info.start_gap = stack_guard_placement(vm_flags);
+ if (filp && is_file_hugepages(filp))
+ info.align_mask = huge_page_mask_align(filp);
addr = vm_unmapped_area(&info);
/*
@@ -1051,6 +1039,8 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
+ mmap_assert_write_locked(mm);
+
/* Guard against exceeding limits of the address space. */
address &= PAGE_MASK;
if (address >= (TASK_SIZE & PAGE_MASK))
@@ -1086,11 +1076,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/* Lock the VMA before expanding to prevent concurrent page faults */
vma_start_write(vma);
- /*
- * vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_lock in read mode. We need the
- * anon_vma lock to serialize against concurrent expand_stacks.
- */
+ /* We update the anon VMA tree. */
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
@@ -1104,16 +1090,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
- /*
- * We only hold a shared mmap_lock lock here, so
- * we need to protect against concurrent vma
- * expansions. anon_vma_lock_write() doesn't
- * help here, as we don't guarantee that all
- * growable vmas in a mm share the same root
- * anon vma. So, we reuse mm->page_table_lock
- * to guard against concurrent vma expansions.
- */
- spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
@@ -1122,7 +1098,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/* Overwrite old entry in mtree. */
vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
- spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
@@ -1149,6 +1124,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
if (!(vma->vm_flags & VM_GROWSDOWN))
return -EFAULT;
+ mmap_assert_write_locked(mm);
+
address &= PAGE_MASK;
if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
return -EPERM;
@@ -1178,11 +1155,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
/* Lock the VMA before expanding to prevent concurrent page faults */
vma_start_write(vma);
- /*
- * vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_lock in read mode. We need the
- * anon_vma lock to serialize against concurrent expand_stacks.
- */
+ /* We update the anon VMA tree. */
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
@@ -1196,16 +1169,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
if (grow <= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
- /*
- * We only hold a shared mmap_lock lock here, so
- * we need to protect against concurrent vma
- * expansions. anon_vma_lock_write() doesn't
- * help here, as we don't guarantee that all
- * growable vmas in a mm share the same root
- * anon vma. So, we reuse mm->page_table_lock
- * to guard against concurrent vma expansions.
- */
- spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
@@ -1215,7 +1178,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
/* Overwrite old entry in mtree. */
vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
- spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
@@ -1358,224 +1320,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}
-static unsigned long __mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = NULL;
- pgoff_t pglen = PHYS_PFN(len);
- unsigned long charged = 0;
- struct vma_munmap_struct vms;
- struct ma_state mas_detach;
- struct maple_tree mt_detach;
- unsigned long end = addr + len;
- int error;
- VMA_ITERATOR(vmi, mm, addr);
- VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff);
-
- vmg.file = file;
- /* Find the first overlapping VMA */
- vma = vma_find(&vmi, end);
- init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false);
- if (vma) {
- mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
- mt_on_stack(mt_detach);
- mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
- /* Prepare to unmap any existing mapping in the area */
- error = vms_gather_munmap_vmas(&vms, &mas_detach);
- if (error)
- goto gather_failed;
-
- vmg.next = vms.next;
- vmg.prev = vms.prev;
- vma = NULL;
- } else {
- vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev);
- }
-
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) {
- error = -ENOMEM;
- goto abort_munmap;
- }
-
- /*
- * Private writable mapping: check memory availability
- */
- if (accountable_mapping(file, vm_flags)) {
- charged = pglen;
- charged -= vms.nr_accounted;
- if (charged) {
- error = security_vm_enough_memory_mm(mm, charged);
- if (error)
- goto abort_munmap;
- }
-
- vms.nr_accounted = 0;
- vm_flags |= VM_ACCOUNT;
- vmg.flags = vm_flags;
- }
-
- /*
- * clear PTEs while the vma is still in the tree so that rmap
- * cannot race with the freeing later in the truncate scenario.
- * This is also needed for mmap_file(), which is why vm_ops
- * close function is called.
- */
- vms_clean_up_area(&vms, &mas_detach);
- vma = vma_merge_new_range(&vmg);
- if (vma)
- goto expanded;
- /*
- * Determine the object being mapped and call the appropriate
- * specific mapper. the address has already been validated, but
- * not unmapped, but the maps are removed from the list.
- */
- vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
- goto unacct_error;
- }
-
- vma_iter_config(&vmi, addr, end);
- vma_set_range(vma, addr, end, pgoff);
- vm_flags_init(vma, vm_flags);
- vma->vm_page_prot = vm_get_page_prot(vm_flags);
-
- if (vma_iter_prealloc(&vmi, vma)) {
- error = -ENOMEM;
- goto free_vma;
- }
-
- if (file) {
- vma->vm_file = get_file(file);
- error = mmap_file(file, vma);
- if (error)
- goto unmap_and_free_file_vma;
-
- /* Drivers cannot alter the address of the VMA. */
- WARN_ON_ONCE(addr != vma->vm_start);
- /*
- * Drivers should not permit writability when previously it was
- * disallowed.
- */
- VM_WARN_ON_ONCE(vm_flags != vma->vm_flags &&
- !(vm_flags & VM_MAYWRITE) &&
- (vma->vm_flags & VM_MAYWRITE));
-
- vma_iter_config(&vmi, addr, end);
- /*
- * If vm_flags changed after mmap_file(), we should try merge
- * vma again as we may succeed this time.
- */
- if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
- struct vm_area_struct *merge;
-
- vmg.flags = vma->vm_flags;
- /* If this fails, state is reset ready for a reattempt. */
- merge = vma_merge_new_range(&vmg);
-
- if (merge) {
- /*
- * ->mmap() can change vma->vm_file and fput
- * the original file. So fput the vma->vm_file
- * here or we would add an extra fput for file
- * and cause general protection fault
- * ultimately.
- */
- fput(vma->vm_file);
- vm_area_free(vma);
- vma = merge;
- /* Update vm_flags to pick up the change. */
- vm_flags = vma->vm_flags;
- goto file_expanded;
- }
- vma_iter_config(&vmi, addr, end);
- }
-
- vm_flags = vma->vm_flags;
- } else if (vm_flags & VM_SHARED) {
- error = shmem_zero_setup(vma);
- if (error)
- goto free_iter_vma;
- } else {
- vma_set_anonymous(vma);
- }
-
-#ifdef CONFIG_SPARC64
- /* TODO: Fix SPARC ADI! */
- WARN_ON_ONCE(!arch_validate_flags(vm_flags));
-#endif
-
- /* Lock the VMA since it is modified after insertion into VMA tree */
- vma_start_write(vma);
- vma_iter_store(&vmi, vma);
- mm->map_count++;
- vma_link_file(vma);
-
- /*
- * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
- * call covers the non-merge case.
- */
- khugepaged_enter_vma(vma, vma->vm_flags);
-
-file_expanded:
- file = vma->vm_file;
- ksm_add_vma(vma);
-expanded:
- perf_event_mmap(vma);
-
- /* Unmap any existing mapping in the area */
- vms_complete_munmap_vmas(&vms, &mas_detach);
-
- vm_stat_account(mm, vm_flags, pglen);
- if (vm_flags & VM_LOCKED) {
- if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm))
- vm_flags_clear(vma, VM_LOCKED_MASK);
- else
- mm->locked_vm += pglen;
- }
-
- if (file)
- uprobe_mmap(vma);
-
- /*
- * New (or expanded) vma always get soft dirty status.
- * Otherwise user-space soft-dirty page tracker won't
- * be able to distinguish situation when vma area unmapped,
- * then new mapped in-place (which must be aimed as
- * a completely new data area).
- */
- vm_flags_set(vma, VM_SOFTDIRTY);
-
- vma_set_page_prot(vma);
-
- return addr;
-
-unmap_and_free_file_vma:
- fput(vma->vm_file);
- vma->vm_file = NULL;
-
- vma_iter_set(&vmi, vma->vm_end);
- /* Undo any partial mapping done by a device driver. */
- unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
-free_iter_vma:
- vma_iter_free(&vmi);
-free_vma:
- vm_area_free(vma);
-unacct_error:
- if (charged)
- vm_unacct_memory(charged);
-
-abort_munmap:
- vms_abort_munmap_vmas(&vms, &mas_detach);
-gather_failed:
- return error;
-}
-
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)