diff options
author | Linus Torvalds <[email protected]> | 2024-10-29 16:19:15 -1000 |
---|---|---|
committer | Linus Torvalds <[email protected]> | 2024-10-29 16:19:15 -1000 |
commit | 9251e3e93cf2892641539c184294838adedae415 (patch) | |
tree | a042e9f94229b4dc7db149b8d001f02d5840d828 | |
parent | d5b2ee0fe863519be5e1c277d22609b048b61a2a (diff) | |
parent | 01626a18230246efdcea322aa8f067e60ffe5ccd (diff) |
Merge tag 'mm-hotfixes-stable-2024-10-28-21-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull misc fixes from Andrew Morton:
"21 hotfixes. 13 are cc:stable. 13 are MM and 8 are non-MM.
No particular theme here - mainly singletons, a couple of doubletons.
Please see the changelogs"
* tag 'mm-hotfixes-stable-2024-10-28-21-50' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (21 commits)
mm: avoid unconditional one-tick sleep when swapcache_prepare fails
mseal: update mseal.rst
mm: split critical region in remap_file_pages() and invoke LSMs in between
selftests/mm: fix deadlock for fork after pthread_create with atomic_bool
Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
tools: testing: add expand-only mode VMA test
mm/vma: add expand-only VMA merge mode and optimise do_brk_flags()
resource,kexec: walk_system_ram_res_rev must retain resource flags
nilfs2: fix kernel bug due to missing clearing of checked flag
mm: numa_clear_kernel_node_hotplug: Add NUMA_NO_NODE check for node id
ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow
mm: shmem: fix data-race in shmem_getattr()
mm: mark mas allocation in vms_abort_munmap_vmas as __GFP_NOFAIL
x86/traps: move kmsan check after instrumentation_begin
resource: remove dependency on SPARSEMEM from GET_FREE_REGION
mm/mmap: fix race in mmap_region() with ftruncate()
mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves
fork: only invoke khugepaged, ksm hooks if no error
fork: do not invoke uffd on fork if error occurs
...
-rw-r--r-- | Documentation/userspace-api/mseal.rst | 307 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 12 | ||||
-rw-r--r-- | fs/nilfs2/page.c | 1 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 8 | ||||
-rw-r--r-- | fs/userfaultfd.c | 28 | ||||
-rw-r--r-- | include/linux/ksm.h | 10 | ||||
-rw-r--r-- | include/linux/userfaultfd_k.h | 5 | ||||
-rw-r--r-- | kernel/fork.c | 12 | ||||
-rw-r--r-- | kernel/resource.c | 4 | ||||
-rw-r--r-- | mm/Kconfig | 1 | ||||
-rw-r--r-- | mm/memory.c | 15 | ||||
-rw-r--r-- | mm/mmap.c | 84 | ||||
-rw-r--r-- | mm/numa_memblks.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 10 | ||||
-rw-r--r-- | mm/pagewalk.c | 16 | ||||
-rw-r--r-- | mm/shmem.c | 2 | ||||
-rw-r--r-- | mm/vma.c | 23 | ||||
-rw-r--r-- | mm/vma.h | 26 | ||||
-rw-r--r-- | tools/testing/selftests/mm/uffd-common.c | 5 | ||||
-rw-r--r-- | tools/testing/selftests/mm/uffd-common.h | 3 | ||||
-rw-r--r-- | tools/testing/selftests/mm/uffd-unit-tests.c | 24 | ||||
-rw-r--r-- | tools/testing/vma/vma.c | 40 |
22 files changed, 386 insertions, 252 deletions
diff --git a/Documentation/userspace-api/mseal.rst b/Documentation/userspace-api/mseal.rst index 4132eec995a3..41102f74c5e2 100644 --- a/Documentation/userspace-api/mseal.rst +++ b/Documentation/userspace-api/mseal.rst @@ -23,177 +23,166 @@ applications can additionally seal security critical data at runtime. A similar feature already exists in the XNU kernel with the VM_FLAGS_PERMANENT flag [1] and on OpenBSD with the mimmutable syscall [2]. -User API -======== -mseal() ------------ -The mseal() syscall has the following signature: - -``int mseal(void addr, size_t len, unsigned long flags)`` - -**addr/len**: virtual memory address range. - -The address range set by ``addr``/``len`` must meet: - - The start address must be in an allocated VMA. - - The start address must be page aligned. - - The end address (``addr`` + ``len``) must be in an allocated VMA. - - no gap (unallocated memory) between start and end address. - -The ``len`` will be paged aligned implicitly by the kernel. - -**flags**: reserved for future use. - -**return values**: - -- ``0``: Success. - -- ``-EINVAL``: - - Invalid input ``flags``. - - The start address (``addr``) is not page aligned. - - Address range (``addr`` + ``len``) overflow. - -- ``-ENOMEM``: - - The start address (``addr``) is not allocated. - - The end address (``addr`` + ``len``) is not allocated. - - A gap (unallocated memory) between start and end address. - -- ``-EPERM``: - - sealing is supported only on 64-bit CPUs, 32-bit is not supported. - -- For above error cases, users can expect the given memory range is - unmodified, i.e. no partial update. - -- There might be other internal errors/cases not listed here, e.g. - error during merging/splitting VMAs, or the process reaching the max - number of supported VMAs. In those cases, partial updates to the given - memory range could happen. However, those cases should be rare. - -**Blocked operations after sealing**: - Unmapping, moving to another location, and shrinking the size, - via munmap() and mremap(), can leave an empty space, therefore - can be replaced with a VMA with a new set of attributes. - - Moving or expanding a different VMA into the current location, - via mremap(). - - Modifying a VMA via mmap(MAP_FIXED). - - Size expansion, via mremap(), does not appear to pose any - specific risks to sealed VMAs. It is included anyway because - the use case is unclear. In any case, users can rely on - merging to expand a sealed VMA. - - mprotect() and pkey_mprotect(). - - Some destructive madvice() behaviors (e.g. MADV_DONTNEED) - for anonymous memory, when users don't have write permission to the - memory. Those behaviors can alter region contents by discarding pages, - effectively a memset(0) for anonymous memory. - - Kernel will return -EPERM for blocked operations. - - For blocked operations, one can expect the given address is unmodified, - i.e. no partial update. Note, this is different from existing mm - system call behaviors, where partial updates are made till an error is - found and returned to userspace. To give an example: - - Assume following code sequence: - - - ptr = mmap(null, 8192, PROT_NONE); - - munmap(ptr + 4096, 4096); - - ret1 = mprotect(ptr, 8192, PROT_READ); - - mseal(ptr, 4096); - - ret2 = mprotect(ptr, 8192, PROT_NONE); - - ret1 will be -ENOMEM, the page from ptr is updated to PROT_READ. - - ret2 will be -EPERM, the page remains to be PROT_READ. - -**Note**: - -- mseal() only works on 64-bit CPUs, not 32-bit CPU. - -- users can call mseal() multiple times, mseal() on an already sealed memory - is a no-action (not error). - -- munseal() is not supported. - -Use cases: -========== +SYSCALL +======= +mseal syscall signature +----------------------- + ``int mseal(void \* addr, size_t len, unsigned long flags)`` + + **addr**/**len**: virtual memory address range. + The address range set by **addr**/**len** must meet: + - The start address must be in an allocated VMA. + - The start address must be page aligned. + - The end address (**addr** + **len**) must be in an allocated VMA. + - no gap (unallocated memory) between start and end address. + + The ``len`` will be paged aligned implicitly by the kernel. + + **flags**: reserved for future use. + + **Return values**: + - **0**: Success. + - **-EINVAL**: + * Invalid input ``flags``. + * The start address (``addr``) is not page aligned. + * Address range (``addr`` + ``len``) overflow. + - **-ENOMEM**: + * The start address (``addr``) is not allocated. + * The end address (``addr`` + ``len``) is not allocated. + * A gap (unallocated memory) between start and end address. + - **-EPERM**: + * sealing is supported only on 64-bit CPUs, 32-bit is not supported. + + **Note about error return**: + - For above error cases, users can expect the given memory range is + unmodified, i.e. no partial update. + - There might be other internal errors/cases not listed here, e.g. + error during merging/splitting VMAs, or the process reaching the maximum + number of supported VMAs. In those cases, partial updates to the given + memory range could happen. However, those cases should be rare. + + **Architecture support**: + mseal only works on 64-bit CPUs, not 32-bit CPUs. + + **Idempotent**: + users can call mseal multiple times. mseal on an already sealed memory + is a no-action (not error). + + **no munseal** + Once mapping is sealed, it can't be unsealed. The kernel should never + have munseal, this is consistent with other sealing feature, e.g. + F_SEAL_SEAL for file. + +Blocked mm syscall for sealed mapping +------------------------------------- + It might be important to note: **once the mapping is sealed, it will + stay in the process's memory until the process terminates**. + + Example:: + + *ptr = mmap(0, 4096, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + rc = mseal(ptr, 4096, 0); + /* munmap will fail */ + rc = munmap(ptr, 4096); + assert(rc < 0); + + Blocked mm syscall: + - munmap + - mmap + - mremap + - mprotect and pkey_mprotect + - some destructive madvise behaviors: MADV_DONTNEED, MADV_FREE, + MADV_DONTNEED_LOCKED, MADV_FREE, MADV_DONTFORK, MADV_WIPEONFORK + + The first set of syscalls to block is munmap, mremap, mmap. They can + either leave an empty space in the address space, therefore allowing + replacement with a new mapping with new set of attributes, or can + overwrite the existing mapping with another mapping. + + mprotect and pkey_mprotect are blocked because they changes the + protection bits (RWX) of the mapping. + + Certain destructive madvise behaviors, specifically MADV_DONTNEED, + MADV_FREE, MADV_DONTNEED_LOCKED, and MADV_WIPEONFORK, can introduce + risks when applied to anonymous memory by threads lacking write + permissions. Consequently, these operations are prohibited under such + conditions. The aforementioned behaviors have the potential to modify + region contents by discarding pages, effectively performing a memset(0) + operation on the anonymous memory. + + Kernel will return -EPERM for blocked syscalls. + + When blocked syscall return -EPERM due to sealing, the memory regions may + or may not be changed, depends on the syscall being blocked: + + - munmap: munmap is atomic. If one of VMAs in the given range is + sealed, none of VMAs are updated. + - mprotect, pkey_mprotect, madvise: partial update might happen, e.g. + when mprotect over multiple VMAs, mprotect might update the beginning + VMAs before reaching the sealed VMA and return -EPERM. + - mmap and mremap: undefined behavior. + +Use cases +========= - glibc: The dynamic linker, during loading ELF executables, can apply sealing to - non-writable memory segments. - -- Chrome browser: protect some security sensitive data-structures. + mapping segments. -Notes on which memory to seal: -============================== +- Chrome browser: protect some security sensitive data structures. -It might be important to note that sealing changes the lifetime of a mapping, -i.e. the sealed mapping won’t be unmapped till the process terminates or the -exec system call is invoked. Applications can apply sealing to any virtual -memory region from userspace, but it is crucial to thoroughly analyze the -mapping's lifetime prior to apply the sealing. +When not to use mseal +===================== +Applications can apply sealing to any virtual memory region from userspace, +but it is *crucial to thoroughly analyze the mapping's lifetime* prior to +apply the sealing. This is because the sealed mapping *won’t be unmapped* +until the process terminates or the exec system call is invoked. For example: + - aio/shm + aio/shm can call mmap and munmap on behalf of userspace, e.g. + ksys_shmdt() in shm.c. The lifetimes of those mapping are not tied to + the lifetime of the process. If those memories are sealed from userspace, + then munmap will fail, causing leaks in VMA address space during the + lifetime of the process. + + - ptr allocated by malloc (heap) + Don't use mseal on the memory ptr return from malloc(). + malloc() is implemented by allocator, e.g. by glibc. Heap manager might + allocate a ptr from brk or mapping created by mmap. + If an app calls mseal on a ptr returned from malloc(), this can affect + the heap manager's ability to manage the mappings; the outcome is + non-deterministic. + + Example:: + + ptr = malloc(size); + /* don't call mseal on ptr return from malloc. */ + mseal(ptr, size); + /* free will success, allocator can't shrink heap lower than ptr */ + free(ptr); + +mseal doesn't block +=================== +In a nutshell, mseal blocks certain mm syscall from modifying some of VMA's +attributes, such as protection bits (RWX). Sealed mappings doesn't mean the +memory is immutable. -- aio/shm - - aio/shm can call mmap()/munmap() on behalf of userspace, e.g. ksys_shmdt() in - shm.c. The lifetime of those mapping are not tied to the lifetime of the - process. If those memories are sealed from userspace, then munmap() will fail, - causing leaks in VMA address space during the lifetime of the process. - -- Brk (heap) - - Currently, userspace applications can seal parts of the heap by calling - malloc() and mseal(). - let's assume following calls from user space: - - - ptr = malloc(size); - - mprotect(ptr, size, RO); - - mseal(ptr, size); - - free(ptr); - - Technically, before mseal() is added, the user can change the protection of - the heap by calling mprotect(RO). As long as the user changes the protection - back to RW before free(), the memory range can be reused. - - Adding mseal() into the picture, however, the heap is then sealed partially, - the user can still free it, but the memory remains to be RO. If the address - is re-used by the heap manager for another malloc, the process might crash - soon after. Therefore, it is important not to apply sealing to any memory - that might get recycled. - - Furthermore, even if the application never calls the free() for the ptr, - the heap manager may invoke the brk system call to shrink the size of the - heap. In the kernel, the brk-shrink will call munmap(). Consequently, - depending on the location of the ptr, the outcome of brk-shrink is - nondeterministic. - - -Additional notes: -================= As Jann Horn pointed out in [3], there are still a few ways to write -to RO memory, which is, in a way, by design. Those cases are not covered -by mseal(). If applications want to block such cases, sandbox tools (such as -seccomp, LSM, etc) might be considered. +to RO memory, which is, in a way, by design. And those could be blocked +by different security measures. Those cases are: -- Write to read-only memory through /proc/self/mem interface. -- Write to read-only memory through ptrace (such as PTRACE_POKETEXT). -- userfaultfd. + - Write to read-only memory through /proc/self/mem interface (FOLL_FORCE). + - Write to read-only memory through ptrace (such as PTRACE_POKETEXT). + - userfaultfd. The idea that inspired this patch comes from Stephen Röttger’s work in V8 CFI [4]. Chrome browser in ChromeOS will be the first user of this API. -Reference: -========== -[1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274 - -[2] https://man.openbsd.org/mimmutable.2 - -[3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com - -[4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc +Reference +========= +- [1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274 +- [2] https://man.openbsd.org/mimmutable.2 +- [3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com +- [4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d05392db5d0f..2dbadf347b5f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -261,12 +261,6 @@ static noinstr bool handle_bug(struct pt_regs *regs) int ud_type; u32 imm; - /* - * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() - * is a rare case that uses @regs without passing them to - * irqentry_enter(). - */ - kmsan_unpoison_entry_regs(regs); ud_type = decode_bug(regs->ip, &imm); if (ud_type == BUG_NONE) return handled; @@ -276,6 +270,12 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ instrumentation_begin(); /* + * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() + * is a rare case that uses @regs without passing them to + * irqentry_enter(). + */ + kmsan_unpoison_entry_regs(regs); + /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 5436eb0424bd..10def4b55995 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct folio *folio) folio_clear_uptodate(folio); folio_clear_mappedtodisk(folio); + folio_clear_checked(folio); head = folio_buffers(folio); if (head) { diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 58887456e3c5..06af21982c16 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1787,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode, return 0; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di); + + if (byte_start > id_count || byte_start + byte_len > id_count) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, byte_start + byte_len, 0); if (ret) { diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 68cdd89c97a3..7c0bd0b55f88 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -692,6 +692,34 @@ void dup_userfaultfd_complete(struct list_head *fcs) } } +void dup_userfaultfd_fail(struct list_head *fcs) +{ + struct userfaultfd_fork_ctx *fctx, *n; + + /* + * An error has occurred on fork, we will tear memory down, but have + * allocated memory for fctx's and raised reference counts for both the + * original and child contexts (and on the mm for each as a result). + * + * These would ordinarily be taken care of by a user handling the event, + * but we are no longer doing so, so manually clean up here. + * + * mm tear down will take care of cleaning up VMA contexts. + */ + list_for_each_entry_safe(fctx, n, fcs, list) { + struct userfaultfd_ctx *octx = fctx->orig; + struct userfaultfd_ctx *ctx = fctx->new; + + atomic_dec(&octx->mmap_changing); + VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0); + userfaultfd_ctx_put(octx); + userfaultfd_ctx_put(ctx); + + list_del(&fctx->list); + kfree(fctx); + } +} + void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *vm_ctx) { diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 11690dacd986..ec9c05044d4f 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -54,12 +54,11 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm) return atomic_long_read(&mm->ksm_zero_pages); } -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { + /* Adding mm to ksm is best effort on fork. */ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) - return __ksm_enter(mm); - - return 0; + __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) @@ -107,9 +106,8 @@ static inline int ksm_disable(struct mm_struct *mm) return 0; } -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - return 0; } static inline int ksm_execve(struct mm_struct *mm) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 9fc6ce15c499..cb40f1a1d081 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -249,6 +249,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); +void dup_userfaultfd_fail(struct list_head *); extern void mremap_userfaultfd_prep(struct vm_area_struct *, struct vm_userfaultfd_ctx *); @@ -351,6 +352,10 @@ static inline void dup_userfaultfd_complete(struct list_head *l) { } +static inline void dup_userfaultfd_fail(struct list_head *l) +{ +} + static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *ctx) { diff --git a/kernel/fork.c b/kernel/fork.c index 89ceb4a68af2..3bf38d260cb3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -653,11 +653,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - khugepaged_fork(mm, oldmm); - /* Use __mt_dup() to efficiently build an identical maple tree. */ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); if (unlikely(retval)) @@ -760,6 +755,8 @@ loop_out: vma_iter_free(&vmi); if (!retval) { mt_set_in_rcu(vmi.mas.tree); + ksm_fork(mm, oldmm); + khugepaged_fork(mm, oldmm); } else if (mpnt) { /* * The entire maple tree has already been duplicated. If the @@ -775,7 +772,10 @@ out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); mmap_write_unlock(oldmm); - dup_userfaultfd_complete(&uf); + if (!retval) + dup_userfaultfd_complete(&uf); + else + dup_userfaultfd_fail(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; diff --git a/kernel/resource.c b/kernel/resource.c index b730bd28b422..4101016e8b20 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg, rams_size += 16; } - rams[i].start = res.start; - rams[i++].end = res.end; - + rams[i++] = res; start = res.end + 1; } diff --git a/mm/Kconfig b/mm/Kconfig index 4c9f5ea13271..33fa51d608dc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1085,7 +1085,6 @@ config HMM_MIRROR depends on MMU config GET_FREE_REGION - depends on SPARSEMEM bool config DEVICE_PRIVATE diff --git a/mm/memory.c b/mm/memory.c index 3ccee51adfbb..bdf77a3ec47b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq); + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *swapcache, *folio = NULL; + DECLARE_WAITQUEUE(wait, current); struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; @@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * Relax a bit to prevent rapid * repeated page faults. */ + add_wait_queue(&swapcache_wq, &wait); schedule_timeout_uninterruptible(1); + remove_wait_queue(&swapcache_wq, &wait); goto out_page; } need_clear_cache = true; @@ -4604,8 +4609,11 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); out: /* Clear the swap cache pin for direct swapin after PTL unlock */ - if (need_clear_cache) + if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); + if (waitqueue_active(&swapcache_wq)) + wake_up(&swapcache_wq); + } if (si) put_swap_device(si); return ret; @@ -4620,8 +4628,11 @@ out_release: folio_unlock(swapcache); folio_put(swapcache); } - if (need_clear_cache) + if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); + if (waitqueue_active(&swapcache_wq)) + wake_up(&swapcache_wq); + } if (si) put_swap_device(si); return ret; diff --git a/mm/mmap.c b/mm/mmap.c index 9c0fb43064b5..1e0e34cb993f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1418,6 +1418,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vmg.flags = vm_flags; } + /* + * clear PTEs while the vma is still in the tree so that rmap + * cannot race with the freeing later in the truncate scenario. + * This is also needed for call_mmap(), which is why vm_ops + * close function is called. + */ + vms_clean_up_area(&vms, &mas_detach); vma = vma_merge_new_range(&vmg); if (vma) goto expanded; @@ -1439,11 +1446,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (file) { vma->vm_file = get_file(file); - /* - * call_mmap() may map PTE, so ensure there are no existing PTEs - * and call the vm_ops close function if one exists. - */ - vms_clean_up_area(&vms, &mas_detach); error = call_mmap(file, vma); if (error) goto unmap_and_free_vma; @@ -1640,6 +1642,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long populate = 0; unsigned long ret = -EINVAL; struct file *file; + vm_flags_t vm_flags; pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n", current->comm, current->pid); @@ -1656,12 +1659,60 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; - if (mmap_write_lock_killable(mm)) + if (mmap_read_lock_killable(mm)) + return -EINTR; + + /* + * Look up VMA under read lock first so we can perform the security + * without holding locks (which can be problematic). We reacquire a + * write lock later and check nothing changed underneath us. + */ + vma = vma_lookup(mm, start); + + if (!vma || !(vma->vm_flags & VM_SHARED)) { + mmap_read_unlock(mm); + return -EINVAL; + } + + prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; + prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; + prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; + + flags &= MAP_NONBLOCK; + flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; + if (vma->vm_flags & VM_LOCKED) + flags |= MAP_LOCKED; + + /* Save vm_flags used to calculate prot and flags, and recheck later. */ + vm_flags = vma->vm_flags; + file = get_file(vma->vm_file); + + mmap_read_unlock(mm); + + /* Call outside mmap_lock to be consistent with other callers. */ + ret = security_mmap_file(file, prot, flags); + if (ret) { + fput(file); + return ret; + } + + ret = -EINVAL; + + /* OK security check passed, take write lock + let it rip. */ + if (mmap_write_lock_killable(mm)) { + fput(file); return -EINTR; + } vma = vma_lookup(mm, start); - if (!vma || !(vma->vm_flags & VM_SHARED)) + if (!vma) + goto out; + + /* Make sure things didn't change under us. */ + if (vma->vm_flags != vm_flags) + goto out; + if (vma->vm_file != file) goto out; if (start + size > vma->vm_end) { @@ -1689,25 +1740,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, goto out; } - prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; - prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; - prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; - - flags &= MAP_NONBLOCK; - flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; - if (vma->vm_flags & VM_LOCKED) - flags |= MAP_LOCKED; - - file = get_file(vma->vm_file); - ret = security_mmap_file(vma->vm_file, prot, flags); - if (ret) - goto out_fput; ret = do_mmap(vma->vm_file, start, size, prot, flags, 0, pgoff, &populate, NULL); -out_fput: - fput(file); out: mmap_write_unlock(mm); + fput(file); if (populate) mm_populate(ret, populate); if (!IS_ERR_VALUE(ret)) @@ -1754,7 +1791,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); vmg.prev = vma; - vma_iter_next_range(vmi); + /* vmi is positioned at prev, which this mode expects. */ + vmg.merge_flags = VMG_FLAG_JUST_EXPAND; if (vma_merge_new_range(&vmg)) goto out; diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index be52b93a9c58..a3877e9bc878 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -349,7 +349,7 @@ static void __init numa_clear_kernel_node_hotplug(void) for_each_reserved_mem_region(mb_region) { int nid = memblock_get_region_node(mb_region); - if (nid != MAX_NUMNODES) + if (numa_valid_node(nid)) node_set(nid, reserved_nodemask); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8afab64814dc..94a2ffe28008 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2893,12 +2893,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, page = __rmqueue(zone, order, migratetype, alloc_flags); /* - * If the allocation fails, allow OOM handling access - * to HIGHATOMIC reserves as failing now is worse than - * failing a high-order atomic allocation in the - * future. + * If the allocation fails, allow OOM handling and + * order-0 (atomic) allocs access to HIGHATOMIC + * reserves as failing now is worse than failing a + * high-order atomic allocation in the future. */ - if (!page && (alloc_flags & ALLOC_OOM)) + if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK))) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 461ea3bbd8d9..5f9f01532e67 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -744,7 +744,8 @@ struct folio *folio_walk_start(struct folio_walk *fw, pud = pudp_get(pudp); if (pud_none(pud)) goto not_found; - if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) { + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && + (!pud_present(pud) || pud_leaf(pud))) { ptl = pud_lock(vma->vm_mm, pudp); pud = pudp_get(pudp); @@ -753,6 +754,10 @@ struct folio *folio_walk_start(struct folio_walk *fw, fw->pudp = pudp; fw->pud = pud; + /* + * TODO: FW_MIGRATION support for PUD migration entries + * once there are relevant users. + */ if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) { spin_unlock(ptl); goto not_found; @@ -769,12 +774,13 @@ struct folio *folio_walk_start(struct folio_walk *fw, } pmd_table: - VM_WARN_ON_ONCE(pud_leaf(*pudp)); + VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud)); pmdp = pmd_offset(pudp, addr); pmd = pmdp_get_lockless(pmdp); if (pmd_none(pmd)) goto not_found; - if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) { + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && + (!pmd_present(pmd) || pmd_leaf(pmd))) { ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_get(pmdp); @@ -786,7 +792,7 @@ pmd_table: if (pmd_none(pmd)) { spin_unlock(ptl); goto not_found; - } else if (!pmd_leaf(pmd)) { + } else if (pmd_present(pmd) && !pmd_leaf(pmd)) { spin_unlock(ptl); goto pte_table; } else if (pmd_present(pmd)) { @@ -812,7 +818,7 @@ pmd_table: } pte_table: - VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp))); + VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd)); ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); if (!ptep) goto not_found; diff --git a/mm/shmem.c b/mm/shmem.c index c5adb987b23c..4ba1d00fabda 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1166,7 +1166,9 @@ static int shmem_getattr(struct mnt_idmap *idmap, stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); + inode_lock_shared(inode); generic_fillattr(idmap, request_mask, inode, stat); + inode_unlock_shared(inode); if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; @@ -917,6 +917,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) pgoff_t pgoff = vmg->pgoff; pgoff_t pglen = PHYS_PFN(end - start); bool can_merge_left, can_merge_right; + bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND; mmap_assert_write_locked(vmg->mm); VM_WARN_ON(vmg->vma); @@ -930,7 +931,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) return NULL; can_merge_left = can_vma_merge_left(vmg); - can_merge_right = can_vma_merge_right(vmg, can_merge_left); + can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left); /* If we can merge with the next VMA, adjust vmg accordingly. */ if (can_merge_right) { @@ -953,7 +954,11 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) if (can_merge_right && !can_merge_remove_vma(next)) vmg->end = end; - vma_prev(vmg->vmi); /* Equivalent to going to the previous range */ + /* In expand-only case we are already positioned at prev. */ + if (!just_expand) { + /* Equivalent to going to the previous range. */ + vma_prev(vmg->vmi); + } } /* @@ -967,12 +972,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) } /* If expansion failed, reset state. Allows us to retry merge later. */ - vmg->vma = NULL; - vmg->start = start; - vmg->end = end; - vmg->pgoff = pgoff; - if (vmg->vma == prev) - vma_iter_set(vmg->vmi, start); + if (!just_expand) { + vmg->vma = NULL; + vmg->start = start; + vmg->end = end; + vmg->pgoff = pgoff; + if (vmg->vma == prev) + vma_iter_set(vmg->vmi, start); + } return NULL; } @@ -59,6 +59,17 @@ enum vma_merge_state { VMA_MERGE_SUCCESS, }; +enum vma_merge_flags { + VMG_FLAG_DEFAULT = 0, + /* + * If we can expand, simply do so. We know there is nothing to merge to + * the right. Does not reset state upon failure to merge. The VMA + * iterator is assumed to be positioned at the previous VMA, rather than + * at the gap. + */ + VMG_FLAG_JUST_EXPAND = 1 << 0, +}; + /* Represents a VMA merge operation. */ struct vma_merge_struct { struct mm_struct *mm; @@ -75,6 +86,7 @@ struct vma_merge_struct { struct mempolicy *policy; struct vm_userfaultfd_ctx uffd_ctx; struct anon_vma_name *anon_name; + enum vma_merge_flags merge_flags; enum vma_merge_state state; }; @@ -99,6 +111,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .flags = flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ + .merge_flags = VMG_FLAG_DEFAULT, \ } #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ @@ -118,6 +131,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .uffd_ctx = vma_->vm_userfaultfd_ctx, \ .anon_name = anon_vma_name(vma_), \ .state = VMA_MERGE_START, \ + .merge_flags = VMG_FLAG_DEFAULT, \ } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE @@ -241,15 +255,9 @@ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, * failure method of leaving a gap where the MAP_FIXED mapping failed. */ mas_set_range(mas, vms->start, vms->end - 1); - if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) { - pr_warn_once("%s: (%d) Unable to abort munmap() operation\n", - current->comm, current->pid); - /* Leaving vmas detached and in-tree may hamper recovery */ - reattach_vmas(mas_detach); - } else { - /* Clean up the insertion of the unfortunate gap */ - vms_complete_munmap_vmas(vms, mas_detach); - } + mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); + /* Clean up the insertion of the unfortunate gap */ + vms_complete_munmap_vmas(vms, mas_detach); } int diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 852e7281026e..717539eddf98 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -18,7 +18,7 @@ bool test_uffdio_wp = true; unsigned long long *count_verify; uffd_test_ops_t *uffd_test_ops; uffd_test_case_ops_t *uffd_test_case_ops; -pthread_barrier_t ready_for_fork; +atomic_bool ready_for_fork; static int uffd_mem_fd_create(off_t mem_size, bool hugetlb) { @@ -519,8 +519,7 @@ void *uffd_poll_thread(void *arg) pollfd[1].fd = pipefd[cpu*2]; pollfd[1].events = POLLIN; - /* Ready for parent thread to fork */ - pthread_barrier_wait(&ready_for_fork); + ready_for_fork = true; for (;;) { ret = poll(pollfd, 2, -1); diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 3e6228d8e0dc..a70ae10b5f62 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -33,6 +33,7 @@ #include <inttypes.h> #include <stdint.h> #include <sys/random.h> +#include <stdatomic.h> #include "../kselftest.h" #include "vm_util.h" @@ -104,7 +105,7 @@ extern bool map_shared; extern bool test_uffdio_wp; extern unsigned long long *count_verify; extern volatile bool test_uffdio_copy_eexist; -extern pthread_barrier_t ready_for_fork; +extern atomic_bool ready_for_fork; extern uffd_test_ops_t anon_uffd_test_ops; extern uffd_test_ops_t shmem_uffd_test_ops; diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index c8a3b1c7edff..a2e71b1636e7 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -241,8 +241,7 @@ static void *fork_event_consumer(void *data) fork_event_args *args = data; struct uffd_msg msg = { 0 }; - /* Ready for parent thread to fork */ - pthread_barrier_wait(&ready_for_fork); + ready_for_fork = true; /* Read until a full msg received */ while (uffd_read_msg(args->parent_uffd, &msg)); @@ -311,12 +310,11 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin) /* Prepare a thread to resolve EVENT_FORK */ if (with_event) { - pthread_barrier_init(&ready_for_fork, NULL, 2); + ready_for_fork = false; if (pthread_create(&thread, NULL, fork_event_consumer, &args)) err("pthread_create()"); - /* Wait for child thread to start before forking */ - pthread_barrier_wait(&ready_for_fork); - pthread_barrier_destroy(&ready_for_fork); + while (!ready_for_fork) + ; /* Wait for the poll_thread to start executing before forking */ } child = fork(); @@ -781,7 +779,7 @@ static void uffd_sigbus_test_common(bool wp) char c; struct uffd_args args = { 0 }; - pthread_barrier_init(&ready_for_fork, NULL, 2); + ready_for_fork = false; fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); @@ -798,9 +796,8 @@ static void uffd_sigbus_test_common(bool wp) if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) err("uffd_poll_thread create"); - /* Wait for child thread to start before forking */ - pthread_barrier_wait(&ready_for_fork); - pthread_barrier_destroy(&ready_for_fork); + while (!ready_for_fork) + ; /* Wait for the poll_thread to start executing before forking */ pid = fork(); if (pid < 0) @@ -841,7 +838,7 @@ static void uffd_events_test_common(bool wp) char c; struct uffd_args args = { 0 }; - pthread_barrier_init(&ready_for_fork, NULL, 2); + ready_for_fork = false; fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); if (uffd_register(uffd, area_dst, nr_pages * page_size, @@ -852,9 +849,8 @@ static void uffd_events_test_common(bool wp) if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) err("uffd_poll_thread create"); - /* Wait for child thread to start before forking */ - pthread_barrier_wait(&ready_for_fork); - pthread_barrier_destroy(&ready_for_fork); + while (!ready_for_fork) + ; /* Wait for the poll_thread to start executing before forking */ pid = fork(); if (pid < 0) diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index c53f220eb6cc..b33b47342d41 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -1522,6 +1522,45 @@ static bool test_copy_vma(void) return true; } +static bool test_expand_only_mode(void) +{ + unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + struct mm_struct mm = {}; + VMA_ITERATOR(vmi, &mm, 0); + struct vm_area_struct *vma_prev, *vma; + VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, flags, 5); + + /* + * Place a VMA prior to the one we're expanding so we assert that we do + * not erroneously try to traverse to the previous VMA even though we + * have, through the use of VMG_FLAG_JUST_EXPAND, indicated we do not + * need to do so. + */ + alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + + /* + * We will be positioned at the prev VMA, but looking to expand to + * 0x9000. + */ + vma_iter_set(&vmi, 0x3000); + vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vmg.prev = vma_prev; + vmg.merge_flags = VMG_FLAG_JUST_EXPAND; + + vma = vma_merge_new_range(&vmg); + ASSERT_NE(vma, NULL); + ASSERT_EQ(vma, vma_prev); + ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); + ASSERT_EQ(vma->vm_start, 0x3000); + ASSERT_EQ(vma->vm_end, 0x9000); + ASSERT_EQ(vma->vm_pgoff, 3); + ASSERT_TRUE(vma_write_started(vma)); + ASSERT_EQ(vma_iter_addr(&vmi), 0x3000); + + cleanup_mm(&mm, &vmi); + return true; +} + int main(void) { int num_tests = 0, num_fail = 0; @@ -1553,6 +1592,7 @@ int main(void) TEST(vmi_prealloc_fail); TEST(merge_extend); TEST(copy_vma); + TEST(expand_only_mode); #undef TEST |