diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig.debug | 32 | ||||
-rw-r--r-- | mm/filemap.c | 25 | ||||
-rw-r--r-- | mm/gup.c | 13 | ||||
-rw-r--r-- | mm/kasan/common.c | 62 | ||||
-rw-r--r-- | mm/kasan/kasan_test.c | 46 | ||||
-rw-r--r-- | mm/memcontrol.c | 12 | ||||
-rw-r--r-- | mm/memory.c | 27 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 2 | ||||
-rw-r--r-- | mm/mseal.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 7 | ||||
-rw-r--r-- | mm/shmem.c | 11 | ||||
-rw-r--r-- | mm/slab.h | 11 | ||||
-rw-r--r-- | mm/slab_common.c | 292 | ||||
-rw-r--r-- | mm/slub.c | 414 | ||||
-rw-r--r-- | mm/sparse.c | 2 | ||||
-rw-r--r-- | mm/userfaultfd.c | 29 | ||||
-rw-r--r-- | mm/vmalloc.c | 7 | ||||
-rw-r--r-- | mm/vmscan.c | 24 |
19 files changed, 612 insertions, 408 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index afc72fde0f03..41a58536531d 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -70,6 +70,38 @@ config SLUB_DEBUG_ON off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying "slab_debug=-". +config SLUB_RCU_DEBUG + bool "Enable UAF detection in TYPESAFE_BY_RCU caches (for KASAN)" + depends on SLUB_DEBUG + # SLUB_RCU_DEBUG should build fine without KASAN, but is currently useless + # without KASAN, so mark it as a dependency of KASAN for now. + depends on KASAN + default KASAN_GENERIC || KASAN_SW_TAGS + help + Make SLAB_TYPESAFE_BY_RCU caches behave approximately as if the cache + was not marked as SLAB_TYPESAFE_BY_RCU and every caller used + kfree_rcu() instead. + + This is intended for use in combination with KASAN, to enable KASAN to + detect use-after-free accesses in such caches. + (KFENCE is able to do that independent of this flag.) + + This might degrade performance. + Unfortunately this also prevents a very specific bug pattern from + triggering (insufficient checks against an object being recycled + within the RCU grace period); so this option can be turned off even on + KASAN builds, in case you want to test for such a bug. + + If you're using this for testing bugs / fuzzing and care about + catching all the bugs WAY more than performance, you might want to + also turn on CONFIG_RCU_STRICT_GRACE_PERIOD. + + WARNING: + This is designed as a debugging feature, not a security feature. + Objects are sometimes recycled without RCU delay under memory pressure. + + If unsure, say N. + config PAGE_OWNER bool "Track page owner" depends on DEBUG_KERNEL && STACKTRACE_SUPPORT diff --git a/mm/filemap.c b/mm/filemap.c index d62150418b91..60a9cc593e9b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2712,14 +2712,12 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count) } EXPORT_SYMBOL_GPL(kiocb_write_and_wait); -int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +int filemap_invalidate_pages(struct address_space *mapping, + loff_t pos, loff_t end, bool nowait) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - loff_t pos = iocb->ki_pos; - loff_t end = pos + count - 1; int ret; - if (iocb->ki_flags & IOCB_NOWAIT) { + if (nowait) { /* we could block if there are any pages in the range */ if (filemap_range_has_page(mapping, pos, end)) return -EAGAIN; @@ -2738,6 +2736,15 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } + +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + + return filemap_invalidate_pages(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1, + iocb->ki_flags & IOCB_NOWAIT); +} EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** @@ -3987,7 +3994,6 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) ssize_t written = 0; do { - struct page *page; struct folio *folio; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ @@ -4017,11 +4023,10 @@ retry: } status = a_ops->write_begin(file, mapping, pos, bytes, - &page, &fsdata); + &folio, &fsdata); if (unlikely(status < 0)) break; - folio = page_folio(page); offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; @@ -4033,7 +4038,7 @@ retry: flush_dcache_folio(folio); status = a_ops->write_end(file, mapping, pos, bytes, copied, - page, fsdata); + folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); if (unlikely(status < 0)) @@ -4231,7 +4236,7 @@ int filemap_invalidate_inode(struct inode *inode, bool flush, } /* Wait for writeback to complete on all folios and discard. */ - truncate_inode_pages_range(mapping, start, end); + invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); unlock: filemap_invalidate_unlock(mapping); @@ -416,6 +416,19 @@ void unpin_user_pages(struct page **pages, unsigned long npages) EXPORT_SYMBOL(unpin_user_pages); /** + * unpin_user_folio() - release pages of a folio + * @folio: pointer to folio to be released + * @npages: number of pages of same folio + * + * Release npages of the folio + */ +void unpin_user_folio(struct folio *folio, unsigned long npages) +{ + gup_put_folio(folio, npages, FOLL_PIN); +} +EXPORT_SYMBOL(unpin_user_folio); + +/** * unpin_folios() - release an array of gup-pinned folios. * @folios: array of folios to be marked dirty and released. * @nfolios: number of folios in the @folios array. diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 85e7c6b4575c..ed4873e18c75 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -208,15 +208,12 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, return (void *)object; } -static inline bool poison_slab_object(struct kmem_cache *cache, void *object, - unsigned long ip, bool init) +/* Returns true when freeing the object is not safe. */ +static bool check_slab_allocation(struct kmem_cache *cache, void *object, + unsigned long ip) { - void *tagged_object; - - if (!kasan_arch_is_ready()) - return false; + void *tagged_object = object; - tagged_object = object; object = kasan_reset_tag(object); if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != object)) { @@ -224,37 +221,47 @@ static inline bool poison_slab_object(struct kmem_cache *cache, void *object, return true; } - /* RCU slabs could be legally used after free within the RCU period. */ - if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) - return false; - if (!kasan_byte_accessible(tagged_object)) { kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE); return true; } + return false; +} + +static inline void poison_slab_object(struct kmem_cache *cache, void *object, + bool init, bool still_accessible) +{ + void *tagged_object = object; + + object = kasan_reset_tag(object); + + /* RCU slabs could be legally used after free within the RCU period. */ + if (unlikely(still_accessible)) + return; + kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), KASAN_SLAB_FREE, init); if (kasan_stack_collection_enabled()) kasan_save_free_info(cache, tagged_object); +} - return false; +bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, + unsigned long ip) +{ + if (!kasan_arch_is_ready() || is_kfence_address(object)) + return false; + return check_slab_allocation(cache, object, ip); } -bool __kasan_slab_free(struct kmem_cache *cache, void *object, - unsigned long ip, bool init) +bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, + bool still_accessible) { - if (is_kfence_address(object)) + if (!kasan_arch_is_ready() || is_kfence_address(object)) return false; - /* - * If the object is buggy, do not let slab put the object onto the - * freelist. The object will thus never be allocated again and its - * metadata will never get released. - */ - if (poison_slab_object(cache, object, ip, init)) - return true; + poison_slab_object(cache, object, init, still_accessible); /* * If the object is put into quarantine, do not let slab put the object @@ -504,11 +511,16 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) return true; } - if (is_kfence_address(ptr)) - return false; + if (is_kfence_address(ptr) || !kasan_arch_is_ready()) + return true; slab = folio_slab(folio); - return !poison_slab_object(slab->slab_cache, ptr, ip, false); + + if (check_slab_allocation(slab->slab_cache, ptr, ip)) + return false; + + poison_slab_object(slab->slab_cache, ptr, false, false); + return true; } void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 7b32be2a3cf0..567d33b493e2 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -996,6 +996,51 @@ static void kmem_cache_invalid_free(struct kunit *test) kmem_cache_destroy(cache); } +static void kmem_cache_rcu_uaf(struct kunit *test) +{ + char *p; + size_t size = 200; + struct kmem_cache *cache; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB_RCU_DEBUG); + + cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU, + NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + kunit_err(test, "Allocation failed: %s\n", __func__); + kmem_cache_destroy(cache); + return; + } + *p = 1; + + rcu_read_lock(); + + /* Free the object - this will internally schedule an RCU callback. */ + kmem_cache_free(cache, p); + + /* + * We should still be allowed to access the object at this point because + * the cache is SLAB_TYPESAFE_BY_RCU and we've been in an RCU read-side + * critical section since before the kmem_cache_free(). + */ + READ_ONCE(*p); + + rcu_read_unlock(); + + /* + * Wait for the RCU callback to execute; after this, the object should + * have actually been freed from KASAN's perspective. + */ + rcu_barrier(); + + KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*p)); + + kmem_cache_destroy(cache); +} + static void empty_cache_ctor(void *object) { } static void kmem_cache_double_destroy(struct kunit *test) @@ -1937,6 +1982,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmem_cache_oob), KUNIT_CASE(kmem_cache_double_free), KUNIT_CASE(kmem_cache_invalid_free), + KUNIT_CASE(kmem_cache_rcu_uaf), KUNIT_CASE(kmem_cache_double_destroy), KUNIT_CASE(kmem_cache_accounted), KUNIT_CASE(kmem_cache_bulk), diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f29157288b7d..d563fb515766 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3613,8 +3613,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) memcg1_soft_limit_reset(memcg); #ifdef CONFIG_ZSWAP memcg->zswap_max = PAGE_COUNTER_MAX; - WRITE_ONCE(memcg->zswap_writeback, - !parent || READ_ONCE(parent->zswap_writeback)); + WRITE_ONCE(memcg->zswap_writeback, true); #endif page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { @@ -5320,7 +5319,14 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { /* if zswap is disabled, do not block pages going to the swapping device */ - return !zswap_is_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback); + if (!zswap_is_enabled()) + return true; + + for (; memcg; memcg = parent_mem_cgroup(memcg)) + if (!READ_ONCE(memcg->zswap_writeback)) + return false; + + return true; } static u64 zswap_current_read(struct cgroup_subsys_state *css, diff --git a/mm/memory.c b/mm/memory.c index 3c01d68065be..ebfc9768f801 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2632,11 +2632,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -/* - * Variant of remap_pfn_range that does not call track_pfn_remap. The caller - * must have pre-validated the caching bits of the pgprot_t. - */ -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; @@ -2689,6 +2685,27 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, return 0; } +/* + * Variant of remap_pfn_range that does not call track_pfn_remap. The caller + * must have pre-validated the caching bits of the pgprot_t. + */ +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); + + if (!error) + return 0; + + /* + * A partial pfn range mapping is dangerous: it does not + * maintain page reference counts, and callers may free + * pages due to the error. So zap it early. + */ + zap_page_range_single(vma, addr, size, NULL); + return error; +} + /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 66267c26ca1b..951878ab627a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void) struct range mhp_get_pluggable_range(bool need_mapping) { - const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1; + const u64 max_phys = PHYSMEM_END; struct range mhp_range; if (need_mapping) { diff --git a/mm/mmap.c b/mm/mmap.c index d0dfc85b209b..6ddb278a5ee8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1229,7 +1229,7 @@ static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) return MAX_LFS_FILESIZE; /* Special "we do even unsigned file positions" case */ - if (file->f_mode & FMODE_UNSIGNED_OFFSET) + if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET) return 0; /* Yes, random drivers might want more. But I'm tired of buggy drivers */ diff --git a/mm/mseal.c b/mm/mseal.c index 15bba28acc00..c8787cc6ba55 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -256,7 +256,7 @@ static int apply_mm_seal(unsigned long start, unsigned long end) * * unseal() is not supported. */ -static int do_mseal(unsigned long start, size_t len_in, unsigned long flags) +int do_mseal(unsigned long start, size_t len_in, unsigned long flags) { size_t len; int ret = 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c565de8f48e9..91ace8ca97e2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1054,6 +1054,13 @@ __always_inline bool free_pages_prepare(struct page *page, reset_page_owner(page, order); page_table_check_free(page, order); pgalloc_tag_sub(page, 1 << order); + + /* + * The page is isolated and accounted for. + * Mark the codetag as empty to avoid accounting error + * when the page is freed by unpoison_memory(). + */ + clear_page_tag_ref(page); return false; } diff --git a/mm/shmem.c b/mm/shmem.c index 5a77acf6ac6a..b875852df51f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2878,7 +2878,7 @@ static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -2899,23 +2899,22 @@ shmem_write_begin(struct file *file, struct address_space *mapping, if (ret) return ret; - *pagep = folio_file_page(folio, index); - if (PageHWPoison(*pagep)) { + if (folio_test_hwpoison(folio) || + (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) { folio_unlock(folio); folio_put(folio); - *pagep = NULL; return -EIO; } + *foliop = folio; return 0; } static int shmem_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; if (pos + copied > inode->i_size) diff --git a/mm/slab.h b/mm/slab.h index dcdb56b8e7f5..f22fb760b286 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -422,7 +422,9 @@ kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller) gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ -int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); +int do_kmem_cache_create(struct kmem_cache *s, const char *name, + unsigned int size, struct kmem_cache_args *args, + slab_flags_t flags); void __init kmem_cache_init(void); extern void create_boot_cache(struct kmem_cache *, const char *name, @@ -443,6 +445,13 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s) return (s->flags & SLAB_KMALLOC); } +static inline bool is_kmalloc_normal(struct kmem_cache *s) +{ + if (!is_kmalloc_cache(s)) + return false; + return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); +} + /* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ diff --git a/mm/slab_common.c b/mm/slab_common.c index 40b582a014b8..61f32420230a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -40,11 +40,6 @@ LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; -static LIST_HEAD(slab_caches_to_rcu_destroy); -static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); -static DECLARE_WORK(slab_caches_to_rcu_destroy_work, - slab_caches_to_rcu_destroy_workfn); - /* * Set of flags that will prevent slab merging */ @@ -88,6 +83,19 @@ unsigned int kmem_cache_size(struct kmem_cache *s) EXPORT_SYMBOL(kmem_cache_size); #ifdef CONFIG_DEBUG_VM + +static bool kmem_cache_is_duplicate_name(const char *name) +{ + struct kmem_cache *s; + + list_for_each_entry(s, &slab_caches, list) { + if (!strcmp(s->name, name)) + return true; + } + + return false; +} + static int kmem_cache_sanity_check(const char *name, unsigned int size) { if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) { @@ -95,6 +103,10 @@ static int kmem_cache_sanity_check(const char *name, unsigned int size) return -EINVAL; } + /* Duplicate names will confuse slabtop, et al */ + WARN(kmem_cache_is_duplicate_name(name), + "kmem_cache of name '%s' already exists\n", name); + WARN_ON(strchr(name, ' ')); /* It confuses parsers */ return 0; } @@ -169,14 +181,15 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, if (ctor) return NULL; - size = ALIGN(size, sizeof(void *)); - align = calculate_alignment(flags, align, size); - size = ALIGN(size, align); flags = kmem_cache_flags(flags, name); if (flags & SLAB_NEVER_MERGE) return NULL; + size = ALIGN(size, sizeof(void *)); + align = calculate_alignment(flags, align, size); + size = ALIGN(size, align); + list_for_each_entry_reverse(s, &slab_caches, list) { if (slab_unmergeable(s)) continue; @@ -202,32 +215,29 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, } static struct kmem_cache *create_cache(const char *name, - unsigned int object_size, unsigned int align, - slab_flags_t flags, unsigned int useroffset, - unsigned int usersize, void (*ctor)(void *), - struct kmem_cache *root_cache) + unsigned int object_size, + struct kmem_cache_args *args, + slab_flags_t flags) { struct kmem_cache *s; int err; - if (WARN_ON(useroffset + usersize > object_size)) - useroffset = usersize = 0; + if (WARN_ON(args->useroffset + args->usersize > object_size)) + args->useroffset = args->usersize = 0; + + /* If a custom freelist pointer is requested make sure it's sane. */ + err = -EINVAL; + if (args->use_freeptr_offset && + (args->freeptr_offset >= object_size || + !(flags & SLAB_TYPESAFE_BY_RCU) || + !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t)))) + goto out; err = -ENOMEM; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (!s) goto out; - - s->name = name; - s->size = s->object_size = object_size; - s->align = align; - s->ctor = ctor; -#ifdef CONFIG_HARDENED_USERCOPY - s->useroffset = useroffset; - s->usersize = usersize; -#endif - - err = __kmem_cache_create(s, flags); + err = do_kmem_cache_create(s, name, object_size, args, flags); if (err) goto out_free_cache; @@ -242,39 +252,24 @@ out: } /** - * kmem_cache_create_usercopy - Create a cache with a region suitable - * for copying to userspace + * __kmem_cache_create_args - Create a kmem cache. * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. - * @flags: SLAB flags - * @useroffset: Usercopy region offset - * @usersize: Usercopy region size - * @ctor: A constructor for the objects. - * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. + * @object_size: The size of objects to be created in this cache. + * @args: Additional arguments for the cache creation (see + * &struct kmem_cache_args). + * @flags: See %SLAB_* flags for an explanation of individual @flags. * - * The flags are + * Not to be called directly, use the kmem_cache_create() wrapper with the same + * parameters. * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. + * Context: Cannot be called within a interrupt, but can be interrupted. * * Return: a pointer to the cache on success, NULL on failure. */ -struct kmem_cache * -kmem_cache_create_usercopy(const char *name, - unsigned int size, unsigned int align, - slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)) +struct kmem_cache *__kmem_cache_create_args(const char *name, + unsigned int object_size, + struct kmem_cache_args *args, + slab_flags_t flags) { struct kmem_cache *s = NULL; const char *cache_name; @@ -296,7 +291,7 @@ kmem_cache_create_usercopy(const char *name, mutex_lock(&slab_mutex); - err = kmem_cache_sanity_check(name, size); + err = kmem_cache_sanity_check(name, object_size); if (err) { goto out_unlock; } @@ -317,12 +312,14 @@ kmem_cache_create_usercopy(const char *name, /* Fail closed on bad usersize of useroffset values. */ if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || - WARN_ON(!usersize && useroffset) || - WARN_ON(size < usersize || size - usersize < useroffset)) - usersize = useroffset = 0; - - if (!usersize) - s = __kmem_cache_alias(name, size, align, flags, ctor); + WARN_ON(!args->usersize && args->useroffset) || + WARN_ON(object_size < args->usersize || + object_size - args->usersize < args->useroffset)) + args->usersize = args->useroffset = 0; + + if (!args->usersize) + s = __kmem_cache_alias(name, object_size, args->align, flags, + args->ctor); if (s) goto out_unlock; @@ -332,9 +329,8 @@ kmem_cache_create_usercopy(const char *name, goto out_unlock; } - s = create_cache(cache_name, size, - calculate_alignment(flags, align, size), - flags, useroffset, usersize, ctor, NULL); + args->align = calculate_alignment(flags, args->align, object_size); + s = create_cache(cache_name, object_size, args, flags); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); @@ -356,41 +352,7 @@ out_unlock: } return s; } -EXPORT_SYMBOL(kmem_cache_create_usercopy); - -/** - * kmem_cache_create - Create a cache. - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. - * @flags: SLAB flags - * @ctor: A constructor for the objects. - * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - * - * Return: a pointer to the cache on success, NULL on failure. - */ -struct kmem_cache * -kmem_cache_create(const char *name, unsigned int size, unsigned int align, - slab_flags_t flags, void (*ctor)(void *)) -{ - return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, - ctor); -} -EXPORT_SYMBOL(kmem_cache_create); +EXPORT_SYMBOL(__kmem_cache_create_args); static struct kmem_cache *kmem_buckets_cache __ro_after_init; @@ -478,87 +440,25 @@ kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, fail: for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) kmem_cache_destroy((*b)[idx]); - kfree(b); + kmem_cache_free(kmem_buckets_cache, b); return NULL; } EXPORT_SYMBOL(kmem_buckets_create); -#ifdef SLAB_SUPPORTS_SYSFS /* * For a given kmem_cache, kmem_cache_destroy() should only be called * once or there will be a use-after-free problem. The actual deletion * and release of the kobject does not need slab_mutex or cpu_hotplug_lock * protection. So they are now done without holding those locks. - * - * Note that there will be a slight delay in the deletion of sysfs files - * if kmem_cache_release() is called indrectly from a work function. */ static void kmem_cache_release(struct kmem_cache *s) { - if (slab_state >= FULL) { - sysfs_slab_unlink(s); + kfence_shutdown_cache(s); + if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL) sysfs_slab_release(s); - } else { + else slab_kmem_cache_release(s); - } -} -#else -static void kmem_cache_release(struct kmem_cache *s) -{ - slab_kmem_cache_release(s); -} -#endif - -static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) -{ - LIST_HEAD(to_destroy); - struct kmem_cache *s, *s2; - - /* - * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the - * @slab_caches_to_rcu_destroy list. The slab pages are freed - * through RCU and the associated kmem_cache are dereferenced - * while freeing the pages, so the kmem_caches should be freed only - * after the pending RCU operations are finished. As rcu_barrier() - * is a pretty slow operation, we batch all pending destructions - * asynchronously. - */ - mutex_lock(&slab_mutex); - list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy); - mutex_unlock(&slab_mutex); - - if (list_empty(&to_destroy)) - return; - - rcu_barrier(); - - list_for_each_entry_safe(s, s2, &to_destroy, list) { - debugfs_slab_release(s); - kfence_shutdown_cache(s); - kmem_cache_release(s); - } -} - -static int shutdown_cache(struct kmem_cache *s) -{ - /* free asan quarantined objects */ - kasan_cache_shutdown(s); - - if (__kmem_cache_shutdown(s) != 0) - return -EBUSY; - - list_del(&s->list); - - if (s->flags & SLAB_TYPESAFE_BY_RCU) { - list_add_tail(&s->list, &slab_caches_to_rcu_destroy); - schedule_work(&slab_caches_to_rcu_destroy_work); - } else { - kfence_shutdown_cache(s); - debugfs_slab_release(s); - } - - return 0; } void slab_kmem_cache_release(struct kmem_cache *s) @@ -570,29 +470,63 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - int err = -EBUSY; - bool rcu_set; + int err; if (unlikely(!s) || !kasan_check_byte(s)) return; + /* in-flight kfree_rcu()'s may include objects from our cache */ + kvfree_rcu_barrier(); + + if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) && + (s->flags & SLAB_TYPESAFE_BY_RCU)) { + /* + * Under CONFIG_SLUB_RCU_DEBUG, when objects in a + * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally + * defer their freeing with call_rcu(). + * Wait for such call_rcu() invocations here before actually + * destroying the cache. + * + * It doesn't matter that we haven't looked at the slab refcount + * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so + * the refcount should be 1 here. + */ + rcu_barrier(); + } + cpus_read_lock(); mutex_lock(&slab_mutex); - rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; - s->refcount--; - if (s->refcount) - goto out_unlock; + if (s->refcount) { + mutex_unlock(&slab_mutex); + cpus_read_unlock(); + return; + } + + /* free asan quarantined objects */ + kasan_cache_shutdown(s); - err = shutdown_cache(s); + err = __kmem_cache_shutdown(s); WARN(err, "%s %s: Slab cache still has objects when called from %pS", __func__, s->name, (void *)_RET_IP_); -out_unlock: + + list_del(&s->list); + mutex_unlock(&slab_mutex); cpus_read_unlock(); - if (!err && !rcu_set) - kmem_cache_release(s); + + if (slab_state >= FULL) + sysfs_slab_unlink(s); + debugfs_slab_release(s); + + if (err) + return; + + if (s->flags & SLAB_TYPESAFE_BY_RCU) + rcu_barrier(); + + kmem_cache_release(s); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -704,9 +638,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, { int err; unsigned int align = ARCH_KMALLOC_MINALIGN; - - s->name = name; - s->size = s->object_size = size; + struct kmem_cache_args kmem_args = {}; /* * kmalloc caches guarantee alignment of at least the largest @@ -715,14 +647,14 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, */ if (flags & SLAB_KMALLOC) align = max(align, 1U << (ffs(size) - 1)); - s->align = calculate_alignment(flags, align, size); + kmem_args.align = calculate_alignment(flags, align, size); #ifdef CONFIG_HARDENED_USERCOPY - s->useroffset = useroffset; - s->usersize = usersize; + kmem_args.useroffset = useroffset; + kmem_args.usersize = usersize; #endif - err = __kmem_cache_create(s, flags); + err = do_kmem_cache_create(s, name, size, &kmem_args, flags); if (err) panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n", diff --git a/mm/slub.c b/mm/slub.c index c9d8a2497fd6..21f71cb6cc06 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -466,12 +466,6 @@ static struct workqueue_struct *flushwq; *******************************************************************/ /* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - -/* * Returns freelist pointer (ptr). With hardening, this is obfuscated * with an XOR of the address where the pointer is held and a per-cache * random number. @@ -756,6 +750,50 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, return false; } +/* + * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API + * family will round up the real request size to these fixed ones, so + * there could be an extra area than what is requested. Save the original + * request size in the meta data area, for better debug and sanity check. + */ +static inline void set_orig_size(struct kmem_cache *s, + void *object, unsigned int orig_size) +{ + void *p = kasan_reset_tag(object); + unsigned int kasan_meta_size; + + if (!slub_debug_orig_size(s)) + return; + + /* + * KASAN can save its free meta data inside of the object at offset 0. + * If this meta data size is larger than 'orig_size', it will overlap + * the data redzone in [orig_size+1, object_size]. Thus, we adjust + * 'orig_size' to be as at least as big as KASAN's meta data. + */ + kasan_meta_size = kasan_metadata_size(s, true); + if (kasan_meta_size > orig_size) + orig_size = kasan_meta_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + *(unsigned int *)p = orig_size; +} + +static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return s->object_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + return *(unsigned int *)p; +} + #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); @@ -985,50 +1023,6 @@ static void print_slab_info(const struct slab *slab) &slab->__page_flags); } -/* - * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API - * family will round up the real request size to these fixed ones, so - * there could be an extra area than what is requested. Save the original - * request size in the meta data area, for better debug and sanity check. - */ -static inline void set_orig_size(struct kmem_cache *s, - void *object, unsigned int orig_size) -{ - void *p = kasan_reset_tag(object); - unsigned int kasan_meta_size; - - if (!slub_debug_orig_size(s)) - return; - - /* - * KASAN can save its free meta data inside of the object at offset 0. - * If this meta data size is larger than 'orig_size', it will overlap - * the data redzone in [orig_size+1, object_size]. Thus, we adjust - * 'orig_size' to be as at least as big as KASAN's meta data. - */ - kasan_meta_size = kasan_metadata_size(s, true); - if (kasan_meta_size > orig_size) - orig_size = kasan_meta_size; - - p += get_info_end(s); - p += sizeof(struct track) * 2; - - *(unsigned int *)p = orig_size; -} - -static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) -{ - void *p = kasan_reset_tag(object); - - if (!slub_debug_orig_size(s)) - return s->object_size; - - p += get_info_end(s); - p += sizeof(struct track) * 2; - - return *(unsigned int *)p; -} - void skip_orig_size_check(struct kmem_cache *s, const void *object) { set_orig_size(s, (void *)object, s->object_size); @@ -1894,7 +1888,6 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} - #ifndef CONFIG_SLUB_TINY static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) @@ -2116,6 +2109,10 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, if (!mem_alloc_profiling_enabled()) return; + /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */ + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return; + obj_exts = slab_obj_exts(slab); if (!obj_exts) return; @@ -2185,6 +2182,45 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, __memcg_slab_free_hook(s, slab, p, objects, obj_exts); } + +static __fastpath_inline +bool memcg_slab_post_charge(void *p, gfp_t flags) +{ + struct slabobj_ext *slab_exts; + struct kmem_cache *s; + struct folio *folio; + struct slab *slab; + unsigned long off; + + folio = virt_to_folio(p); + if (!folio_test_slab(folio)) { + return folio_memcg_kmem(folio) || + (__memcg_kmem_charge_page(folio_page(folio, 0), flags, + folio_order(folio)) == 0); + } + + slab = folio_slab(folio); + s = slab->slab_cache; + + /* + * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency + * of slab_obj_exts being allocated from the same slab and thus the slab + * becoming effectively unfreeable. + */ + if (is_kmalloc_normal(s)) + return true; + + /* Ignore already charged objects. */ + slab_exts = slab_obj_exts(slab); + if (slab_exts) { + off = obj_to_index(s, slab, p); + if (unlikely(slab_exts[off].objcg)) + return true; + } + + return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p); +} + #else /* CONFIG_MEMCG */ static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, @@ -2198,18 +2234,37 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { } + +static inline bool memcg_slab_post_charge(void *p, gfp_t flags) +{ + return true; +} #endif /* CONFIG_MEMCG */ +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head); + +struct rcu_delayed_free { + struct rcu_head head; + void *object; +}; +#endif + /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. * * Returns true if freeing of the object can proceed, false if its reuse - * was delayed by KASAN quarantine, or it was returned to KFENCE. + * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned + * to KFENCE. */ static __always_inline -bool slab_free_hook(struct kmem_cache *s, void *x, bool init) +bool slab_free_hook(struct kmem_cache *s, void *x, bool init, + bool after_rcu_delay) { + /* Are the object contents still accessible? */ + bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay; + kmemleak_free_recursive(x, s->flags); kmsan_slab_free(s, x); @@ -2219,7 +2274,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) debug_check_no_obj_freed(x, s->object_size); /* Use KCSAN to help debug racy use-after-free. */ - if (!(s->flags & SLAB_TYPESAFE_BY_RCU)) + if (!still_accessible) __kcsan_check_access(x, s->object_size, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); @@ -2227,6 +2282,35 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) return false; /* + * Give KASAN a chance to notice an invalid free operation before we + * modify the object. + */ + if (kasan_slab_pre_free(s, x)) + return false; + +#ifdef CONFIG_SLUB_RCU_DEBUG + if (still_accessible) { + struct rcu_delayed_free *delayed_free; + + delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT); + if (delayed_free) { + /* + * Let KASAN track our call stack as a "related work + * creation", just like if the object had been freed + * normally via kfree_rcu(). + * We have to do this manually because the rcu_head is + * not located inside the object. + */ + kasan_record_aux_stack_noalloc(x); + + delayed_free->object = x; + call_rcu(&delayed_free->head, slab_free_after_rcu_debug); + return false; + } + } +#endif /* CONFIG_SLUB_RCU_DEBUG */ + + /* * As memory initialization might be integrated into KASAN, * kasan_slab_free and initialization memset's must be * kept together to avoid discrepancies in behavior. @@ -2239,17 +2323,24 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) */ if (unlikely(init)) { int rsize; - unsigned int inuse; + unsigned int inuse, orig_size; inuse = get_info_end(s); + orig_size = get_orig_size(s, x); if (!kasan_has_integrated_init()) - memset(kasan_reset_tag(x), 0, s->object_size); + memset(kasan_reset_tag(x), 0, orig_size); rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; memset((char *)kasan_reset_tag(x) + inuse, 0, s->size - inuse - rsize); + /* + * Restore orig_size, otherwize kmalloc redzone overwritten + * would be reported + */ + set_orig_size(s, x, orig_size); + } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return !kasan_slab_free(s, x, init); + return !kasan_slab_free(s, x, init, still_accessible); } static __fastpath_inline @@ -2263,7 +2354,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, bool init; if (is_kfence_address(next)) { - slab_free_hook(s, next, false); + slab_free_hook(s, next, false, false); return false; } @@ -2278,7 +2369,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (likely(slab_free_hook(s, object, init))) { + if (likely(slab_free_hook(s, object, init, false))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -2318,7 +2409,11 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, struct slab *slab; unsigned int order = oo_order(oo); - folio = (struct folio *)alloc_pages_node(node, flags, order); + if (node == NUMA_NO_NODE) + folio = (struct folio *)alloc_pages(flags, order); + else + folio = (struct folio *)__alloc_pages_node(node, flags, order); + if (!folio) return NULL; @@ -3416,14 +3511,15 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + int cpu = raw_smp_processor_id(); int node; struct kmem_cache_node *n; if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; - pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", - nid, gfpflags, &gfpflags); + pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n", + cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags); pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); @@ -3921,6 +4017,8 @@ static void *__slab_alloc_node(struct kmem_cache *s, /* * If the object has been wiped upon free, make sure it's fully initialized by * zeroing out freelist pointer. + * + * Note that we also wipe custom freelist pointers. */ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, void *obj) @@ -4062,6 +4160,15 @@ void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, } EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); +bool kmem_cache_charge(void *objp, gfp_t gfpflags) +{ + if (!memcg_kmem_online()) + return true; + + return memcg_slab_post_charge(objp, gfpflags); +} +EXPORT_SYMBOL(kmem_cache_charge); + /** * kmem_cache_alloc_node - Allocate an object on the specified node * @s: The cache to allocate from. @@ -4470,7 +4577,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, memcg_slab_free_hook(s, slab, &object, 1); alloc_tagging_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) do_slab_free(s, slab, object, object, 1, addr); } @@ -4479,7 +4586,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, static noinline void memcg_alloc_abort_single(struct kmem_cache *s, void *object) { - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); } #endif @@ -4498,6 +4605,33 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, do_slab_free(s, slab, head, tail, cnt, addr); } +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) +{ + struct rcu_delayed_free *delayed_free = + container_of(rcu_head, struct rcu_delayed_free, head); + void *object = delayed_free->object; + struct slab *slab = virt_to_slab(object); + struct kmem_cache *s; + + kfree(delayed_free); + + if (WARN_ON(is_kfence_address(object))) + return; + + /* find the object and the cache again */ + if (WARN_ON(!slab)) + return; + s = slab->slab_cache; + if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU))) + return; + + /* resume freeing */ + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) + do_slab_free(s, slab, object, object, 1, _THIS_IP_); +} +#endif /* CONFIG_SLUB_RCU_DEBUG */ + #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { @@ -5148,7 +5282,7 @@ static void set_cpu_partial(struct kmem_cache *s) * calculate_sizes() determines the order and the distribution of data within * a slab object. */ -static int calculate_sizes(struct kmem_cache *s) +static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; @@ -5189,7 +5323,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor || + if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || + (flags & SLAB_POISON) || s->ctor || ((flags & SLAB_RED_ZONE) && (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { /* @@ -5210,6 +5345,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->offset = size; size += sizeof(void *); + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { + s->offset = args->freeptr_offset; } else { /* * Store freelist pointer near middle of object to keep @@ -5284,65 +5421,6 @@ static int calculate_sizes(struct kmem_cache *s) return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) -{ - s->flags = kmem_cache_flags(flags, s->name); -#ifdef CONFIG_SLAB_FREELIST_HARDENED - s->random = get_random_long(); -#endif - - if (!calculate_sizes(s)) - goto error; - if (disable_higher_order_debug) { - /* - * Disable debugging flags that store metadata if the min slab - * order increased. - */ - if (get_order(s->size) > get_order(s->object_size)) { - s->flags &= ~DEBUG_METADATA_FLAGS; - s->offset = 0; - if (!calculate_sizes(s)) - goto error; - } - } - -#ifdef system_has_freelist_aba - if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { - /* Enable fast mode */ - s->flags |= __CMPXCHG_DOUBLE; - } -#endif - - /* - * The larger the object size is, the more slabs we want on the partial - * list to avoid pounding the page allocator excessively. - */ - s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); - s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); - - set_cpu_partial(s); - -#ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 1000; -#endif - - /* Initialize the pre-computed randomized freelist if slab is up */ - if (slab_state >= UP) { - if (init_cache_random_seq(s)) - goto error; - } - - if (!init_kmem_cache_nodes(s)) - goto error; - - if (alloc_kmem_cache_cpus(s)) - return 0; - -error: - __kmem_cache_release(s); - return -EINVAL; -} - static void list_slab_objects(struct kmem_cache *s, struct slab *slab, const char *text) { @@ -5896,28 +5974,90 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, return s; } -int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) +int do_kmem_cache_create(struct kmem_cache *s, const char *name, + unsigned int size, struct kmem_cache_args *args, + slab_flags_t flags) { - int err; + int err = -EINVAL; - err = kmem_cache_open(s, flags); - if (err) - return err; + s->name = name; + s->size = s->object_size = size; + + s->flags = kmem_cache_flags(flags, s->name); +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif + s->align = args->align; + s->ctor = args->ctor; +#ifdef CONFIG_HARDENED_USERCOPY + s->useroffset = args->useroffset; + s->usersize = args->usersize; +#endif + + if (!calculate_sizes(args, s)) + goto out; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->object_size)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(args, s)) + goto out; + } + } + +#ifdef system_has_freelist_aba + if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; + } +#endif + + /* + * The larger the object size is, the more slabs we want on the partial + * list to avoid pounding the page allocator excessively. + */ + s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); + s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); + + set_cpu_partial(s); + +#ifdef CONFIG_NUMA + s->remote_node_defrag_ratio = 1000; +#endif + + /* Initialize the pre-computed randomized freelist if slab is up */ + if (slab_state >= UP) { + if (init_cache_random_seq(s)) + goto out; + } + + if (!init_kmem_cache_nodes(s)) + goto out; + + if (!alloc_kmem_cache_cpus(s)) + goto out; /* Mutex is not taken during early boot */ - if (slab_state <= UP) - return 0; + if (slab_state <= UP) { + err = 0; + goto out; + } err = sysfs_slab_add(s); - if (err) { - __kmem_cache_release(s); - return err; - } + if (err) + goto out; if (s->flags & SLAB_STORE_USER) debugfs_slab_add(s); - return 0; +out: + if (err) + __kmem_cache_release(s); + return err; } #ifdef SLAB_SUPPORTS_SYSFS diff --git a/mm/sparse.c b/mm/sparse.c index 0f018c6f9ec5..dc38539f8560 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section) static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, unsigned long *end_pfn) { - unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); + unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT; /* * Sanity checks - do not allow an architecture to pass diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e54e5c8907fa..acc56c75ba99 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -787,27 +787,30 @@ retry: } dst_pmdval = pmdp_get_lockless(dst_pmd); - /* - * If the dst_pmd is mapped as THP don't - * override it and just be strict. - */ - if (unlikely(pmd_trans_huge(dst_pmdval))) { - err = -EEXIST; - break; - } if (unlikely(pmd_none(dst_pmdval)) && unlikely(__pte_alloc(dst_mm, dst_pmd))) { err = -ENOMEM; break; } - /* If an huge pmd materialized from under us fail */ - if (unlikely(pmd_trans_huge(*dst_pmd))) { + dst_pmdval = pmdp_get_lockless(dst_pmd); + /* + * If the dst_pmd is THP don't override it and just be strict. + * (This includes the case where the PMD used to be THP and + * changed back to none after __pte_alloc().) + */ + if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) || + pmd_devmap(dst_pmdval))) { + err = -EEXIST; + break; + } + if (unlikely(pmd_bad(dst_pmdval))) { err = -EFAULT; break; } - - BUG_ON(pmd_none(*dst_pmd)); - BUG_ON(pmd_trans_huge(*dst_pmd)); + /* + * For shmem mappings, khugepaged is allowed to remove page + * tables under us; pte_offset_map_lock() will deal with that. + */ err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, flags, &folio); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af2de36549d6..a0df1e2e155a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2191,6 +2191,7 @@ static void purge_vmap_node(struct work_struct *work) { struct vmap_node *vn = container_of(work, struct vmap_node, purge_work); + unsigned long nr_purged_pages = 0; struct vmap_area *va, *n_va; LIST_HEAD(local_list); @@ -2208,7 +2209,7 @@ static void purge_vmap_node(struct work_struct *work) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); - atomic_long_sub(nr, &vmap_lazy_nr); + nr_purged_pages += nr; vn->nr_purged++; if (is_vn_id_valid(vn_id) && !vn->skip_populate) @@ -2219,6 +2220,8 @@ static void purge_vmap_node(struct work_struct *work) list_add(&va->list, &local_list); } + atomic_long_sub(nr_purged_pages, &vmap_lazy_nr); + reclaim_list_global(&local_list); } @@ -2626,6 +2629,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) vb->dirty_max = 0; bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); + vb->cpu = raw_smp_processor_id(); xa = addr_to_vb_xa(va->va_start); vb_idx = addr_to_vb_idx(va->va_start); @@ -2642,7 +2646,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) * integrity together with list_for_each_rcu from read * side. */ - vb->cpu = raw_smp_processor_id(); vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu); spin_lock(&vbq->lock); list_add_tail_rcu(&vb->free_list, &vbq->free); diff --git a/mm/vmscan.c b/mm/vmscan.c index cfa839284b92..bd489c1af228 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1604,25 +1604,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } -#ifdef CONFIG_CMA -/* - * It is waste of effort to scan and reclaim CMA pages if it is not available - * for current allocation context. Kswapd can not be enrolled as it can not - * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL - */ -static bool skip_cma(struct folio *folio, struct scan_control *sc) -{ - return !current_is_kswapd() && - gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE && - folio_migratetype(folio) == MIGRATE_CMA; -} -#else -static bool skip_cma(struct folio *folio, struct scan_control *sc) -{ - return false; -} -#endif - /* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * @@ -1669,8 +1650,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, nr_pages = folio_nr_pages(folio); total_scan += nr_pages; - if (folio_zonenum(folio) > sc->reclaim_idx || - skip_cma(folio, sc)) { + if (folio_zonenum(folio) > sc->reclaim_idx) { nr_skipped[folio_zonenum(folio)] += nr_pages; move_to = &folios_skipped; goto move; @@ -4320,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* ineligible */ - if (zone > sc->reclaim_idx || skip_cma(folio, sc)) { + if (zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; |