Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig.debug       32
-rw-r--r--  mm/filemap.c           25
-rw-r--r--  mm/gup.c               13
-rw-r--r--  mm/kasan/common.c      62
-rw-r--r--  mm/kasan/kasan_test.c  46
-rw-r--r--  mm/memcontrol.c        12
-rw-r--r--  mm/memory.c            27
-rw-r--r--  mm/memory_hotplug.c     2
-rw-r--r--  mm/mmap.c               2
-rw-r--r--  mm/mseal.c              2
-rw-r--r--  mm/page_alloc.c         7
-rw-r--r--  mm/shmem.c             11
-rw-r--r--  mm/slab.h              11
-rw-r--r--  mm/slab_common.c      292
-rw-r--r--  mm/slub.c             414
-rw-r--r--  mm/sparse.c             2
-rw-r--r--  mm/userfaultfd.c       29
-rw-r--r--  mm/vmalloc.c            7
-rw-r--r--  mm/vmscan.c            24
19 files changed, 612 insertions, 408 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index afc72fde0f03..41a58536531d 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -70,6 +70,38 @@ config SLUB_DEBUG_ON
off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
"slab_debug=-".
+config SLUB_RCU_DEBUG
+ bool "Enable UAF detection in TYPESAFE_BY_RCU caches (for KASAN)"
+ depends on SLUB_DEBUG
+ # SLUB_RCU_DEBUG should build fine without KASAN, but is currently useless
+ # without KASAN, so mark it as a dependency of KASAN for now.
+ depends on KASAN
+ default KASAN_GENERIC || KASAN_SW_TAGS
+ help
+ Make SLAB_TYPESAFE_BY_RCU caches behave approximately as if the cache
+ was not marked as SLAB_TYPESAFE_BY_RCU and every caller used
+ kfree_rcu() instead.
+
+ This is intended for use in combination with KASAN, to enable KASAN to
+ detect use-after-free accesses in such caches.
+ (KFENCE is able to do that independent of this flag.)
+
+ This might degrade performance.
+ Unfortunately this also prevents a very specific bug pattern from
+ triggering (insufficient checks against an object being recycled
+ within the RCU grace period); so this option can be turned off even on
+ KASAN builds, in case you want to test for such a bug.
+
+ If you're using this for testing bugs / fuzzing and care about
+ catching all the bugs WAY more than performance, you might want to
+ also turn on CONFIG_RCU_STRICT_GRACE_PERIOD.
+
+ WARNING:
+ This is designed as a debugging feature, not a security feature.
+ Objects are sometimes recycled without RCU delay under memory pressure.
+
+ If unsure, say N.
+
config PAGE_OWNER
bool "Track page owner"
depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
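
As background (not part of the patch), a minimal sketch of the kind of SLAB_TYPESAFE_BY_RCU lookup this option is meant to stress: an object may be freed and recycled while readers still hold a pointer to it, so readers must take a reference and revalidate the object afterwards. The struct, the xarray table and the conn_put() helper here are hypothetical.

struct conn {
	refcount_t ref;
	u32 key;
};

static struct conn *conn_lookup(struct xarray *table, u32 key)
{
	struct conn *c;

	rcu_read_lock();
	c = xa_load(table, key);
	if (c && !refcount_inc_not_zero(&c->ref))
		c = NULL;			/* already on its way back to the slab */
	rcu_read_unlock();

	if (c && READ_ONCE(c->key) != key) {
		conn_put(c);			/* hypothetical reference drop */
		c = NULL;			/* object was recycled under us */
	}
	return c;
}

With SLUB_RCU_DEBUG, a freed object is only invalidated after an RCU grace period, so KASAN can flag readers that keep using it past the grace period; as the help text notes, it also keeps objects from being recycled within the grace period, which hides bugs in the revalidation step itself.
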
diff --git a/mm/filemap.c b/mm/filemap.c
index d62150418b91..60a9cc593e9b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2712,14 +2712,12 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
}
EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
-int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+int filemap_invalidate_pages(struct address_space *mapping,
+ loff_t pos, loff_t end, bool nowait)
{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- loff_t pos = iocb->ki_pos;
- loff_t end = pos + count - 1;
int ret;
- if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (nowait) {
/* we could block if there are any pages in the range */
if (filemap_range_has_page(mapping, pos, end))
return -EAGAIN;
@@ -2738,6 +2736,15 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
+
+int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+ return filemap_invalidate_pages(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1,
+ iocb->ki_flags & IOCB_NOWAIT);
+}
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
/**
@@ -3987,7 +3994,6 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
ssize_t written = 0;
do {
- struct page *page;
struct folio *folio;
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
@@ -4017,11 +4023,10 @@ retry:
}
status = a_ops->write_begin(file, mapping, pos, bytes,
- &page, &fsdata);
+ &folio, &fsdata);
if (unlikely(status < 0))
break;
- folio = page_folio(page);
offset = offset_in_folio(folio, pos);
if (bytes > folio_size(folio) - offset)
bytes = folio_size(folio) - offset;
@@ -4033,7 +4038,7 @@ retry:
flush_dcache_folio(folio);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
- page, fsdata);
+ folio, fsdata);
if (unlikely(status != copied)) {
iov_iter_revert(i, copied - max(status, 0L));
if (unlikely(status < 0))
@@ -4231,7 +4236,7 @@ int filemap_invalidate_inode(struct inode *inode, bool flush,
}
/* Wait for writeback to complete on all folios and discard. */
- truncate_inode_pages_range(mapping, start, end);
+ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);
unlock:
filemap_invalidate_unlock(mapping);
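
The split above lets code that has no struct kiocb invalidate a byte range directly. A hypothetical caller (names invented for illustration) would look like:

static int drop_cached_range(struct file *file, loff_t pos, size_t count,
			     bool nowait)
{
	struct address_space *mapping = file->f_mapping;

	/* end is inclusive, matching the kiocb wrapper above */
	return filemap_invalidate_pages(mapping, pos, pos + count - 1, nowait);
}
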
diff --git a/mm/gup.c b/mm/gup.c
index 54d0dc3831fb..02c46ae33028 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -416,6 +416,19 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
EXPORT_SYMBOL(unpin_user_pages);
/**
+ * unpin_user_folio() - release pages of a folio
+ * @folio: pointer to folio to be released
+ * @npages: number of pages of the same folio
+ *
+ * Release @npages pages of the folio.
+ */
+void unpin_user_folio(struct folio *folio, unsigned long npages)
+{
+ gup_put_folio(folio, npages, FOLL_PIN);
+}
+EXPORT_SYMBOL(unpin_user_folio);
+
+/**
* unpin_folios() - release an array of gup-pinned folios.
* @folios: array of folios to be marked dirty and released.
* @nfolios: number of folios in the @folios array.
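
A short illustration (not from this patch) of the new helper: a caller that took FOLL_PIN references on every page of a folio can drop them in one call instead of looping over unpin_user_page().

static void put_pinned_folio(struct folio *folio)
{
	/* drop one pin per page of the folio */
	unpin_user_folio(folio, folio_nr_pages(folio));
}
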
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 85e7c6b4575c..ed4873e18c75 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -208,15 +208,12 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
return (void *)object;
}
-static inline bool poison_slab_object(struct kmem_cache *cache, void *object,
- unsigned long ip, bool init)
+/* Returns true when freeing the object is not safe. */
+static bool check_slab_allocation(struct kmem_cache *cache, void *object,
+ unsigned long ip)
{
- void *tagged_object;
-
- if (!kasan_arch_is_ready())
- return false;
+ void *tagged_object = object;
- tagged_object = object;
object = kasan_reset_tag(object);
if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != object)) {
@@ -224,37 +221,47 @@ static inline bool poison_slab_object(struct kmem_cache *cache, void *object,
return true;
}
- /* RCU slabs could be legally used after free within the RCU period. */
- if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
- return false;
-
if (!kasan_byte_accessible(tagged_object)) {
kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE);
return true;
}
+ return false;
+}
+
+static inline void poison_slab_object(struct kmem_cache *cache, void *object,
+ bool init, bool still_accessible)
+{
+ void *tagged_object = object;
+
+ object = kasan_reset_tag(object);
+
+ /* RCU slabs could be legally used after free within the RCU period. */
+ if (unlikely(still_accessible))
+ return;
+
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
KASAN_SLAB_FREE, init);
if (kasan_stack_collection_enabled())
kasan_save_free_info(cache, tagged_object);
+}
- return false;
+bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object,
+ unsigned long ip)
+{
+ if (!kasan_arch_is_ready() || is_kfence_address(object))
+ return false;
+ return check_slab_allocation(cache, object, ip);
}
-bool __kasan_slab_free(struct kmem_cache *cache, void *object,
- unsigned long ip, bool init)
+bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
+ bool still_accessible)
{
- if (is_kfence_address(object))
+ if (!kasan_arch_is_ready() || is_kfence_address(object))
return false;
- /*
- * If the object is buggy, do not let slab put the object onto the
- * freelist. The object will thus never be allocated again and its
- * metadata will never get released.
- */
- if (poison_slab_object(cache, object, ip, init))
- return true;
+ poison_slab_object(cache, object, init, still_accessible);
/*
* If the object is put into quarantine, do not let slab put the object
@@ -504,11 +511,16 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
return true;
}
- if (is_kfence_address(ptr))
- return false;
+ if (is_kfence_address(ptr) || !kasan_arch_is_ready())
+ return true;
slab = folio_slab(folio);
- return !poison_slab_object(slab->slab_cache, ptr, ip, false);
+
+ if (check_slab_allocation(slab->slab_cache, ptr, ip))
+ return false;
+
+ poison_slab_object(slab->slab_cache, ptr, false, false);
+ return true;
}
void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip)
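
The restructuring above turns object freeing into a two-phase protocol: report invalid or double frees before the allocator touches the object, then poison it unless RCU readers may still legally access it. A simplified sketch of how a free path uses the wrappers, mirroring the slub.c changes later in this diff (the function name is illustrative):

static bool my_free_hook(struct kmem_cache *s, void *obj, bool init,
			 bool still_accessible)
{
	/* Phase 1: catch invalid/double frees before modifying the object. */
	if (kasan_slab_pre_free(s, obj))
		return false;	/* keep the object off the freelist */

	/*
	 * Phase 2: poison and possibly quarantine. Poisoning is skipped
	 * while the object is still reachable by RCU readers.
	 */
	return !kasan_slab_free(s, obj, init, still_accessible);
}
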
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index 7b32be2a3cf0..567d33b493e2 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -996,6 +996,51 @@ static void kmem_cache_invalid_free(struct kunit *test)
kmem_cache_destroy(cache);
}
+static void kmem_cache_rcu_uaf(struct kunit *test)
+{
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB_RCU_DEBUG);
+
+ cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU,
+ NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+ *p = 1;
+
+ rcu_read_lock();
+
+ /* Free the object - this will internally schedule an RCU callback. */
+ kmem_cache_free(cache, p);
+
+ /*
+ * We should still be allowed to access the object at this point because
+ * the cache is SLAB_TYPESAFE_BY_RCU and we've been in an RCU read-side
+ * critical section since before the kmem_cache_free().
+ */
+ READ_ONCE(*p);
+
+ rcu_read_unlock();
+
+ /*
+ * Wait for the RCU callback to execute; after this, the object should
+ * have actually been freed from KASAN's perspective.
+ */
+ rcu_barrier();
+
+ KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*p));
+
+ kmem_cache_destroy(cache);
+}
+
static void empty_cache_ctor(void *object) { }
static void kmem_cache_double_destroy(struct kunit *test)
@@ -1937,6 +1982,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kmem_cache_oob),
KUNIT_CASE(kmem_cache_double_free),
KUNIT_CASE(kmem_cache_invalid_free),
+ KUNIT_CASE(kmem_cache_rcu_uaf),
KUNIT_CASE(kmem_cache_double_destroy),
KUNIT_CASE(kmem_cache_accounted),
KUNIT_CASE(kmem_cache_bulk),
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f29157288b7d..d563fb515766 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3613,8 +3613,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
memcg1_soft_limit_reset(memcg);
#ifdef CONFIG_ZSWAP
memcg->zswap_max = PAGE_COUNTER_MAX;
- WRITE_ONCE(memcg->zswap_writeback,
- !parent || READ_ONCE(parent->zswap_writeback));
+ WRITE_ONCE(memcg->zswap_writeback, true);
#endif
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
@@ -5320,7 +5319,14 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
{
/* if zswap is disabled, do not block pages going to the swapping device */
- return !zswap_is_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback);
+ if (!zswap_is_enabled())
+ return true;
+
+ for (; memcg; memcg = parent_mem_cgroup(memcg))
+ if (!READ_ONCE(memcg->zswap_writeback))
+ return false;
+
+ return true;
}
static u64 zswap_current_read(struct cgroup_subsys_state *css,
diff --git a/mm/memory.c b/mm/memory.c
index 3c01d68065be..ebfc9768f801 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2632,11 +2632,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
-/*
- * Variant of remap_pfn_range that does not call track_pfn_remap. The caller
- * must have pre-validated the caching bits of the pgprot_t.
- */
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
pgd_t *pgd;
@@ -2689,6 +2685,27 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
return 0;
}
+/*
+ * Variant of remap_pfn_range that does not call track_pfn_remap. The caller
+ * must have pre-validated the caching bits of the pgprot_t.
+ */
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
+
+ if (!error)
+ return 0;
+
+ /*
+ * A partial pfn range mapping is dangerous: it does not
+ * maintain page reference counts, and callers may free
+ * pages due to the error. So zap it early.
+ */
+ zap_page_range_single(vma, addr, size, NULL);
+ return error;
+}
+
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
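
What the zap-on-error change buys callers: a driver ->mmap() handler no longer has to undo a half-populated PFN mapping when remap_pfn_range() (which calls the _notrack variant changed above) fails, so the error path can simply release the backing memory. The driver structure and base_pfn field below are hypothetical.

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mydrv *drv = file->private_data;
	unsigned long size = vma->vm_end - vma->vm_start;

	/* On failure, any partially created mapping is zapped internally. */
	return remap_pfn_range(vma, vma->vm_start, drv->base_pfn, size,
			       vma->vm_page_prot);
}
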
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 66267c26ca1b..951878ab627a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void)
struct range mhp_get_pluggable_range(bool need_mapping)
{
- const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+ const u64 max_phys = PHYSMEM_END;
struct range mhp_range;
if (need_mapping) {
diff --git a/mm/mmap.c b/mm/mmap.c
index d0dfc85b209b..6ddb278a5ee8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1229,7 +1229,7 @@ static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
return MAX_LFS_FILESIZE;
/* Special "we do even unsigned file positions" case */
- if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+ if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)
return 0;
/* Yes, random drivers might want more. But I'm tired of buggy drivers */
diff --git a/mm/mseal.c b/mm/mseal.c
index 15bba28acc00..c8787cc6ba55 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -256,7 +256,7 @@ static int apply_mm_seal(unsigned long start, unsigned long end)
*
* unseal() is not supported.
*/
-static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
+int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
size_t len;
int ret = 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c565de8f48e9..91ace8ca97e2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1054,6 +1054,13 @@ __always_inline bool free_pages_prepare(struct page *page,
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
+
+ /*
+ * The page is isolated and accounted for.
+ * Mark the codetag as empty to avoid accounting error
+ * when the page is freed by unpoison_memory().
+ */
+ clear_page_tag_ref(page);
return false;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 5a77acf6ac6a..b875852df51f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2878,7 +2878,7 @@ static const struct inode_operations shmem_short_symlink_operations;
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2899,23 +2899,22 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
if (ret)
return ret;
- *pagep = folio_file_page(folio, index);
- if (PageHWPoison(*pagep)) {
+ if (folio_test_hwpoison(folio) ||
+ (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
folio_unlock(folio);
folio_put(folio);
- *pagep = NULL;
return -EIO;
}
+ *foliop = folio;
return 0;
}
static int
shmem_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
if (pos + copied > inode->i_size)
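
For filesystems following the same conversion, ->write_begin() now hands back a locked folio rather than a page. A minimal sketch of the new shape (illustrative only; real implementations also deal with partial folios, fsdata and filesystem-specific setup):

static int my_write_begin(struct file *file, struct address_space *mapping,
			  loff_t pos, unsigned len,
			  struct folio **foliop, void **fsdata)
{
	struct folio *folio;

	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
				    FGP_WRITEBEGIN, mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	*foliop = folio;
	return 0;
}
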
diff --git a/mm/slab.h b/mm/slab.h
index dcdb56b8e7f5..f22fb760b286 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -422,7 +422,9 @@ kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller)
gfp_t kmalloc_fix_flags(gfp_t flags);
/* Functions provided by the slab allocators */
-int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
+int do_kmem_cache_create(struct kmem_cache *s, const char *name,
+ unsigned int size, struct kmem_cache_args *args,
+ slab_flags_t flags);
void __init kmem_cache_init(void);
extern void create_boot_cache(struct kmem_cache *, const char *name,
@@ -443,6 +445,13 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
return (s->flags & SLAB_KMALLOC);
}
+static inline bool is_kmalloc_normal(struct kmem_cache *s)
+{
+ if (!is_kmalloc_cache(s))
+ return false;
+ return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
+}
+
/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_PANIC | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 40b582a014b8..61f32420230a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -40,11 +40,6 @@ LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
-static LIST_HEAD(slab_caches_to_rcu_destroy);
-static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
-static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
- slab_caches_to_rcu_destroy_workfn);
-
/*
* Set of flags that will prevent slab merging
*/
@@ -88,6 +83,19 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
EXPORT_SYMBOL(kmem_cache_size);
#ifdef CONFIG_DEBUG_VM
+
+static bool kmem_cache_is_duplicate_name(const char *name)
+{
+ struct kmem_cache *s;
+
+ list_for_each_entry(s, &slab_caches, list) {
+ if (!strcmp(s->name, name))
+ return true;
+ }
+
+ return false;
+}
+
static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
@@ -95,6 +103,10 @@ static int kmem_cache_sanity_check(const char *name, unsigned int size)
return -EINVAL;
}
+ /* Duplicate names will confuse slabtop, et al */
+ WARN(kmem_cache_is_duplicate_name(name),
+ "kmem_cache of name '%s' already exists\n", name);
+
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
return 0;
}
@@ -169,14 +181,15 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
if (ctor)
return NULL;
- size = ALIGN(size, sizeof(void *));
- align = calculate_alignment(flags, align, size);
- size = ALIGN(size, align);
flags = kmem_cache_flags(flags, name);
if (flags & SLAB_NEVER_MERGE)
return NULL;
+ size = ALIGN(size, sizeof(void *));
+ align = calculate_alignment(flags, align, size);
+ size = ALIGN(size, align);
+
list_for_each_entry_reverse(s, &slab_caches, list) {
if (slab_unmergeable(s))
continue;
@@ -202,32 +215,29 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
}
static struct kmem_cache *create_cache(const char *name,
- unsigned int object_size, unsigned int align,
- slab_flags_t flags, unsigned int useroffset,
- unsigned int usersize, void (*ctor)(void *),
- struct kmem_cache *root_cache)
+ unsigned int object_size,
+ struct kmem_cache_args *args,
+ slab_flags_t flags)
{
struct kmem_cache *s;
int err;
- if (WARN_ON(useroffset + usersize > object_size))
- useroffset = usersize = 0;
+ if (WARN_ON(args->useroffset + args->usersize > object_size))
+ args->useroffset = args->usersize = 0;
+
+ /* If a custom freelist pointer is requested make sure it's sane. */
+ err = -EINVAL;
+ if (args->use_freeptr_offset &&
+ (args->freeptr_offset >= object_size ||
+ !(flags & SLAB_TYPESAFE_BY_RCU) ||
+ !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t))))
+ goto out;
err = -ENOMEM;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
if (!s)
goto out;
-
- s->name = name;
- s->size = s->object_size = object_size;
- s->align = align;
- s->ctor = ctor;
-#ifdef CONFIG_HARDENED_USERCOPY
- s->useroffset = useroffset;
- s->usersize = usersize;
-#endif
-
- err = __kmem_cache_create(s, flags);
+ err = do_kmem_cache_create(s, name, object_size, args, flags);
if (err)
goto out_free_cache;
@@ -242,39 +252,24 @@ out:
}
/**
- * kmem_cache_create_usercopy - Create a cache with a region suitable
- * for copying to userspace
+ * __kmem_cache_create_args - Create a kmem cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
- * @size: The size of objects to be created in this cache.
- * @align: The required alignment for the objects.
- * @flags: SLAB flags
- * @useroffset: Usercopy region offset
- * @usersize: Usercopy region size
- * @ctor: A constructor for the objects.
- *
- * Cannot be called within a interrupt, but can be interrupted.
- * The @ctor is run when new pages are allocated by the cache.
+ * @object_size: The size of objects to be created in this cache.
+ * @args: Additional arguments for the cache creation (see
+ * &struct kmem_cache_args).
+ * @flags: See %SLAB_* flags for an explanation of individual @flags.
*
- * The flags are
+ * Not to be called directly, use the kmem_cache_create() wrapper with the same
+ * parameters.
*
- * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
- * to catch references to uninitialised memory.
- *
- * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
- * for buffer overruns.
- *
- * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
- * cacheline. This can be beneficial if you're counting cycles as closely
- * as davem.
+ * Context: Cannot be called within an interrupt, but can be interrupted.
*
* Return: a pointer to the cache on success, NULL on failure.
*/
-struct kmem_cache *
-kmem_cache_create_usercopy(const char *name,
- unsigned int size, unsigned int align,
- slab_flags_t flags,
- unsigned int useroffset, unsigned int usersize,
- void (*ctor)(void *))
+struct kmem_cache *__kmem_cache_create_args(const char *name,
+ unsigned int object_size,
+ struct kmem_cache_args *args,
+ slab_flags_t flags)
{
struct kmem_cache *s = NULL;
const char *cache_name;
@@ -296,7 +291,7 @@ kmem_cache_create_usercopy(const char *name,
mutex_lock(&slab_mutex);
- err = kmem_cache_sanity_check(name, size);
+ err = kmem_cache_sanity_check(name, object_size);
if (err) {
goto out_unlock;
}
@@ -317,12 +312,14 @@ kmem_cache_create_usercopy(const char *name,
/* Fail closed on bad usersize of useroffset values. */
if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
- WARN_ON(!usersize && useroffset) ||
- WARN_ON(size < usersize || size - usersize < useroffset))
- usersize = useroffset = 0;
-
- if (!usersize)
- s = __kmem_cache_alias(name, size, align, flags, ctor);
+ WARN_ON(!args->usersize && args->useroffset) ||
+ WARN_ON(object_size < args->usersize ||
+ object_size - args->usersize < args->useroffset))
+ args->usersize = args->useroffset = 0;
+
+ if (!args->usersize)
+ s = __kmem_cache_alias(name, object_size, args->align, flags,
+ args->ctor);
if (s)
goto out_unlock;
@@ -332,9 +329,8 @@ kmem_cache_create_usercopy(const char *name,
goto out_unlock;
}
- s = create_cache(cache_name, size,
- calculate_alignment(flags, align, size),
- flags, useroffset, usersize, ctor, NULL);
+ args->align = calculate_alignment(flags, args->align, object_size);
+ s = create_cache(cache_name, object_size, args, flags);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
@@ -356,41 +352,7 @@ out_unlock:
}
return s;
}
-EXPORT_SYMBOL(kmem_cache_create_usercopy);
-
-/**
- * kmem_cache_create - Create a cache.
- * @name: A string which is used in /proc/slabinfo to identify this cache.
- * @size: The size of objects to be created in this cache.
- * @align: The required alignment for the objects.
- * @flags: SLAB flags
- * @ctor: A constructor for the objects.
- *
- * Cannot be called within a interrupt, but can be interrupted.
- * The @ctor is run when new pages are allocated by the cache.
- *
- * The flags are
- *
- * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
- * to catch references to uninitialised memory.
- *
- * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
- * for buffer overruns.
- *
- * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
- * cacheline. This can be beneficial if you're counting cycles as closely
- * as davem.
- *
- * Return: a pointer to the cache on success, NULL on failure.
- */
-struct kmem_cache *
-kmem_cache_create(const char *name, unsigned int size, unsigned int align,
- slab_flags_t flags, void (*ctor)(void *))
-{
- return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
- ctor);
-}
-EXPORT_SYMBOL(kmem_cache_create);
+EXPORT_SYMBOL(__kmem_cache_create_args);
static struct kmem_cache *kmem_buckets_cache __ro_after_init;
@@ -478,87 +440,25 @@ kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
fail:
for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++)
kmem_cache_destroy((*b)[idx]);
- kfree(b);
+ kmem_cache_free(kmem_buckets_cache, b);
return NULL;
}
EXPORT_SYMBOL(kmem_buckets_create);
-#ifdef SLAB_SUPPORTS_SYSFS
/*
* For a given kmem_cache, kmem_cache_destroy() should only be called
* once or there will be a use-after-free problem. The actual deletion
* and release of the kobject does not need slab_mutex or cpu_hotplug_lock
* protection. So they are now done without holding those locks.
- *
- * Note that there will be a slight delay in the deletion of sysfs files
- * if kmem_cache_release() is called indrectly from a work function.
*/
static void kmem_cache_release(struct kmem_cache *s)
{
- if (slab_state >= FULL) {
- sysfs_slab_unlink(s);
+ kfence_shutdown_cache(s);
+ if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL)
sysfs_slab_release(s);
- } else {
+ else
slab_kmem_cache_release(s);
- }
-}
-#else
-static void kmem_cache_release(struct kmem_cache *s)
-{
- slab_kmem_cache_release(s);
-}
-#endif
-
-static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
-{
- LIST_HEAD(to_destroy);
- struct kmem_cache *s, *s2;
-
- /*
- * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
- * @slab_caches_to_rcu_destroy list. The slab pages are freed
- * through RCU and the associated kmem_cache are dereferenced
- * while freeing the pages, so the kmem_caches should be freed only
- * after the pending RCU operations are finished. As rcu_barrier()
- * is a pretty slow operation, we batch all pending destructions
- * asynchronously.
- */
- mutex_lock(&slab_mutex);
- list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy);
- mutex_unlock(&slab_mutex);
-
- if (list_empty(&to_destroy))
- return;
-
- rcu_barrier();
-
- list_for_each_entry_safe(s, s2, &to_destroy, list) {
- debugfs_slab_release(s);
- kfence_shutdown_cache(s);
- kmem_cache_release(s);
- }
-}
-
-static int shutdown_cache(struct kmem_cache *s)
-{
- /* free asan quarantined objects */
- kasan_cache_shutdown(s);
-
- if (__kmem_cache_shutdown(s) != 0)
- return -EBUSY;
-
- list_del(&s->list);
-
- if (s->flags & SLAB_TYPESAFE_BY_RCU) {
- list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
- schedule_work(&slab_caches_to_rcu_destroy_work);
- } else {
- kfence_shutdown_cache(s);
- debugfs_slab_release(s);
- }
-
- return 0;
}
void slab_kmem_cache_release(struct kmem_cache *s)
@@ -570,29 +470,63 @@ void slab_kmem_cache_release(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
- int err = -EBUSY;
- bool rcu_set;
+ int err;
if (unlikely(!s) || !kasan_check_byte(s))
return;
+ /* in-flight kfree_rcu()'s may include objects from our cache */
+ kvfree_rcu_barrier();
+
+ if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
+ (s->flags & SLAB_TYPESAFE_BY_RCU)) {
+ /*
+ * Under CONFIG_SLUB_RCU_DEBUG, when objects in a
+ * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally
+ * defer their freeing with call_rcu().
+ * Wait for such call_rcu() invocations here before actually
+ * destroying the cache.
+ *
+ * It doesn't matter that we haven't looked at the slab refcount
+ * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so
+ * the refcount should be 1 here.
+ */
+ rcu_barrier();
+ }
+
cpus_read_lock();
mutex_lock(&slab_mutex);
- rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
-
s->refcount--;
- if (s->refcount)
- goto out_unlock;
+ if (s->refcount) {
+ mutex_unlock(&slab_mutex);
+ cpus_read_unlock();
+ return;
+ }
+
+ /* free asan quarantined objects */
+ kasan_cache_shutdown(s);
- err = shutdown_cache(s);
+ err = __kmem_cache_shutdown(s);
WARN(err, "%s %s: Slab cache still has objects when called from %pS",
__func__, s->name, (void *)_RET_IP_);
-out_unlock:
+
+ list_del(&s->list);
+
mutex_unlock(&slab_mutex);
cpus_read_unlock();
- if (!err && !rcu_set)
- kmem_cache_release(s);
+
+ if (slab_state >= FULL)
+ sysfs_slab_unlink(s);
+ debugfs_slab_release(s);
+
+ if (err)
+ return;
+
+ if (s->flags & SLAB_TYPESAFE_BY_RCU)
+ rcu_barrier();
+
+ kmem_cache_release(s);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -704,9 +638,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
{
int err;
unsigned int align = ARCH_KMALLOC_MINALIGN;
-
- s->name = name;
- s->size = s->object_size = size;
+ struct kmem_cache_args kmem_args = {};
/*
* kmalloc caches guarantee alignment of at least the largest
@@ -715,14 +647,14 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
*/
if (flags & SLAB_KMALLOC)
align = max(align, 1U << (ffs(size) - 1));
- s->align = calculate_alignment(flags, align, size);
+ kmem_args.align = calculate_alignment(flags, align, size);
#ifdef CONFIG_HARDENED_USERCOPY
- s->useroffset = useroffset;
- s->usersize = usersize;
+ kmem_args.useroffset = useroffset;
+ kmem_args.usersize = usersize;
#endif
- err = __kmem_cache_create(s, flags);
+ err = do_kmem_cache_create(s, name, size, &kmem_args, flags);
if (err)
panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
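
Seen from the caller side, the argument-struct interface introduced above looks roughly as follows. This is a hedged sketch using only the struct kmem_cache_args fields visible in this diff (align, ctor, useroffset/usersize, use_freeptr_offset, freeptr_offset); per the new kernel-doc, callers go through the kmem_cache_create() wrapper rather than __kmem_cache_create_args() directly, and the object type here is invented.

struct my_obj {
	int data;
	freeptr_t free_slot;	/* freelist pointer kept out of the data */
};

static struct kmem_cache *my_obj_cache_create(void)
{
	struct kmem_cache_args args = {
		.align			= __alignof__(struct my_obj),
		.use_freeptr_offset	= true,
		/* must be < object size, freeptr-aligned, RCU-typesafe cache */
		.freeptr_offset		= offsetof(struct my_obj, free_slot),
	};

	return kmem_cache_create("my_obj", sizeof(struct my_obj), &args,
				 SLAB_TYPESAFE_BY_RCU);
}

The create_cache() checks added above reject a freeptr_offset that is out of bounds, misaligned, or used without SLAB_TYPESAFE_BY_RCU.
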
diff --git a/mm/slub.c b/mm/slub.c
index c9d8a2497fd6..21f71cb6cc06 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -466,12 +466,6 @@ static struct workqueue_struct *flushwq;
*******************************************************************/
/*
- * freeptr_t represents a SLUB freelist pointer, which might be encoded
- * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
- */
-typedef struct { unsigned long v; } freeptr_t;
-
-/*
* Returns freelist pointer (ptr). With hardening, this is obfuscated
* with an XOR of the address where the pointer is held and a per-cache
* random number.
@@ -756,6 +750,50 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
return false;
}
+/*
+ * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API
+ * family will round up the real request size to these fixed ones, so
+ * there could be an extra area than what is requested. Save the original
+ * request size in the meta data area, for better debug and sanity check.
+ */
+static inline void set_orig_size(struct kmem_cache *s,
+ void *object, unsigned int orig_size)
+{
+ void *p = kasan_reset_tag(object);
+ unsigned int kasan_meta_size;
+
+ if (!slub_debug_orig_size(s))
+ return;
+
+ /*
+ * KASAN can save its free meta data inside of the object at offset 0.
+ * If this meta data size is larger than 'orig_size', it will overlap
+ * the data redzone in [orig_size+1, object_size]. Thus, we adjust
+ * 'orig_size' to be as at least as big as KASAN's meta data.
+ */
+ kasan_meta_size = kasan_metadata_size(s, true);
+ if (kasan_meta_size > orig_size)
+ orig_size = kasan_meta_size;
+
+ p += get_info_end(s);
+ p += sizeof(struct track) * 2;
+
+ *(unsigned int *)p = orig_size;
+}
+
+static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
+{
+ void *p = kasan_reset_tag(object);
+
+ if (!slub_debug_orig_size(s))
+ return s->object_size;
+
+ p += get_info_end(s);
+ p += sizeof(struct track) * 2;
+
+ return *(unsigned int *)p;
+}
+
#ifdef CONFIG_SLUB_DEBUG
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);
@@ -985,50 +1023,6 @@ static void print_slab_info(const struct slab *slab)
&slab->__page_flags);
}
-/*
- * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API
- * family will round up the real request size to these fixed ones, so
- * there could be an extra area than what is requested. Save the original
- * request size in the meta data area, for better debug and sanity check.
- */
-static inline void set_orig_size(struct kmem_cache *s,
- void *object, unsigned int orig_size)
-{
- void *p = kasan_reset_tag(object);
- unsigned int kasan_meta_size;
-
- if (!slub_debug_orig_size(s))
- return;
-
- /*
- * KASAN can save its free meta data inside of the object at offset 0.
- * If this meta data size is larger than 'orig_size', it will overlap
- * the data redzone in [orig_size+1, object_size]. Thus, we adjust
- * 'orig_size' to be as at least as big as KASAN's meta data.
- */
- kasan_meta_size = kasan_metadata_size(s, true);
- if (kasan_meta_size > orig_size)
- orig_size = kasan_meta_size;
-
- p += get_info_end(s);
- p += sizeof(struct track) * 2;
-
- *(unsigned int *)p = orig_size;
-}
-
-static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
-{
- void *p = kasan_reset_tag(object);
-
- if (!slub_debug_orig_size(s))
- return s->object_size;
-
- p += get_info_end(s);
- p += sizeof(struct track) * 2;
-
- return *(unsigned int *)p;
-}
-
void skip_orig_size_check(struct kmem_cache *s, const void *object)
{
set_orig_size(s, (void *)object, s->object_size);
@@ -1894,7 +1888,6 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
-
#ifndef CONFIG_SLUB_TINY
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
void **freelist, void *nextfree)
@@ -2116,6 +2109,10 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
if (!mem_alloc_profiling_enabled())
return;
+ /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
+ if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
+ return;
+
obj_exts = slab_obj_exts(slab);
if (!obj_exts)
return;
@@ -2185,6 +2182,45 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
__memcg_slab_free_hook(s, slab, p, objects, obj_exts);
}
+
+static __fastpath_inline
+bool memcg_slab_post_charge(void *p, gfp_t flags)
+{
+ struct slabobj_ext *slab_exts;
+ struct kmem_cache *s;
+ struct folio *folio;
+ struct slab *slab;
+ unsigned long off;
+
+ folio = virt_to_folio(p);
+ if (!folio_test_slab(folio)) {
+ return folio_memcg_kmem(folio) ||
+ (__memcg_kmem_charge_page(folio_page(folio, 0), flags,
+ folio_order(folio)) == 0);
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+
+ /*
+ * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency
+ * of slab_obj_exts being allocated from the same slab and thus the slab
+ * becoming effectively unfreeable.
+ */
+ if (is_kmalloc_normal(s))
+ return true;
+
+ /* Ignore already charged objects. */
+ slab_exts = slab_obj_exts(slab);
+ if (slab_exts) {
+ off = obj_to_index(s, slab, p);
+ if (unlikely(slab_exts[off].objcg))
+ return true;
+ }
+
+ return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p);
+}
+
#else /* CONFIG_MEMCG */
static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s,
struct list_lru *lru,
@@ -2198,18 +2234,37 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects)
{
}
+
+static inline bool memcg_slab_post_charge(void *p, gfp_t flags)
+{
+ return true;
+}
#endif /* CONFIG_MEMCG */
+#ifdef CONFIG_SLUB_RCU_DEBUG
+static void slab_free_after_rcu_debug(struct rcu_head *rcu_head);
+
+struct rcu_delayed_free {
+ struct rcu_head head;
+ void *object;
+};
+#endif
+
/*
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*
* Returns true if freeing of the object can proceed, false if its reuse
- * was delayed by KASAN quarantine, or it was returned to KFENCE.
+ * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned
+ * to KFENCE.
*/
static __always_inline
-bool slab_free_hook(struct kmem_cache *s, void *x, bool init)
+bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
+ bool after_rcu_delay)
{
+ /* Are the object contents still accessible? */
+ bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay;
+
kmemleak_free_recursive(x, s->flags);
kmsan_slab_free(s, x);
@@ -2219,7 +2274,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init)
debug_check_no_obj_freed(x, s->object_size);
/* Use KCSAN to help debug racy use-after-free. */
- if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
+ if (!still_accessible)
__kcsan_check_access(x, s->object_size,
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
@@ -2227,6 +2282,35 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init)
return false;
/*
+ * Give KASAN a chance to notice an invalid free operation before we
+ * modify the object.
+ */
+ if (kasan_slab_pre_free(s, x))
+ return false;
+
+#ifdef CONFIG_SLUB_RCU_DEBUG
+ if (still_accessible) {
+ struct rcu_delayed_free *delayed_free;
+
+ delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT);
+ if (delayed_free) {
+ /*
+ * Let KASAN track our call stack as a "related work
+ * creation", just like if the object had been freed
+ * normally via kfree_rcu().
+ * We have to do this manually because the rcu_head is
+ * not located inside the object.
+ */
+ kasan_record_aux_stack_noalloc(x);
+
+ delayed_free->object = x;
+ call_rcu(&delayed_free->head, slab_free_after_rcu_debug);
+ return false;
+ }
+ }
+#endif /* CONFIG_SLUB_RCU_DEBUG */
+
+ /*
* As memory initialization might be integrated into KASAN,
* kasan_slab_free and initialization memset's must be
* kept together to avoid discrepancies in behavior.
@@ -2239,17 +2323,24 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init)
*/
if (unlikely(init)) {
int rsize;
- unsigned int inuse;
+ unsigned int inuse, orig_size;
inuse = get_info_end(s);
+ orig_size = get_orig_size(s, x);
if (!kasan_has_integrated_init())
- memset(kasan_reset_tag(x), 0, s->object_size);
+ memset(kasan_reset_tag(x), 0, orig_size);
rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
memset((char *)kasan_reset_tag(x) + inuse, 0,
s->size - inuse - rsize);
+ /*
+ * Restore orig_size, otherwise a kmalloc redzone overwrite
+ * would be reported.
+ */
+ set_orig_size(s, x, orig_size);
+
}
/* KASAN might put x into memory quarantine, delaying its reuse. */
- return !kasan_slab_free(s, x, init);
+ return !kasan_slab_free(s, x, init, still_accessible);
}
static __fastpath_inline
@@ -2263,7 +2354,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
bool init;
if (is_kfence_address(next)) {
- slab_free_hook(s, next, false);
+ slab_free_hook(s, next, false, false);
return false;
}
@@ -2278,7 +2369,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
next = get_freepointer(s, object);
/* If object's reuse doesn't have to be delayed */
- if (likely(slab_free_hook(s, object, init))) {
+ if (likely(slab_free_hook(s, object, init, false))) {
/* Move object to the new freelist */
set_freepointer(s, object, *head);
*head = object;
@@ -2318,7 +2409,11 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node,
struct slab *slab;
unsigned int order = oo_order(oo);
- folio = (struct folio *)alloc_pages_node(node, flags, order);
+ if (node == NUMA_NO_NODE)
+ folio = (struct folio *)alloc_pages(flags, order);
+ else
+ folio = (struct folio *)__alloc_pages_node(node, flags, order);
+
if (!folio)
return NULL;
@@ -3416,14 +3511,15 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
+ int cpu = raw_smp_processor_id();
int node;
struct kmem_cache_node *n;
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
return;
- pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
- nid, gfpflags, &gfpflags);
+ pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n",
+ cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags);
pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
s->name, s->object_size, s->size, oo_order(s->oo),
oo_order(s->min));
@@ -3921,6 +4017,8 @@ static void *__slab_alloc_node(struct kmem_cache *s,
/*
* If the object has been wiped upon free, make sure it's fully initialized by
* zeroing out freelist pointer.
+ *
+ * Note that we also wipe custom freelist pointers.
*/
static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
void *obj)
@@ -4062,6 +4160,15 @@ void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
}
EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof);
+bool kmem_cache_charge(void *objp, gfp_t gfpflags)
+{
+ if (!memcg_kmem_online())
+ return true;
+
+ return memcg_slab_post_charge(objp, gfpflags);
+}
+EXPORT_SYMBOL(kmem_cache_charge);
+
/**
* kmem_cache_alloc_node - Allocate an object on the specified node
* @s: The cache to allocate from.
@@ -4470,7 +4577,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
memcg_slab_free_hook(s, slab, &object, 1);
alloc_tagging_slab_free_hook(s, slab, &object, 1);
- if (likely(slab_free_hook(s, object, slab_want_init_on_free(s))))
+ if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
do_slab_free(s, slab, object, object, 1, addr);
}
@@ -4479,7 +4586,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
static noinline
void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
{
- if (likely(slab_free_hook(s, object, slab_want_init_on_free(s))))
+ if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_);
}
#endif
@@ -4498,6 +4605,33 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
do_slab_free(s, slab, head, tail, cnt, addr);
}
+#ifdef CONFIG_SLUB_RCU_DEBUG
+static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
+{
+ struct rcu_delayed_free *delayed_free =
+ container_of(rcu_head, struct rcu_delayed_free, head);
+ void *object = delayed_free->object;
+ struct slab *slab = virt_to_slab(object);
+ struct kmem_cache *s;
+
+ kfree(delayed_free);
+
+ if (WARN_ON(is_kfence_address(object)))
+ return;
+
+ /* find the object and the cache again */
+ if (WARN_ON(!slab))
+ return;
+ s = slab->slab_cache;
+ if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU)))
+ return;
+
+ /* resume freeing */
+ if (slab_free_hook(s, object, slab_want_init_on_free(s), true))
+ do_slab_free(s, slab, object, object, 1, _THIS_IP_);
+}
+#endif /* CONFIG_SLUB_RCU_DEBUG */
+
#ifdef CONFIG_KASAN_GENERIC
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
@@ -5148,7 +5282,7 @@ static void set_cpu_partial(struct kmem_cache *s)
* calculate_sizes() determines the order and the distribution of data within
* a slab object.
*/
-static int calculate_sizes(struct kmem_cache *s)
+static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
{
slab_flags_t flags = s->flags;
unsigned int size = s->object_size;
@@ -5189,7 +5323,8 @@ static int calculate_sizes(struct kmem_cache *s)
*/
s->inuse = size;
- if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor ||
+ if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) ||
+ (flags & SLAB_POISON) || s->ctor ||
((flags & SLAB_RED_ZONE) &&
(s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
/*
@@ -5210,6 +5345,8 @@ static int calculate_sizes(struct kmem_cache *s)
*/
s->offset = size;
size += sizeof(void *);
+ } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) {
+ s->offset = args->freeptr_offset;
} else {
/*
* Store freelist pointer near middle of object to keep
@@ -5284,65 +5421,6 @@ static int calculate_sizes(struct kmem_cache *s)
return !!oo_objects(s->oo);
}
-static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
-{
- s->flags = kmem_cache_flags(flags, s->name);
-#ifdef CONFIG_SLAB_FREELIST_HARDENED
- s->random = get_random_long();
-#endif
-
- if (!calculate_sizes(s))
- goto error;
- if (disable_higher_order_debug) {
- /*
- * Disable debugging flags that store metadata if the min slab
- * order increased.
- */
- if (get_order(s->size) > get_order(s->object_size)) {
- s->flags &= ~DEBUG_METADATA_FLAGS;
- s->offset = 0;
- if (!calculate_sizes(s))
- goto error;
- }
- }
-
-#ifdef system_has_freelist_aba
- if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
- /* Enable fast mode */
- s->flags |= __CMPXCHG_DOUBLE;
- }
-#endif
-
- /*
- * The larger the object size is, the more slabs we want on the partial
- * list to avoid pounding the page allocator excessively.
- */
- s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
- s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
-
- set_cpu_partial(s);
-
-#ifdef CONFIG_NUMA
- s->remote_node_defrag_ratio = 1000;
-#endif
-
- /* Initialize the pre-computed randomized freelist if slab is up */
- if (slab_state >= UP) {
- if (init_cache_random_seq(s))
- goto error;
- }
-
- if (!init_kmem_cache_nodes(s))
- goto error;
-
- if (alloc_kmem_cache_cpus(s))
- return 0;
-
-error:
- __kmem_cache_release(s);
- return -EINVAL;
-}
-
static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
const char *text)
{
@@ -5896,28 +5974,90 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
return s;
}
-int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
+int do_kmem_cache_create(struct kmem_cache *s, const char *name,
+ unsigned int size, struct kmem_cache_args *args,
+ slab_flags_t flags)
{
- int err;
+ int err = -EINVAL;
- err = kmem_cache_open(s, flags);
- if (err)
- return err;
+ s->name = name;
+ s->size = s->object_size = size;
+
+ s->flags = kmem_cache_flags(flags, s->name);
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ s->random = get_random_long();
+#endif
+ s->align = args->align;
+ s->ctor = args->ctor;
+#ifdef CONFIG_HARDENED_USERCOPY
+ s->useroffset = args->useroffset;
+ s->usersize = args->usersize;
+#endif
+
+ if (!calculate_sizes(args, s))
+ goto out;
+ if (disable_higher_order_debug) {
+ /*
+ * Disable debugging flags that store metadata if the min slab
+ * order increased.
+ */
+ if (get_order(s->size) > get_order(s->object_size)) {
+ s->flags &= ~DEBUG_METADATA_FLAGS;
+ s->offset = 0;
+ if (!calculate_sizes(args, s))
+ goto out;
+ }
+ }
+
+#ifdef system_has_freelist_aba
+ if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
+ /* Enable fast mode */
+ s->flags |= __CMPXCHG_DOUBLE;
+ }
+#endif
+
+ /*
+ * The larger the object size is, the more slabs we want on the partial
+ * list to avoid pounding the page allocator excessively.
+ */
+ s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
+ s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
+
+ set_cpu_partial(s);
+
+#ifdef CONFIG_NUMA
+ s->remote_node_defrag_ratio = 1000;
+#endif
+
+ /* Initialize the pre-computed randomized freelist if slab is up */
+ if (slab_state >= UP) {
+ if (init_cache_random_seq(s))
+ goto out;
+ }
+
+ if (!init_kmem_cache_nodes(s))
+ goto out;
+
+ if (!alloc_kmem_cache_cpus(s))
+ goto out;
/* Mutex is not taken during early boot */
- if (slab_state <= UP)
- return 0;
+ if (slab_state <= UP) {
+ err = 0;
+ goto out;
+ }
err = sysfs_slab_add(s);
- if (err) {
- __kmem_cache_release(s);
- return err;
- }
+ if (err)
+ goto out;
if (s->flags & SLAB_STORE_USER)
debugfs_slab_add(s);
- return 0;
+out:
+ if (err)
+ __kmem_cache_release(s);
+ return err;
}
#ifdef SLAB_SUPPORTS_SYSFS
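
The newly exported kmem_cache_charge() (added in the slub.c hunk above) lets callers defer memcg accounting: allocate without __GFP_ACCOUNT on a hot path, then charge the object to the current cgroup once it is known to be long-lived. A hypothetical user, with the request type and cache invented for illustration:

static struct my_req *my_req_promote(struct kmem_cache *cache, gfp_t gfp)
{
	struct my_req *req = kmem_cache_alloc(cache, gfp);

	if (!req)
		return NULL;

	/* Charge to the current memcg now that the object will be kept. */
	if (!kmem_cache_charge(req, gfp)) {
		kmem_cache_free(cache, req);
		return NULL;
	}
	return req;
}
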
diff --git a/mm/sparse.c b/mm/sparse.c
index 0f018c6f9ec5..dc38539f8560 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section)
static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
- unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
+ unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT;
/*
* Sanity checks - do not allow an architecture to pass
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e54e5c8907fa..acc56c75ba99 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -787,27 +787,30 @@ retry:
}
dst_pmdval = pmdp_get_lockless(dst_pmd);
- /*
- * If the dst_pmd is mapped as THP don't
- * override it and just be strict.
- */
- if (unlikely(pmd_trans_huge(dst_pmdval))) {
- err = -EEXIST;
- break;
- }
if (unlikely(pmd_none(dst_pmdval)) &&
unlikely(__pte_alloc(dst_mm, dst_pmd))) {
err = -ENOMEM;
break;
}
- /* If an huge pmd materialized from under us fail */
- if (unlikely(pmd_trans_huge(*dst_pmd))) {
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ /*
+ * If the dst_pmd is THP don't override it and just be strict.
+ * (This includes the case where the PMD used to be THP and
+ * changed back to none after __pte_alloc().)
+ */
+ if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
+ pmd_devmap(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
+ if (unlikely(pmd_bad(dst_pmdval))) {
err = -EFAULT;
break;
}
-
- BUG_ON(pmd_none(*dst_pmd));
- BUG_ON(pmd_trans_huge(*dst_pmd));
+ /*
+ * For shmem mappings, khugepaged is allowed to remove page
+ * tables under us; pte_offset_map_lock() will deal with that.
+ */
err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
src_addr, flags, &folio);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af2de36549d6..a0df1e2e155a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2191,6 +2191,7 @@ static void purge_vmap_node(struct work_struct *work)
{
struct vmap_node *vn = container_of(work,
struct vmap_node, purge_work);
+ unsigned long nr_purged_pages = 0;
struct vmap_area *va, *n_va;
LIST_HEAD(local_list);
@@ -2208,7 +2209,7 @@ static void purge_vmap_node(struct work_struct *work)
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
- atomic_long_sub(nr, &vmap_lazy_nr);
+ nr_purged_pages += nr;
vn->nr_purged++;
if (is_vn_id_valid(vn_id) && !vn->skip_populate)
@@ -2219,6 +2220,8 @@ static void purge_vmap_node(struct work_struct *work)
list_add(&va->list, &local_list);
}
+ atomic_long_sub(nr_purged_pages, &vmap_lazy_nr);
+
reclaim_list_global(&local_list);
}
@@ -2626,6 +2629,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
vb->dirty_max = 0;
bitmap_set(vb->used_map, 0, (1UL << order));
INIT_LIST_HEAD(&vb->free_list);
+ vb->cpu = raw_smp_processor_id();
xa = addr_to_vb_xa(va->va_start);
vb_idx = addr_to_vb_idx(va->va_start);
@@ -2642,7 +2646,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
* integrity together with list_for_each_rcu from read
* side.
*/
- vb->cpu = raw_smp_processor_id();
vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cfa839284b92..bd489c1af228 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1604,25 +1604,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
}
-#ifdef CONFIG_CMA
-/*
- * It is waste of effort to scan and reclaim CMA pages if it is not available
- * for current allocation context. Kswapd can not be enrolled as it can not
- * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL
- */
-static bool skip_cma(struct folio *folio, struct scan_control *sc)
-{
- return !current_is_kswapd() &&
- gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
- folio_migratetype(folio) == MIGRATE_CMA;
-}
-#else
-static bool skip_cma(struct folio *folio, struct scan_control *sc)
-{
- return false;
-}
-#endif
-
/*
* Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
*
@@ -1669,8 +1650,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (folio_zonenum(folio) > sc->reclaim_idx ||
- skip_cma(folio, sc)) {
+ if (folio_zonenum(folio) > sc->reclaim_idx) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
goto move;
@@ -4320,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* ineligible */
- if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
+ if (zone > sc->reclaim_idx) {
gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;