-rw-r--r--  include/linux/slab.h         | 144
-rw-r--r--  include/trace/events/kmem.h  |  74
-rw-r--r--  mm/kfence/report.c           |   1
-rw-r--r--  mm/slab.c                    | 305
-rw-r--r--  mm/slab.h                    |  10
-rw-r--r--  mm/slab_common.c             | 241
-rw-r--r--  mm/slob.c                    |  31
-rw-r--r--  mm/slub.c                    | 722
8 files changed, 661 insertions(+), 867 deletions(-)
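For orientation before the hunks below: after this series both SLAB and SLUB cap their kmalloc caches at an order-1 page (KMALLOC_SHIFT_HIGH = PAGE_SHIFT + 1), and any request larger than KMALLOC_MAX_CACHE_SIZE is handed straight to the page allocator via kmalloc_large()/kmalloc_large_node(). The sketch below is a minimal userspace illustration of that size-to-bucket decision only, not kernel code: it assumes 4 KiB pages (PAGE_SHIFT = 12), omits the real kmalloc-96/kmalloc-192 buckets, and the kmalloc_index() helper merely mirrors the kernel's power-of-two table by name.

#include <stdio.h>
#include <stddef.h>

/*
 * Illustrative constants only; in the kernel these are compile-time
 * constants and __kmalloc_index() is constant-folded whenever the
 * requested size is __builtin_constant_p().
 */
#define PAGE_SHIFT		12
#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)		/* order-1 page: 8 KiB */
#define KMALLOC_MAX_CACHE_SIZE	(1UL << KMALLOC_SHIFT_HIGH)

/*
 * Smallest index is 3 (kmalloc-8), as in the real table; the real table
 * additionally has kmalloc-96 and kmalloc-192, which this sketch omits.
 */
static int kmalloc_index(size_t size)
{
	int i;

	if (size == 0)
		return 0;				/* ZERO_SIZE_PTR case */
	for (i = 3; i <= KMALLOC_SHIFT_HIGH; i++)
		if (size <= (1UL << i))
			return i;
	return -1;					/* handled by kmalloc_large*() */
}

int main(void)
{
	size_t sizes[] = { 8, 200, 4096, 8192, 8193, 1UL << 20 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		size_t sz = sizes[i];

		if (sz > KMALLOC_MAX_CACHE_SIZE)
			printf("%8zu -> kmalloc_large() (page allocator)\n", sz);
		else
			printf("%8zu -> kmalloc-%lu cache (index %d)\n",
			       sz, 1UL << kmalloc_index(sz), kmalloc_index(sz));
	}
	return 0;
}

Compiled with a plain "cc -Wall" this prints, for example, that 8192 bytes still fits the largest kmalloc cache while 8193 bytes falls through to the page allocator, which is the boundary the slab.h hunks below encode; previously SLAB carried dedicated caches up to kmalloc-32M for such sizes.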
diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..9b592e611cb1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -187,7 +187,6 @@ int kmem_cache_shrink(struct kmem_cache *s); void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); void kfree(const void *objp); void kfree_sensitive(const void *objp); -size_t __ksize(const void *objp); size_t ksize(const void *objp); #ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); @@ -243,27 +242,17 @@ static inline unsigned int arch_slab_minalign(void) #ifdef CONFIG_SLAB /* - * The largest kmalloc size supported by the SLAB allocators is - * 32 megabyte (2^25) or the maximum allocatable page order if that is - * less than 32 MB. - * - * WARNING: Its not easy to increase this value since the allocators have - * to do various tricks to work around compiler limitations in order to - * ensure proper constant folding. + * SLAB and SLUB directly allocates requests fitting in to an order-1 page + * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ -#define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \ - (MAX_ORDER + PAGE_SHIFT - 1) : 25) -#define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH +#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif #endif #ifdef CONFIG_SLUB -/* - * SLUB directly allocates requests fitting in to an order-1 page - * (PAGE_SIZE*2). Larger requests are passed to the page allocator. - */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW @@ -415,10 +404,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size, if (size <= 512 * 1024) return 19; if (size <= 1024 * 1024) return 20; if (size <= 2 * 1024 * 1024) return 21; - if (size <= 4 * 1024 * 1024) return 22; - if (size <= 8 * 1024 * 1024) return 23; - if (size <= 16 * 1024 * 1024) return 24; - if (size <= 32 * 1024 * 1024) return 25; if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant) BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()"); @@ -428,6 +413,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size, /* Will never be reached. 
Needed because the compiler may complain */ return -1; } +static_assert(PAGE_SHIFT <= 20); #define kmalloc_index(s) __kmalloc_index(s, true) #endif /* !CONFIG_SLOB */ @@ -456,42 +442,22 @@ static __always_inline void kfree_bulk(size_t size, void **p) kmem_cache_free_bulk(NULL, size, p); } -#ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __alloc_size(1); void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; -#else -static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __kmalloc(size, flags); -} - -static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) -{ - return kmem_cache_alloc(s, flags); -} -#endif #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) - __assume_slab_alignment __alloc_size(3); - -#ifdef CONFIG_NUMA -extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment - __alloc_size(4); -#else -static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, int node, size_t size) -{ - return kmem_cache_alloc_trace(s, gfpflags, size); -} -#endif /* CONFIG_NUMA */ +void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) + __assume_kmalloc_alignment __alloc_size(3); +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) __assume_kmalloc_alignment + __alloc_size(4); #else /* CONFIG_TRACING */ -static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, - gfp_t flags, size_t size) +/* Save a function call when CONFIG_TRACING=n */ +static __always_inline __alloc_size(3) +void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) { void *ret = kmem_cache_alloc(s, flags); @@ -499,8 +465,9 @@ static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_ return ret; } -static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) +static __always_inline __alloc_size(4) +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { void *ret = kmem_cache_alloc_node(s, gfpflags, node); @@ -509,25 +476,11 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment - __alloc_size(1); - -#ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) - __assume_page_alignment __alloc_size(1); -#else -static __always_inline __alloc_size(1) void *kmalloc_order_trace(size_t size, gfp_t flags, - unsigned int order) -{ - return kmalloc_order(size, flags, order); -} -#endif +void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment + __alloc_size(1); -static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t flags) -{ - unsigned int order = get_order(size); - return kmalloc_order_trace(size, flags, order); -} +void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_alignment + __alloc_size(1); /** * kmalloc - allocate memory @@ -597,7 +550,7 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_trace( + return kmalloc_trace( 
kmalloc_caches[kmalloc_type(flags)][index], flags, size); #endif @@ -605,23 +558,35 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } +#ifndef CONFIG_SLOB static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) { -#ifndef CONFIG_SLOB - if (__builtin_constant_p(size) && - size <= KMALLOC_MAX_CACHE_SIZE) { - unsigned int i = kmalloc_index(size); + if (__builtin_constant_p(size)) { + unsigned int index; + + if (size > KMALLOC_MAX_CACHE_SIZE) + return kmalloc_large_node(size, flags, node); + + index = kmalloc_index(size); - if (!i) + if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node_trace( - kmalloc_caches[kmalloc_type(flags)][i], - flags, node, size); + return kmalloc_node_trace( + kmalloc_caches[kmalloc_type(flags)][index], + flags, node, size); } -#endif return __kmalloc_node(size, flags, node); } +#else +static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) +{ + if (__builtin_constant_p(size) && size > KMALLOC_MAX_CACHE_SIZE) + return kmalloc_large_node(size, flags, node); + + return __kmalloc_node(size, flags, node); +} +#endif /** * kmalloc_array - allocate memory for an array. @@ -671,6 +636,12 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag return kmalloc_array(n, size, flags | __GFP_ZERO); } +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, + unsigned long caller) __alloc_size(1); +#define kmalloc_node_track_caller(size, flags, node) \ + __kmalloc_node_track_caller(size, flags, node, \ + _RET_IP_) + /* * kmalloc_track_caller is a special version of kmalloc that records the * calling function of the routine calling it for slab leak tracking instead @@ -679,9 +650,9 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag * allocator where we care about the real place the memory allocation * request comes from. 
*/ -extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); #define kmalloc_track_caller(size, flags) \ - __kmalloc_track_caller(size, flags, _RET_IP_) + __kmalloc_node_track_caller(size, flags, \ + NUMA_NO_NODE, _RET_IP_) static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, int node) @@ -700,21 +671,6 @@ static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); } - -#ifdef CONFIG_NUMA -extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, - unsigned long caller) __alloc_size(1); -#define kmalloc_node_track_caller(size, flags, node) \ - __kmalloc_node_track_caller(size, flags, node, \ - _RET_IP_) - -#else /* CONFIG_NUMA */ - -#define kmalloc_node_track_caller(size, flags, node) \ - kmalloc_track_caller(size, flags) - -#endif /* CONFIG_NUMA */ - /* * Shortcuts */ diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 4cb51ace600d..243073cfc29d 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -9,16 +9,15 @@ #include <linux/tracepoint.h> #include <trace/events/mmflags.h> -DECLARE_EVENT_CLASS(kmem_alloc, +TRACE_EVENT(kmem_cache_alloc, TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), + gfp_t gfp_flags, + int node), - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags), + TP_ARGS(call_site, ptr, s, gfp_flags, node), TP_STRUCT__entry( __field( unsigned long, call_site ) @@ -26,56 +25,42 @@ DECLARE_EVENT_CLASS(kmem_alloc, __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) + __field( int, node ) __field( bool, accounted ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; + __entry->bytes_req = s->object_size; + __entry->bytes_alloc = s->size; __entry->gfp_flags = (__force unsigned long)gfp_flags; + __entry->node = node; __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ? ((gfp_flags & __GFP_ACCOUNT) || - (s && s->flags & SLAB_ACCOUNT)) : false; + (s->flags & SLAB_ACCOUNT)) : false; ), - TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s accounted=%s", + TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", (void *)__entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), + __entry->node, __entry->accounted ? 
"true" : "false") ); -DEFINE_EVENT(kmem_alloc, kmalloc, - - TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, - size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags) -); - -DEFINE_EVENT(kmem_alloc, kmem_cache_alloc, - - TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, - size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags) -); - -DECLARE_EVENT_CLASS(kmem_alloc_node, +TRACE_EVENT(kmalloc, TP_PROTO(unsigned long call_site, const void *ptr, - struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags, int node), - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), TP_STRUCT__entry( __field( unsigned long, call_site ) @@ -84,7 +69,6 @@ DECLARE_EVENT_CLASS(kmem_alloc_node, __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) __field( int, node ) - __field( bool, accounted ) ), TP_fast_assign( @@ -94,9 +78,6 @@ DECLARE_EVENT_CLASS(kmem_alloc_node, __entry->bytes_alloc = bytes_alloc; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; - __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ? - ((gfp_flags & __GFP_ACCOUNT) || - (s && s->flags & SLAB_ACCOUNT)) : false; ), TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", @@ -106,25 +87,8 @@ DECLARE_EVENT_CLASS(kmem_alloc_node, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), __entry->node, - __entry->accounted ? "true" : "false") -); - -DEFINE_EVENT(kmem_alloc_node, kmalloc_node, - - TP_PROTO(unsigned long call_site, const void *ptr, - struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc, - gfp_t gfp_flags, int node), - - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node) -); - -DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node, - - TP_PROTO(unsigned long call_site, const void *ptr, - struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc, - gfp_t gfp_flags, int node), - - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node) + (IS_ENABLED(CONFIG_MEMCG_KMEM) && + (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false") ); TRACE_EVENT(kfree, @@ -149,20 +113,20 @@ TRACE_EVENT(kfree, TRACE_EVENT(kmem_cache_free, - TP_PROTO(unsigned long call_site, const void *ptr, const char *name), + TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s), - TP_ARGS(call_site, ptr, name), + TP_ARGS(call_site, ptr, s), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) - __string( name, name ) + __string( name, s->name ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; - __assign_str(name, name); + __assign_str(name, s->name); ), TP_printk("call_site=%pS ptr=%p name=%s", diff --git a/mm/kfence/report.c b/mm/kfence/report.c index f5a6d8ba3e21..7e496856c2eb 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -86,6 +86,7 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries /* Also the *_bulk() variants by only checking prefixes. 
*/ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") || str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc")) goto found; diff --git a/mm/slab.c b/mm/slab.c index 10e96137b44f..a5486ff8362a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3181,84 +3181,46 @@ must_grow: } static __always_inline void * -slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size, - unsigned long caller) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - unsigned long save_flags; - void *ptr; + void *objp = NULL; int slab_node = numa_mem_id(); - struct obj_cgroup *objcg = NULL; - bool init = false; - flags &= gfp_allowed_mask; - cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags); - if (unlikely(!cachep)) - return NULL; - - ptr = kfence_alloc(cachep, orig_size, flags); - if (unlikely(ptr)) - goto out_hooks; - - local_irq_save(save_flags); - - if (nodeid == NUMA_NO_NODE) - nodeid = slab_node; - - if (unlikely(!get_node(cachep, nodeid))) { - /* Node not bootstrapped yet */ - ptr = fallback_alloc(cachep, flags); - goto out; - } - - if (nodeid == slab_node) { + if (nodeid == NUMA_NO_NODE) { + if (current->mempolicy || cpuset_do_slab_mem_spread()) { + objp = alternate_node_alloc(cachep, flags); + if (objp) + goto out; + } /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback * to other nodes. It may fail while we still have * objects on other nodes available. */ - ptr = ____cache_alloc(cachep, flags); - if (ptr) - goto out; - } - /* ___cache_alloc_node can fall back to other nodes */ - ptr = ____cache_alloc_node(cachep, flags, nodeid); -out: - local_irq_restore(save_flags); - ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - init = slab_want_init_on_alloc(flags, cachep); - -out_hooks: - slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init); - return ptr; -} - -static __always_inline void * -__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) -{ - void *objp; - - if (current->mempolicy || cpuset_do_slab_mem_spread()) { - objp = alternate_node_alloc(cache, flags); - if (objp) - goto out; + objp = ____cache_alloc(cachep, flags); + nodeid = slab_node; + } else if (nodeid == slab_node) { + objp = ____cache_alloc(cachep, flags); + } else if (!get_node(cachep, nodeid)) { + /* Node not bootstrapped yet */ + objp = fallback_alloc(cachep, flags); + goto out; } - objp = ____cache_alloc(cache, flags); /* * We may just have run out of memory on the local node. 
* ____cache_alloc_node() knows how to locate memory on other nodes */ if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_mem_id()); - + objp = ____cache_alloc_node(cachep, flags, nodeid); out: return objp; } #else static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused) { return ____cache_alloc(cachep, flags); } @@ -3266,8 +3228,8 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, - size_t orig_size, unsigned long caller) +slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + int nodeid, size_t orig_size, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3284,7 +3246,7 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, goto out; local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags); + objp = __do_cache_alloc(cachep, flags, nodeid); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); @@ -3295,6 +3257,14 @@ out: return objp; } +static __always_inline void * +slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + size_t orig_size, unsigned long caller) +{ + return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size, + caller); +} + /* * Caller needs to acquire correct kmem_cache_node's list_lock * @list: List of detached free slabs should be freed by caller @@ -3470,8 +3440,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, { void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, - cachep->object_size, cachep->size, flags); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE); return ret; } @@ -3521,7 +3490,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_irq_disable(); for (i = 0; i < size; i++) { - void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags); + void *objp = kfence_alloc(s, s->object_size, flags) ?: + __do_cache_alloc(s, flags, NUMA_NO_NODE); if (unlikely(!objp)) goto error; @@ -3548,23 +3518,6 @@ error: } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -#ifdef CONFIG_TRACING -void * -kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) -{ - void *ret; - - ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(_RET_IP_, ret, cachep, - size, cachep->size, flags); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_trace); -#endif - -#ifdef CONFIG_NUMA /** * kmem_cache_alloc_node - Allocate an object on the specified node * @cachep: The cache to allocate from. 
@@ -3580,66 +3533,22 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_); + void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep, - cachep->object_size, cachep->size, - flags, nodeid); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, - gfp_t flags, - int nodeid, - size_t size) +void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid, size_t orig_size, + unsigned long caller) { - void *ret; - - ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc_node(_RET_IP_, ret, cachep, - size, cachep->size, - flags, nodeid); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif - -static __always_inline void * -__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) -{ - struct kmem_cache *cachep; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; - cachep = kmalloc_slab(size, flags); - if (unlikely(ZERO_OR_NULL_PTR(cachep))) - return cachep; - ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); - ret = kasan_kmalloc(cachep, ret, size, flags); - - return ret; + return slab_alloc_node(cachep, NULL, flags, nodeid, + orig_size, caller); } -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __do_kmalloc_node(size, flags, node, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc_node); - -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, unsigned long caller) -{ - return __do_kmalloc_node(size, flags, node, caller); -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_PRINTK void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { @@ -3662,45 +3571,25 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) } #endif -/** - * __do_kmalloc - allocate memory - * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate (see kmalloc). 
- * @caller: function caller for debug tracking of the caller - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, - unsigned long caller) +static __always_inline +void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) { - struct kmem_cache *cachep; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; - cachep = kmalloc_slab(size, flags); - if (unlikely(ZERO_OR_NULL_PTR(cachep))) - return cachep; - ret = slab_alloc(cachep, NULL, flags, size, caller); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(caller, ret, cachep, - size, cachep->size, flags); - - return ret; -} + unsigned long flags; -void *__kmalloc(size_t size, gfp_t flags) -{ - return __do_kmalloc(size, flags, _RET_IP_); + local_irq_save(flags); + debug_check_no_locks_freed(objp, cachep->object_size); + if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, cachep->object_size); + __cache_free(cachep, objp, caller); + local_irq_restore(flags); } -EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) +void __kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) { - return __do_kmalloc(size, flags, caller); + __do_kmem_cache_free(cachep, objp, caller); } -EXPORT_SYMBOL(__kmalloc_track_caller); /** * kmem_cache_free - Deallocate an object @@ -3712,34 +3601,38 @@ EXPORT_SYMBOL(__kmalloc_track_caller); */ void kmem_cache_free(struct kmem_cache *cachep, void *objp) { - unsigned long flags; cachep = cache_from_obj(cachep, objp); if (!cachep) return; - trace_kmem_cache_free(_RET_IP_, objp, cachep->name); - local_irq_save(flags); - debug_check_no_locks_freed(objp, cachep->object_size); - if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(objp, cachep->object_size); - __cache_free(cachep, objp, _RET_IP_); - local_irq_restore(flags); + trace_kmem_cache_free(_RET_IP_, objp, cachep); + __do_kmem_cache_free(cachep, objp, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) { - struct kmem_cache *s; - size_t i; local_irq_disable(); - for (i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { void *objp = p[i]; + struct kmem_cache *s; - if (!orig_s) /* called via kfree_bulk */ - s = virt_to_cache(objp); - else + if (!orig_s) { + struct folio *folio = virt_to_folio(objp); + + /* called via kfree_bulk */ + if (!folio_test_slab(folio)) { + local_irq_enable(); + free_large_kmalloc(folio, objp); + local_irq_disable(); + continue; + } + s = folio_slab(folio)->slab_cache; + } else { s = cache_from_obj(orig_s, objp); + } + if (!s) continue; @@ -3755,39 +3648,6 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -/** - * kfree - free previously allocated memory - * @objp: pointer returned by kmalloc. - * - * If @objp is NULL, no operation is performed. - * - * Don't free memory not originally allocated by kmalloc() - * or you will run into trouble. 
- */ -void kfree(const void *objp) -{ - struct kmem_cache *c; - unsigned long flags; - - trace_kfree(_RET_IP_, objp); - - if (unlikely(ZERO_OR_NULL_PTR(objp))) - return; - local_irq_save(flags); - kfree_debugcheck(objp); - c = virt_to_cache(objp); - if (!c) { - local_irq_restore(flags); - return; - } - debug_check_no_locks_freed(objp, c->object_size); - - debug_check_no_obj_freed(objp, c->object_size); - __cache_free(c, (void *)objp, _RET_IP_); - local_irq_restore(flags); -} -EXPORT_SYMBOL(kfree); - /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ @@ -4190,28 +4050,3 @@ void __check_heap_object(const void *ptr, unsigned long n, usercopy_abort("SLAB object", cachep->name, to_user, offset, n); } #endif /* CONFIG_HARDENED_USERCOPY */ - -/** - * __ksize -- Uninstrumented ksize. - * @objp: pointer to the object - * - * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same - * safety checks as ksize() with KASAN instrumentation enabled. - * - * Return: size of the actual memory used by @objp in bytes - */ -size_t __ksize(const void *objp) -{ - struct kmem_cache *c; - size_t size; - - BUG_ON(!objp); - if (unlikely(objp == ZERO_SIZE_PTR)) - return 0; - - c = virt_to_cache(objp); - size = c ? c->object_size : 0; - - return size; -} -EXPORT_SYMBOL(__ksize); diff --git a/mm/slab.h b/mm/slab.h index 4ec82bec15ec..65023f000d42 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -273,6 +273,11 @@ void create_kmalloc_caches(slab_flags_t); /* Find the kmalloc slab corresponding for a certain size */ struct kmem_cache *kmalloc_slab(size_t, gfp_t); + +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller); +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); #endif gfp_t kmalloc_fix_flags(gfp_t flags); @@ -658,8 +663,13 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) print_tracking(cachep, x); return cachep; } + +void free_large_kmalloc(struct folio *folio, void *object); + #endif /* CONFIG_SLOB */ +size_t __ksize(const void *objp); + static inline size_t slab_ksize(const struct kmem_cache *s) { #ifndef CONFIG_SLUB diff --git a/mm/slab_common.c b/mm/slab_common.c index 0dfa3cfb6be5..46b2df8a7532 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -392,6 +392,28 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, } EXPORT_SYMBOL(kmem_cache_create); +#ifdef SLAB_SUPPORTS_SYSFS +/* + * For a given kmem_cache, kmem_cache_destroy() should only be called + * once or there will be a use-after-free problem. The actual deletion + * and release of the kobject does not need slab_mutex or cpu_hotplug_lock + * protection. So they are now done without holding those locks. + * + * Note that there will be a slight delay in the deletion of sysfs files + * if kmem_cache_release() is called indrectly from a work function. 
+ */ +static void kmem_cache_release(struct kmem_cache *s) +{ + sysfs_slab_unlink(s); + sysfs_slab_release(s); +} +#else +static void kmem_cache_release(struct kmem_cache *s) +{ + slab_kmem_cache_release(s); +} +#endif + static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) { LIST_HEAD(to_destroy); @@ -418,11 +440,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) list_for_each_entry_safe(s, s2, &to_destroy, list) { debugfs_slab_release(s); kfence_shutdown_cache(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_release(s); -#else - slab_kmem_cache_release(s); -#endif + kmem_cache_release(s); } } @@ -437,20 +455,11 @@ static int shutdown_cache(struct kmem_cache *s) list_del(&s->list); if (s->flags & SLAB_TYPESAFE_BY_RCU) { -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_unlink(s); -#endif list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { kfence_shutdown_cache(s); debugfs_slab_release(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_unlink(s); - sysfs_slab_release(s); -#else - slab_kmem_cache_release(s); -#endif } return 0; @@ -465,14 +474,19 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { + int refcnt; + bool rcu_set; + if (unlikely(!s) || !kasan_check_byte(s)) return; cpus_read_lock(); mutex_lock(&slab_mutex); - s->refcount--; - if (s->refcount) + rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; + + refcnt = --s->refcount; + if (refcnt) goto out_unlock; WARN(shutdown_cache(s), @@ -481,6 +495,8 @@ void kmem_cache_destroy(struct kmem_cache *s) out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); + if (!refcnt && !rcu_set) + kmem_cache_release(s); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -740,8 +756,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) /* * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. - * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is - * kmalloc-32M. + * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is + * kmalloc-2M. 
*/ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(0, 0), @@ -765,11 +781,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(262144, 256k), INIT_KMALLOC_INFO(524288, 512k), INIT_KMALLOC_INFO(1048576, 1M), - INIT_KMALLOC_INFO(2097152, 2M), - INIT_KMALLOC_INFO(4194304, 4M), - INIT_KMALLOC_INFO(8388608, 8M), - INIT_KMALLOC_INFO(16777216, 16M), - INIT_KMALLOC_INFO(33554432, 32M) + INIT_KMALLOC_INFO(2097152, 2M) }; /* @@ -882,6 +894,144 @@ void __init create_kmalloc_caches(slab_flags_t flags) /* Kmalloc array is now usable */ slab_state = UP; } + +void free_large_kmalloc(struct folio *folio, void *object) +{ + unsigned int order = folio_order(folio); + + if (WARN_ON_ONCE(order == 0)) + pr_warn_once("object pointer: 0x%p\n", object); + + kmemleak_free(object); + kasan_kfree_large(object); + + mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __free_pages(folio_page(folio, 0), order); +} + +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node); +static __always_inline +void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = __kmalloc_large_node(size, flags, node); + trace_kmalloc(_RET_IP_, ret, size, + PAGE_SIZE << get_order(size), flags, node); + return ret; + } + + s = kmalloc_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + ret = __kmem_cache_alloc_node(s, flags, node, size, caller); + ret = kasan_kmalloc(s, ret, size, flags); + trace_kmalloc(_RET_IP_, ret, size, s->size, flags, node); + return ret; +} + +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, flags, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); + +/** + * kfree - free previously allocated memory + * @object: pointer returned by kmalloc. + * + * If @object is NULL, no operation is performed. + * + * Don't free memory not originally allocated by kmalloc() + * or you will run into trouble. + */ +void kfree(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + + trace_kfree(_RET_IP_, object); + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + free_large_kmalloc(folio, (void *)object); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + __kmem_cache_free(s, (void *)object, _RET_IP_); +} +EXPORT_SYMBOL(kfree); + +/* Uninstrumented ksize. Only called by KASAN. 
*/ +size_t __ksize(const void *object) +{ + struct folio *folio; + + if (unlikely(object == ZERO_SIZE_PTR)) + return 0; + + folio = virt_to_folio(object); + + if (unlikely(!folio_test_slab(folio))) { + if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) + return 0; + if (WARN_ON(object != folio_address(folio))) + return 0; + return folio_size(folio); + } + + return slab_ksize(folio_slab(folio)->slab_cache); +} + +#ifdef CONFIG_TRACING +void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, + size, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_trace); + +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) +{ + void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_node_trace); +#endif /* !CONFIG_TRACING */ #endif /* !CONFIG_SLOB */ gfp_t kmalloc_fix_flags(gfp_t flags) @@ -901,37 +1051,50 @@ gfp_t kmalloc_fix_flags(gfp_t flags) * directly to the page allocator. We use __GFP_COMP, because we will need to * know the allocation order to free the pages properly in kfree. */ -void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) + +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) { - void *ret = NULL; struct page *page; + void *ptr = NULL; + unsigned int order = get_order(size); if (unlikely(flags & GFP_SLAB_BUG_MASK)) flags = kmalloc_fix_flags(flags); flags |= __GFP_COMP; - page = alloc_pages(flags, order); - if (likely(page)) { - ret = page_address(page); + page = alloc_pages_node(node, flags, order); + if (page) { + ptr = page_address(page); mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); } - ret = kasan_kmalloc_large(ret, size, flags); - /* As ret might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ret, size, 1, flags); + + ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc(ptr, size, 1, flags); + + return ptr; +} + +void *kmalloc_large(size_t size, gfp_t flags) +{ + void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmalloc_order); +EXPORT_SYMBOL(kmalloc_large); -#ifdef CONFIG_TRACING -void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +void *kmalloc_large_node(size_t size, gfp_t flags, int node) { - void *ret = kmalloc_order(size, flags, order); - trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags); + void *ret = __kmalloc_large_node(size, flags, node); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, node); return ret; } -EXPORT_SYMBOL(kmalloc_order_trace); -#endif +EXPORT_SYMBOL(kmalloc_large_node); #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ @@ -1263,8 +1426,6 @@ EXPORT_SYMBOL(ksize); /* Tracepoints definitions. 
*/ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/slob.c b/mm/slob.c index 2bd4f476c340..45a061b8ba38 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -507,8 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) *m = size; ret = (void *)m + minalign; - trace_kmalloc_node(caller, ret, NULL, - size, size + minalign, gfp, node); + trace_kmalloc(caller, ret, size, size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -516,8 +515,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << order, gfp, node); + trace_kmalloc(caller, ret, size, PAGE_SIZE << order, gfp, node); } kmemleak_alloc(ret, size, 1, gfp); @@ -530,20 +528,12 @@ void *__kmalloc(size_t size, gfp_t gfp) } EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) -{ - return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); -} -EXPORT_SYMBOL(__kmalloc_track_caller); - -#ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, int node, unsigned long caller) { return __do_kmalloc_node(size, gfp, node, caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif void kfree(const void *block) { @@ -594,7 +584,6 @@ size_t __ksize(const void *block) m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } -EXPORT_SYMBOL(__ksize); int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) { @@ -602,6 +591,9 @@ int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } + + /* Actual size allocated */ + c->size = SLOB_UNITS(c->size) * SLOB_UNIT; c->flags = flags; return 0; } @@ -616,14 +608,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node, 0); - trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, - SLOB_UNITS(c->size) * SLOB_UNIT, - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, - PAGE_SIZE << get_order(c->size), - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); } if (b && c->ctor) { @@ -647,7 +635,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_ return slob_alloc_node(cachep, flags, NUMA_NO_NODE); } EXPORT_SYMBOL(kmem_cache_alloc_lru); -#ifdef CONFIG_NUMA + void *__kmalloc_node(size_t size, gfp_t gfp, int node) { return __do_kmalloc_node(size, gfp, node, _RET_IP_); @@ -659,7 +647,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node) return slob_alloc_node(cachep, gfp, node); } EXPORT_SYMBOL(kmem_cache_alloc_node); -#endif static void __kmem_cache_free(void *b, int size) { @@ -680,7 +667,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { kmemleak_free_recursive(b, c->flags); - trace_kmem_cache_free(_RET_IP_, b, c->name); + trace_kmem_cache_free(_RET_IP_, b, c); if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size 
- sizeof(struct slob_rcu)); diff --git a/mm/slub.c b/mm/slub.c index 8f80d9bc507f..4aa18dd9c4e2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -50,7 +50,7 @@ * 1. slab_mutex (Global Mutex) * 2. node->list_lock (Spinlock) * 3. kmem_cache->cpu_slab->lock (Local lock) - * 4. slab_lock(slab) (Only on some arches or for debugging) + * 4. slab_lock(slab) (Only on some arches) * 5. object_map_lock (Only for debugging) * * slab_mutex @@ -64,8 +64,9 @@ * The slab_lock is a wrapper around the page lock, thus it is a bit * spinlock. * - * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects: + * The slab_lock is only used on arches that do not have the ability + * to do a cmpxchg_double. It only protects: + * * A. slab->freelist -> List of free objects in a slab * B. slab->inuse -> Number of objects in use * C. slab->objects -> Number of objects in slab @@ -94,15 +95,20 @@ * allocating a long series of objects that fill up slabs does not require * the list lock. * + * For debug caches, all allocations are forced to go through a list_lock + * protected region to serialize against concurrent validation. + * * cpu_slab->lock local lock * * This locks protect slowpath manipulation of all kmem_cache_cpu fields * except the stat counters. This is a percpu structure manipulated only by * the local cpu, so the lock protects against being preempted or interrupted * by an irq. Fast path operations rely on lockless operations instead. - * On PREEMPT_RT, the local lock does not actually disable irqs (and thus - * prevent the lockless operations), so fastpath operations also need to take - * the lock and are no longer lockless. + * + * On PREEMPT_RT, the local lock neither disables interrupts nor preemption + * which means the lockless fastpath cannot be used as it might interfere with + * an in-progress slow path operations. In this case the local lock is always + * taken but it still utilizes the freelist for the common operations. * * lockless fastpaths * @@ -163,8 +169,9 @@ * function call even on !PREEMPT_RT, use inline preempt_disable() there. */ #ifndef CONFIG_PREEMPT_RT -#define slub_get_cpu_ptr(var) get_cpu_ptr(var) -#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define slub_get_cpu_ptr(var) get_cpu_ptr(var) +#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define USE_LOCKLESS_FAST_PATH() (true) #else #define slub_get_cpu_ptr(var) \ ({ \ @@ -176,6 +183,7 @@ do { \ (void)(var); \ migrate_enable(); \ } while (0) +#define USE_LOCKLESS_FAST_PATH() (false) #endif #ifdef CONFIG_SLUB_DEBUG @@ -310,6 +318,11 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) */ static nodemask_t slab_nodes; +/* + * Workqueue used for flush_cpu_slab(). 
+ */ +static struct workqueue_struct *flushwq; + /******************************************************************** * Core slab cache functions *******************************************************************/ @@ -442,7 +455,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) /* * Per slab locking using the pagelock */ -static __always_inline void __slab_lock(struct slab *slab) +static __always_inline void slab_lock(struct slab *slab) { struct page *page = slab_page(slab); @@ -450,7 +463,7 @@ static __always_inline void __slab_lock(struct slab *slab) bit_spin_lock(PG_locked, &page->flags); } -static __always_inline void __slab_unlock(struct slab *slab) +static __always_inline void slab_unlock(struct slab *slab) { struct page *page = slab_page(slab); @@ -458,31 +471,19 @@ static __always_inline void __slab_unlock(struct slab *slab) __bit_spin_unlock(PG_locked, &page->flags); } -static __always_inline void slab_lock(struct slab *slab, unsigned long *flags) -{ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_save(*flags); - __slab_lock(slab); -} - -static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags) -{ - __slab_unlock(slab); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(*flags); -} - /* * Interrupts must be disabled (for the fallback code to work right), typically - * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different - * so we disable interrupts as part of slab_[un]lock(). + * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is + * part of bit_spin_lock(), is sufficient because the policy is not to allow any + * allocation/ free operation in hardirq context. Therefore nothing can + * interrupt the operation. */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + if (USE_LOCKLESS_FAST_PATH()) lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) @@ -494,18 +495,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab } else #endif { - /* init to 0 to prevent spurious warnings */ - unsigned long flags = 0; - - slab_lock(slab, &flags); + slab_lock(slab); if (slab->freelist == freelist_old && slab->counters == counters_old) { slab->freelist = freelist_new; slab->counters = counters_new; - slab_unlock(slab, &flags); + slab_unlock(slab); return true; } - slab_unlock(slab, &flags); + slab_unlock(slab); } cpu_relax(); @@ -536,16 +534,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, unsigned long flags; local_irq_save(flags); - __slab_lock(slab); + slab_lock(slab); if (slab->freelist == freelist_old && slab->counters == counters_old) { slab->freelist = freelist_new; slab->counters = counters_new; - __slab_unlock(slab); + slab_unlock(slab); local_irq_restore(flags); return true; } - __slab_unlock(slab); + slab_unlock(slab); local_irq_restore(flags); } @@ -561,7 +559,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; -static DEFINE_RAW_SPINLOCK(object_map_lock); +static DEFINE_SPINLOCK(object_map_lock); static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, struct slab *slab) @@ -595,30 +593,6 @@ static bool slab_add_kunit_errors(void) 
static inline bool slab_add_kunit_errors(void) { return false; } #endif -/* - * Determine a map of objects in use in a slab. - * - * Node listlock must be held to guarantee that the slab does - * not vanish from under us. - */ -static unsigned long *get_map(struct kmem_cache *s, struct slab *slab) - __acquires(&object_map_lock) -{ - VM_BUG_ON(!irqs_disabled()); - - raw_spin_lock(&object_map_lock); - - __fill_map(object_map, s, slab); - - return object_map; -} - -static void put_map(unsigned long *map) __releases(&object_map_lock) -{ - VM_BUG_ON(map != object_map); - raw_spin_unlock(&object_map_lock); -} - static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) @@ -1324,17 +1298,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, } static noinline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, - void *object, unsigned long addr) + struct slab *slab, void *object) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!alloc_consistency_checks(s, slab, object)) goto bad; } - /* Success perform special debug activities for allocs */ - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_ALLOC, addr); + /* Success. Perform special debug activities for allocs */ trace(s, slab, object, 1); init_object(s, object, SLUB_RED_ACTIVE); return 1; @@ -1385,63 +1356,6 @@ static inline int free_consistency_checks(struct kmem_cache *s, return 1; } -/* Supports checking bulk free of a constructed freelist */ -static noinline int free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) -{ - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - void *object = head; - int cnt = 0; - unsigned long flags, flags2; - int ret = 0; - depot_stack_handle_t handle = 0; - - if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); - - spin_lock_irqsave(&n->list_lock, flags); - slab_lock(slab, &flags2); - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!check_slab(s, slab)) - goto out; - } - -next_object: - cnt++; - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!free_consistency_checks(s, slab, object, addr)) - goto out; - } - - if (s->flags & SLAB_STORE_USER) - set_track_update(s, object, TRACK_FREE, addr, handle); - trace(s, slab, object, 0); - /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ - init_object(s, object, SLUB_RED_INACTIVE); - - /* Reached end of constructed freelist yet? */ - if (object != tail) { - object = get_freepointer(s, object); - goto next_object; - } - ret = 1; - -out: - if (cnt != bulk_cnt) - slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", - bulk_cnt, cnt); - - slab_unlock(slab, &flags2); - spin_unlock_irqrestore(&n->list_lock, flags); - if (!ret) - slab_fix(s, "Object at 0x%p not freed", object); - return ret; -} - /* * Parse a block of slub_debug options. 
Blocks are delimited by ';' * @@ -1661,16 +1575,18 @@ static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object, unsigned long addr) { return 0; } + struct slab *slab, void *object) { return 0; } -static inline int free_debug_processing( +static inline void free_debug_processing( struct kmem_cache *s, struct slab *slab, void *head, void *tail, int bulk_cnt, - unsigned long addr) { return 0; } + unsigned long addr) {} static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } +static inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, @@ -1704,20 +1620,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ -static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - ptr = kasan_kmalloc_large(ptr, size, flags); - /* As ptr might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ptr, size, 1, flags); - return ptr; -} - -static __always_inline void kfree_hook(void *x) -{ - kmemleak_free(x); - kasan_kfree_large(x); -} - static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x, bool init) { @@ -1976,11 +1878,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) - goto out; + return NULL; stat(s, ORDER_FALLBACK); } slab->objects = oo_objects(oo); + slab->inuse = 0; + slab->frozen = 0; account_slab(slab, oo_order(oo), s, flags); @@ -2007,15 +1911,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - slab->inuse = slab->objects; - slab->frozen = 1; - -out: - if (!slab) - return NULL; - - inc_slabs_node(s, slab_nid(slab), slab->objects); - return slab; } @@ -2103,6 +1998,75 @@ static inline void remove_partial(struct kmem_cache_node *n, } /* + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move + * it to full list if it was the last free object. + */ +static void *alloc_single_from_partial(struct kmem_cache *s, + struct kmem_cache_node *n, struct slab *slab) +{ + void *object; + + lockdep_assert_held(&n->list_lock); + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse++; + + if (!alloc_debug_processing(s, slab, object)) { + remove_partial(n, slab); + return NULL; + } + + if (slab->inuse == slab->objects) { + remove_partial(n, slab); + add_full(s, n, slab); + } + + return object; +} + +/* + * Called only for kmem_cache_debug() caches to allocate from a freshly + * allocated slab. Allocate a single object instead of whole freelist + * and put the slab to the partial (or full) list. 
+ */ +static void *alloc_single_from_new_slab(struct kmem_cache *s, + struct slab *slab) +{ + int nid = slab_nid(slab); + struct kmem_cache_node *n = get_node(s, nid); + unsigned long flags; + void *object; + + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse = 1; + + if (!alloc_debug_processing(s, slab, object)) + /* + * It's not really expected that this would fail on a + * freshly allocated slab, but a concurrent memory + * corruption in theory could cause that. + */ + return NULL; + + spin_lock_irqsave(&n->list_lock, flags); + + if (slab->inuse == slab->objects) + add_full(s, n, slab); + else + add_partial(n, slab, DEACTIVATE_TO_HEAD); + + inc_slabs_node(s, nid, slab->objects); + spin_unlock_irqrestore(&n->list_lock, flags); + + return object; +} + +/* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. * @@ -2182,6 +2146,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!pfmemalloc_match(slab, gfpflags)) continue; + if (kmem_cache_debug(s)) { + object = alloc_single_from_partial(s, n, slab); + if (object) + break; + continue; + } + t = acquire_slab(s, n, slab, object == NULL); if (!t) break; @@ -2730,7 +2701,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s) INIT_WORK(&sfw->work, flush_cpu_slab); sfw->skip = false; sfw->s = s; - schedule_work_on(cpu, &sfw->work); + queue_work_on(cpu, flushwq, &sfw->work); } for_each_online_cpu(cpu) { @@ -2788,6 +2759,110 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) { return atomic_long_read(&n->total_objects); } + +/* Supports checking bulk free of a constructed freelist */ +static noinline void free_debug_processing( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_free = NULL; + void *object = head; + int cnt = 0; + unsigned long flags; + bool checks_ok = false; + depot_stack_handle_t handle = 0; + + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(); + + spin_lock_irqsave(&n->list_lock, flags); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, slab)) + goto out; + } + + if (slab->inuse < bulk_cnt) { + slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", + slab->inuse, bulk_cnt); + goto out; + } + +next_object: + + if (++cnt > bulk_cnt) + goto out_cnt; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, slab, object, addr)) + goto out; + } + + if (s->flags & SLAB_STORE_USER) + set_track_update(s, object, TRACK_FREE, addr, handle); + trace(s, slab, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ + init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? */ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } + checks_ok = true; + +out_cnt: + if (cnt != bulk_cnt) + slab_err(s, slab, "Bulk free expected %d objects but found %d\n", + bulk_cnt, cnt); + +out: + if (checks_ok) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* Do we need to remove the slab from full or partial list? 
*/ + if (!prior) { + remove_full(s, n, slab); + } else if (slab->inuse == 0 && + n->nr_partial >= s->min_partial) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + + /* Do we need to discard the slab or add to partial list? */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) { + slab_free = slab; + } else if (!prior) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } + + if (slab_free) { + /* + * Update the counters while still holding n->list_lock to + * prevent spurious validation warnings + */ + dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (!checks_ok) + slab_fix(s, "Object at 0x%p not freed", object); + + if (slab_free) { + stat(s, FREE_SLAB); + free_slab(s, slab_free); + } +} #endif /* CONFIG_SLUB_DEBUG */ #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) @@ -3036,36 +3111,52 @@ new_objects: return NULL; } + stat(s, ALLOC_SLAB); + + if (kmem_cache_debug(s)) { + freelist = alloc_single_from_new_slab(s, slab); + + if (unlikely(!freelist)) + goto new_objects; + + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; + } + /* * No other reference to the slab yet so we can * muck around with it freely without cmpxchg */ freelist = slab->freelist; slab->freelist = NULL; + slab->inuse = slab->objects; + slab->frozen = 1; - stat(s, ALLOC_SLAB); + inc_slabs_node(s, slab_nid(slab), slab->objects); check_new_slab: if (kmem_cache_debug(s)) { - if (!alloc_debug_processing(s, slab, freelist, addr)) { - /* Slab failed checks. Next slab needed */ - goto new_slab; - } else { - /* - * For debug case, we don't load freelist so that all - * allocations go through alloc_debug_processing() - */ - goto return_single; - } + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the tracking info + * and return the object + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + return freelist; } - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags))) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. */ - goto return_single; + deactivate_slab(s, slab, get_freepointer(s, freelist)); + return freelist; + } retry_load_slab: @@ -3089,11 +3180,6 @@ retry_load_slab: c->slab = slab; goto load_freelist; - -return_single: - - deactivate_slab(s, slab, get_freepointer(s, freelist)); - return freelist; } /* @@ -3197,14 +3283,8 @@ redo: object = c->freelist; slab = c->slab; - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if a - * slowpath has taken the local_lock_irqsave(), it is not protected - * against a fast path operation in an irq handler. So we need to take - * the slow path which uses local_lock. It is still relatively fast if - * there is a suitable cpu freelist. 
- */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) || + + if (!USE_LOCKLESS_FAST_PATH() || unlikely(!object || !slab || !node_match(slab, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { @@ -3257,8 +3337,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, { void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); - trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size, - s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); return ret; } @@ -3276,46 +3355,24 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, } EXPORT_SYMBOL(kmem_cache_alloc_lru); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller) { - void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size); - trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags); - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; + return slab_alloc_node(s, NULL, gfpflags, node, + caller, orig_size); } -EXPORT_SYMBOL(kmem_cache_alloc_trace); -#endif -#ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); - trace_kmem_cache_alloc_node(_RET_IP_, ret, s, - s->object_size, s->size, gfpflags, node); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) -{ - void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); - - trace_kmalloc_node(_RET_IP_, ret, s, - size, s->size, gfpflags, node); - - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif -#endif /* CONFIG_NUMA */ - /* * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -3341,9 +3398,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, if (kfence_free(head)) return; - if (kmem_cache_debug(s) && - !free_debug_processing(s, slab, head, tail, cnt, addr)) + if (kmem_cache_debug(s)) { + free_debug_processing(s, slab, head, tail, cnt, addr); return; + } do { if (unlikely(n)) { @@ -3463,6 +3521,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; + void **freelist; redo: /* @@ -3477,9 +3536,13 @@ redo: /* Same with comment on barrier() in slab_alloc_node() */ barrier(); - if (likely(slab == c->slab)) { -#ifndef CONFIG_PREEMPT_RT - void **freelist = READ_ONCE(c->freelist); + if (unlikely(slab != c->slab)) { + __slab_free(s, slab, head, tail_obj, cnt, addr); + return; + } + + if (USE_LOCKLESS_FAST_PATH()) { + freelist = READ_ONCE(c->freelist); set_freepointer(s, tail_obj, freelist); @@ -3491,16 +3554,8 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } -#else /* CONFIG_PREEMPT_RT */ - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if - * a slowpath has taken the local_lock_irqsave(), it is not - * protected against a fast path operation in an irq handler. So - * we need to take the local_lock. We shouldn't simply defer to - * __slab_free() as that wouldn't use the cpu freelist at all. 
- */ - void **freelist; - + } else { + /* Update the free list under the local lock */ local_lock(&s->cpu_slab->lock); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { @@ -3515,11 +3570,8 @@ redo: c->tid = next_tid(tid); local_unlock(&s->cpu_slab->lock); -#endif - stat(s, FREE_FASTPATH); - } else - __slab_free(s, slab, head, tail_obj, cnt, addr); - + } + stat(s, FREE_FASTPATH); } static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, @@ -3542,12 +3594,17 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) } #endif +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) +{ + slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); +} + void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); if (!s) return; - trace_kmem_cache_free(_RET_IP_, x, s->name); + trace_kmem_cache_free(_RET_IP_, x, s); slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); @@ -3560,19 +3617,6 @@ struct detached_freelist { struct kmem_cache *s; }; -static inline void free_large_kmalloc(struct folio *folio, void *object) -{ - unsigned int order = folio_order(folio); - - if (WARN_ON_ONCE(order == 0)) - pr_warn_once("object pointer: 0x%p\n", object); - - kfree_hook(object); - mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); - __free_pages(folio_page(folio, 0), order); -} - /* * This function progressively scans the array with free objects (with * a limited look ahead) and extract objects belonging to the same @@ -3936,6 +3980,7 @@ static void early_kmem_cache_node_alloc(int node) slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!slab); + inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects); if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); @@ -3950,7 +3995,6 @@ static void early_kmem_cache_node_alloc(int node) n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; - slab->frozen = 0; kmem_cache_node->node[node] = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); @@ -4237,23 +4281,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); - unsigned long flags; - unsigned long *map; void *p; slab_err(s, slab, text, s->name); - slab_lock(slab, &flags); - map = get_map(s, slab); + spin_lock(&object_map_lock); + __fill_map(object_map, s, slab); + for_each_object(p, s, addr, slab->objects) { - if (!test_bit(__obj_to_index(s, addr, p), map)) { + if (!test_bit(__obj_to_index(s, addr, p), object_map)) { pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } - put_map(map); - slab_unlock(slab, &flags); + spin_unlock(&object_map_lock); #endif } @@ -4404,78 +4446,6 @@ static int __init setup_slub_min_objects(char *str) __setup("slub_min_objects=", setup_slub_min_objects); -void *__kmalloc(size_t size, gfp_t flags) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, flags); - - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc(s, NULL, flags, _RET_IP_, size); - - trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags); - - ret = kasan_kmalloc(s, ret, size, flags); - - return 
ret; -} -EXPORT_SYMBOL(__kmalloc); - -#ifdef CONFIG_NUMA -static void *kmalloc_large_node(size_t size, gfp_t flags, int node) -{ - struct page *page; - void *ptr = NULL; - unsigned int order = get_order(size); - - flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, order); - if (page) { - ptr = page_address(page); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); - } - - return kmalloc_large_node_hook(ptr, size, flags); -} - -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, flags, node); - - trace_kmalloc_node(_RET_IP_, ret, NULL, - size, PAGE_SIZE << get_order(size), - flags, node); - - return ret; - } - - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size); - - trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node); - - ret = kasan_kmalloc(s, ret, size, flags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_node); -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_HARDENED_USERCOPY /* * Rejects incorrectly sized objects and objects that are to be copied @@ -4526,43 +4496,6 @@ void __check_heap_object(const void *ptr, unsigned long n, } #endif /* CONFIG_HARDENED_USERCOPY */ -size_t __ksize(const void *object) -{ - struct folio *folio; - - if (unlikely(object == ZERO_SIZE_PTR)) - return 0; - - folio = virt_to_folio(object); - - if (unlikely(!folio_test_slab(folio))) - return folio_size(folio); - - return slab_ksize(folio_slab(folio)->slab_cache); -} -EXPORT_SYMBOL(__ksize); - -void kfree(const void *x) -{ - struct folio *folio; - struct slab *slab; - void *object = (void *)x; - - trace_kfree(_RET_IP_, x); - - if (unlikely(ZERO_OR_NULL_PTR(x))) - return; - - folio = virt_to_folio(x); - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, object); - return; - } - slab = folio_slab(folio); - slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_); -} -EXPORT_SYMBOL(kfree); - #define SHRINK_PROMOTE_MAX 32 /* @@ -4611,6 +4544,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) if (free == slab->objects) { list_move(&slab->slab_list, &discard); n->nr_partial--; + dec_slabs_node(s, node, slab->objects); } else if (free <= SHRINK_PROMOTE_MAX) list_move(&slab->slab_list, promote + free - 1); } @@ -4626,7 +4560,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) /* Release empty slabs */ list_for_each_entry_safe(slab, t, &discard, slab_list) - discard_slab(s, slab); + free_slab(s, slab); if (slabs_node(s, node)) ret = 1; @@ -4858,6 +4792,8 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { + flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); + WARN_ON(!flushwq); } struct kmem_cache * @@ -4908,60 +4844,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) return 0; } -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, gfpflags); - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc(s, NULL, gfpflags, caller, size); - - /* Honor the call site pointer we received. 
*/ - trace_kmalloc(caller, ret, s, size, s->size, gfpflags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_track_caller); - -#ifdef CONFIG_NUMA -void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, gfpflags, node); - - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << get_order(size), - gfpflags, node); - - return ret; - } - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size); - - /* Honor the call site pointer we received. */ - trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif - #ifdef CONFIG_SYSFS static int count_inuse(struct slab *slab) { @@ -4980,12 +4862,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, { void *p; void *addr = slab_address(slab); - unsigned long flags; - - slab_lock(slab, &flags); if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) - goto unlock; + return; /* Now we know that a valid freelist exists */ __fill_map(obj_map, s, slab); @@ -4996,8 +4875,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, if (!check_object(s, slab, p, val)) break; } -unlock: - slab_unlock(slab, &flags); } static int validate_slab_node(struct kmem_cache *s, @@ -5601,7 +5478,7 @@ static ssize_t validate_store(struct kmem_cache *s, { int ret = -EINVAL; - if (buf[0] == '1') { + if (buf[0] == '1' && kmem_cache_debug(s)) { ret = validate_slab_cache(s); if (ret >= 0) ret = length; @@ -5885,7 +5762,8 @@ static char *create_unique_id(struct kmem_cache *s) char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); char *p = name; - BUG_ON(!name); + if (!name) + return ERR_PTR(-ENOMEM); *p++ = ':'; /* @@ -5946,6 +5824,8 @@ static int sysfs_slab_add(struct kmem_cache *s) * for the symlinks. */ name = create_unique_id(s); + if (IS_ERR(name)) + return PTR_ERR(name); } s->kobj.kset = kset; |
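
The do_slab_free() and slab_alloc_node() hunks above replace the open-coded IS_ENABLED(CONFIG_PREEMPT_RT) checks with USE_LOCKLESS_FAST_PATH(), a helper defined elsewhere in mm/slub.c. The sketch below is an illustrative, userspace-buildable model of that selection, not kernel code; it assumes the macro expands to (true) on non-PREEMPT_RT configs and to (false) on PREEMPT_RT, so the branch not taken can be dropped at compile time.

/*
 * Illustrative sketch only: models how USE_LOCKLESS_FAST_PATH() is assumed to
 * pick between the cmpxchg-based lockless fast path and the local_lock path
 * at build time. Compile with -DCONFIG_PREEMPT_RT to flip the choice.
 */
#include <stdbool.h>
#include <stdio.h>

#ifndef CONFIG_PREEMPT_RT
#define USE_LOCKLESS_FAST_PATH()	(true)
#else
#define USE_LOCKLESS_FAST_PATH()	(false)
#endif

static void free_object(void *obj)
{
	if (USE_LOCKLESS_FAST_PATH()) {
		/* non-RT: lockless, cmpxchg-style update of the cpu freelist */
		printf("lockless fast path: %p\n", obj);
	} else {
		/* PREEMPT_RT: take the per-CPU lock before touching the freelist */
		printf("local_lock fast path: %p\n", obj);
	}
}

int main(void)
{
	int x;

	free_object(&x);
	return 0;
}

Because the selection is a compile-time constant either way, the two paths can share the surrounding code, which is why the restructured do_slab_free() hunk above moves stat(s, FREE_FASTPATH) out of the per-path branches.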