diff options
-rw-r--r-- | mm/slub.c | 394 |
1 files changed, 188 insertions, 206 deletions
diff --git a/mm/slub.c b/mm/slub.c index ccd57636b739..fac07382d3a6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -77,13 +77,28 @@ * * Frozen slabs * - * If a slab is frozen then it is exempt from list management. It is not - * on any list except per cpu partial list. The processor that froze the + * If a slab is frozen then it is exempt from list management. It is + * the cpu slab which is actively allocated from by the processor that + * froze it and it is not on any list. The processor that froze the * slab is the one who can perform list operations on the slab. Other * processors may put objects onto the freelist but the processor that * froze the slab is the only one that can retrieve the objects from the * slab's freelist. * + * CPU partial slabs + * + * The partially empty slabs cached on the CPU partial list are used + * for performance reasons, which speeds up the allocation process. + * These slabs are not frozen, but are also exempt from list management, + * by clearing the PG_workingset flag when moving out of the node + * partial list. Please see __slab_free() for more details. + * + * To sum up, the current scheme is: + * - node partial slab: PG_Workingset && !frozen + * - cpu partial slab: !PG_Workingset && !frozen + * - cpu slab: !PG_Workingset && frozen + * - full slab: !PG_Workingset && !frozen + * * list_lock * * The list_lock protects the partial and full list on each node and @@ -205,9 +220,9 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); /* Structure holding parameters for get_partial() call chain */ struct partial_context { - struct slab **slab; gfp_t flags; unsigned int orig_size; + void *object; }; static inline bool kmem_cache_debug(struct kmem_cache *s) @@ -612,7 +627,7 @@ static __always_inline void slab_unlock(struct slab *slab) struct page *page = slab_page(slab); VM_BUG_ON_PAGE(PageTail(page), page); - __bit_spin_unlock(PG_locked, &page->flags); + bit_spin_unlock(PG_locked, &page->flags); } static inline bool @@ -2432,6 +2447,25 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) } /* + * SLUB reuses PG_workingset bit to keep track of whether it's on + * the per-node partial list. + */ +static inline bool slab_test_node_partial(const struct slab *slab) +{ + return folio_test_workingset((struct folio *)slab_folio(slab)); +} + +static inline void slab_set_node_partial(struct slab *slab) +{ + set_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); +} + +static inline void slab_clear_node_partial(struct slab *slab) +{ + clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); +} + +/* * Management of partially allocated slabs. */ static inline void @@ -2442,6 +2476,7 @@ __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) list_add_tail(&slab->slab_list, &n->partial); else list_add(&slab->slab_list, &n->partial); + slab_set_node_partial(slab); } static inline void add_partial(struct kmem_cache_node *n, @@ -2456,11 +2491,12 @@ static inline void remove_partial(struct kmem_cache_node *n, { lockdep_assert_held(&n->list_lock); list_del(&slab->slab_list); + slab_clear_node_partial(slab); n->nr_partial--; } /* - * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * Called only for kmem_cache_debug() caches instead of remove_partial(), with a * slab from the n->partial list. Remove only a single object from the slab, do * the alloc_debug_processing() checks and leave the slab on the list, or move * it to full list if it was the last free object. @@ -2528,51 +2564,6 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, return object; } -/* - * Remove slab from the partial list, freeze it and - * return the pointer to the freelist. - * - * Returns a list of objects or NULL if it fails. - */ -static inline void *acquire_slab(struct kmem_cache *s, - struct kmem_cache_node *n, struct slab *slab, - int mode) -{ - void *freelist; - unsigned long counters; - struct slab new; - - lockdep_assert_held(&n->list_lock); - - /* - * Zap the freelist and set the frozen bit. - * The old freelist is the list of objects for the - * per cpu allocation list. - */ - freelist = slab->freelist; - counters = slab->counters; - new.counters = counters; - if (mode) { - new.inuse = slab->objects; - new.freelist = NULL; - } else { - new.freelist = freelist; - } - - VM_BUG_ON(new.frozen); - new.frozen = 1; - - if (!__slab_update_freelist(s, slab, - freelist, counters, - new.freelist, new.counters, - "acquire_slab")) - return NULL; - - remove_partial(n, slab); - WARN_ON(!freelist); - return freelist; -} - #ifdef CONFIG_SLUB_CPU_PARTIAL static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); #else @@ -2584,11 +2575,11 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); /* * Try to allocate a partial slab from a specific node. */ -static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct partial_context *pc) +static struct slab *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_context *pc) { - struct slab *slab, *slab2; - void *object = NULL; + struct slab *slab, *slab2, *partial = NULL; unsigned long flags; unsigned int partial_slabs = 0; @@ -2603,27 +2594,25 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { - void *t; - if (!pfmemalloc_match(slab, pc->flags)) continue; if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - object = alloc_single_from_partial(s, n, slab, + void *object = alloc_single_from_partial(s, n, slab, pc->orig_size); - if (object) + if (object) { + partial = slab; + pc->object = object; break; + } continue; } - t = acquire_slab(s, n, slab, object == NULL); - if (!t) - break; + remove_partial(n, slab); - if (!object) { - *pc->slab = slab; + if (!partial) { + partial = slab; stat(s, ALLOC_FROM_PARTIAL); - object = t; } else { put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); @@ -2639,20 +2628,21 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, } spin_unlock_irqrestore(&n->list_lock, flags); - return object; + return partial; } /* * Get a slab from somewhere. Search in increasing NUMA distances. */ -static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) +static struct slab *get_any_partial(struct kmem_cache *s, + struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(pc->flags); - void *object; + struct slab *slab; unsigned int cpuset_mems_cookie; /* @@ -2687,8 +2677,8 @@ static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, pc); - if (object) { + slab = get_partial_node(s, n, pc); + if (slab) { /* * Don't check read_mems_allowed_retry() * here - if mems_allowed was updated in @@ -2696,7 +2686,7 @@ static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) * between allocation and the cpuset * update */ - return object; + return slab; } } } @@ -2708,17 +2698,18 @@ static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) /* * Get a partial slab, lock it and return it. */ -static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc) +static struct slab *get_partial(struct kmem_cache *s, int node, + struct partial_context *pc) { - void *object; + struct slab *slab; int searchnode = node; if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - object = get_partial_node(s, get_node(s, searchnode), pc); - if (object || node != NUMA_NO_NODE) - return object; + slab = get_partial_node(s, get_node(s, searchnode), pc); + if (slab || node != NUMA_NO_NODE) + return slab; return get_any_partial(s, pc); } @@ -2807,10 +2798,8 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist) { - enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST }; struct kmem_cache_node *n = get_node(s, slab_nid(slab)); int free_delta = 0; - enum slab_modes mode = M_NONE; void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; unsigned long flags = 0; @@ -2848,80 +2837,52 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, /* * Stage two: Unfreeze the slab while splicing the per-cpu * freelist to the head of slab's freelist. - * - * Ensure that the slab is unfrozen while the list presence - * reflects the actual number of objects during unfreeze. - * - * We first perform cmpxchg holding lock and insert to list - * when it succeed. If there is mismatch then the slab is not - * unfrozen and number of objects in the slab may have changed. - * Then release lock and retry cmpxchg again. */ -redo: - - old.freelist = READ_ONCE(slab->freelist); - old.counters = READ_ONCE(slab->counters); - VM_BUG_ON(!old.frozen); - - /* Determine target state of the slab */ - new.counters = old.counters; - if (freelist_tail) { - new.inuse -= free_delta; - set_freepointer(s, freelist_tail, old.freelist); - new.freelist = freelist; - } else - new.freelist = old.freelist; - - new.frozen = 0; + do { + old.freelist = READ_ONCE(slab->freelist); + old.counters = READ_ONCE(slab->counters); + VM_BUG_ON(!old.frozen); + + /* Determine target state of the slab */ + new.counters = old.counters; + new.frozen = 0; + if (freelist_tail) { + new.inuse -= free_delta; + set_freepointer(s, freelist_tail, old.freelist); + new.freelist = freelist; + } else { + new.freelist = old.freelist; + } + } while (!slab_update_freelist(s, slab, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")); + /* + * Stage three: Manipulate the slab list based on the updated state. + */ if (!new.inuse && n->nr_partial >= s->min_partial) { - mode = M_FREE; + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, slab); + stat(s, FREE_SLAB); } else if (new.freelist) { - mode = M_PARTIAL; - /* - * Taking the spinlock removes the possibility that - * acquire_slab() will see a slab that is frozen - */ spin_lock_irqsave(&n->list_lock, flags); - } else { - mode = M_FULL_NOLIST; - } - - - if (!slab_update_freelist(s, slab, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")) { - if (mode == M_PARTIAL) - spin_unlock_irqrestore(&n->list_lock, flags); - goto redo; - } - - - if (mode == M_PARTIAL) { add_partial(n, slab, tail); spin_unlock_irqrestore(&n->list_lock, flags); stat(s, tail); - } else if (mode == M_FREE) { - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, slab); - stat(s, FREE_SLAB); - } else if (mode == M_FULL_NOLIST) { + } else { stat(s, DEACTIVATE_FULL); } } #ifdef CONFIG_SLUB_CPU_PARTIAL -static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) +static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) { struct kmem_cache_node *n = NULL, *n2 = NULL; struct slab *slab, *slab_to_discard = NULL; unsigned long flags = 0; while (partial_slab) { - struct slab new; - struct slab old; - slab = partial_slab; partial_slab = slab->next; @@ -2934,23 +2895,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) spin_lock_irqsave(&n->list_lock, flags); } - do { - - old.freelist = slab->freelist; - old.counters = slab->counters; - VM_BUG_ON(!old.frozen); - - new.counters = old.counters; - new.freelist = old.freelist; - - new.frozen = 0; - - } while (!__slab_update_freelist(s, slab, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")); - - if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { slab->next = slab_to_discard; slab_to_discard = slab; } else { @@ -2973,9 +2918,9 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) } /* - * Unfreeze all the cpu partial slabs. + * Put all the cpu partial slabs to the node partial list. */ -static void unfreeze_partials(struct kmem_cache *s) +static void put_partials(struct kmem_cache *s) { struct slab *partial_slab; unsigned long flags; @@ -2986,11 +2931,11 @@ static void unfreeze_partials(struct kmem_cache *s) local_unlock_irqrestore(&s->cpu_slab->lock, flags); if (partial_slab) - __unfreeze_partials(s, partial_slab); + __put_partials(s, partial_slab); } -static void unfreeze_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) +static void put_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { struct slab *partial_slab; @@ -2998,12 +2943,11 @@ static void unfreeze_partials_cpu(struct kmem_cache *s, c->partial = NULL; if (partial_slab) - __unfreeze_partials(s, partial_slab); + __put_partials(s, partial_slab); } /* - * Put a slab that was just frozen (in __slab_free|get_partial_node) into a - * partial slab slot if available. + * Put a slab into a partial slab slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. @@ -3011,7 +2955,7 @@ static void unfreeze_partials_cpu(struct kmem_cache *s, static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) { struct slab *oldslab; - struct slab *slab_to_unfreeze = NULL; + struct slab *slab_to_put = NULL; unsigned long flags; int slabs = 0; @@ -3026,7 +2970,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) * per node partial list. Postpone the actual unfreezing * outside of the critical section. */ - slab_to_unfreeze = oldslab; + slab_to_put = oldslab; oldslab = NULL; } else { slabs = oldslab->slabs; @@ -3042,17 +2986,17 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) local_unlock_irqrestore(&s->cpu_slab->lock, flags); - if (slab_to_unfreeze) { - __unfreeze_partials(s, slab_to_unfreeze); + if (slab_to_put) { + __put_partials(s, slab_to_put); stat(s, CPU_PARTIAL_DRAIN); } } #else /* CONFIG_SLUB_CPU_PARTIAL */ -static inline void unfreeze_partials(struct kmem_cache *s) { } -static inline void unfreeze_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) { } +static inline void put_partials(struct kmem_cache *s) { } +static inline void put_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { } #endif /* CONFIG_SLUB_CPU_PARTIAL */ @@ -3094,7 +3038,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) stat(s, CPUSLAB_FLUSH); } - unfreeze_partials_cpu(s, c); + put_partials_cpu(s, c); } struct slub_flush_work { @@ -3122,7 +3066,7 @@ static void flush_cpu_slab(struct work_struct *w) if (c->slab) flush_slab(s, c); - unfreeze_partials(s); + put_partials(s); } static bool has_cpu_slab(int cpu, struct kmem_cache *s) @@ -3389,6 +3333,33 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) } /* + * Freeze the partial slab and return the pointer to the freelist. + */ +static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) +{ + struct slab new; + unsigned long counters; + void *freelist; + + do { + freelist = slab->freelist; + counters = slab->counters; + + new.counters = counters; + VM_BUG_ON(new.frozen); + + new.inuse = slab->objects; + new.frozen = 1; + + } while (!slab_update_freelist(s, slab, + freelist, counters, + NULL, new.counters, + "freeze_slab")); + + return freelist; +} + +/* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. * @@ -3430,7 +3401,6 @@ reread_slab: node = NUMA_NO_NODE; goto new_slab; } -redo: if (unlikely(!node_match(slab, node))) { /* @@ -3506,7 +3476,8 @@ deactivate_slab: new_slab: - if (slub_percpu_partial(c)) { +#ifdef CONFIG_SLUB_CPU_PARTIAL + while (slub_percpu_partial(c)) { local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->slab)) { local_unlock_irqrestore(&s->cpu_slab->lock, flags); @@ -3518,21 +3489,45 @@ new_slab: goto new_objects; } - slab = c->slab = slub_percpu_partial(c); + slab = slub_percpu_partial(c); slub_set_percpu_partial(c, slab); local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, CPU_PARTIAL_ALLOC); - goto redo; + + if (unlikely(!node_match(slab, node) || + !pfmemalloc_match(slab, gfpflags))) { + slab->next = NULL; + __put_partials(s, slab); + continue; + } + + freelist = freeze_slab(s, slab); + goto retry_load_slab; } +#endif new_objects: pc.flags = gfpflags; - pc.slab = &slab; pc.orig_size = orig_size; - freelist = get_partial(s, node, &pc); - if (freelist) - goto check_new_slab; + slab = get_partial(s, node, &pc); + if (slab) { + if (kmem_cache_debug(s)) { + freelist = pc.object; + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the + * tracking info and return the object. + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; + } + + freelist = freeze_slab(s, slab); + goto retry_load_slab; + } slub_put_cpu_ptr(s->cpu_slab); slab = new_slab(s, gfpflags, node); @@ -3568,20 +3563,6 @@ new_objects: inc_slabs_node(s, slab_nid(slab), slab->objects); -check_new_slab: - - if (kmem_cache_debug(s)) { - /* - * For debug caches here we had to go through - * alloc_single_from_partial() so just store the tracking info - * and return the object - */ - if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr); - - return freelist; - } - if (unlikely(!pfmemalloc_match(slab, gfpflags))) { /* * For !pfmemalloc_match() case we don't load freelist so that @@ -3724,12 +3705,11 @@ static void *__slab_alloc_node(struct kmem_cache *s, void *object; pc.flags = gfpflags; - pc.slab = &slab; pc.orig_size = orig_size; - object = get_partial(s, node, &pc); + slab = get_partial(s, node, &pc); - if (object) - return object; + if (slab) + return pc.object; slab = new_slab(s, gfpflags, node); if (unlikely(!slab)) { @@ -4119,6 +4099,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long counters; struct kmem_cache_node *n = NULL; unsigned long flags; + bool on_node_partial; stat(s, FREE_SLOWPATH); @@ -4139,18 +4120,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, was_frozen = new.frozen; new.inuse -= cnt; if ((!new.inuse || !prior) && !was_frozen) { - - if (kmem_cache_has_cpu_partial(s) && !prior) { - - /* - * Slab was on no list before and will be - * partially empty - * We can defer the list move and instead - * freeze it. - */ - new.frozen = 1; - - } else { /* Needs to be taken off a list */ + /* Needs to be taken off a list */ + if (!kmem_cache_has_cpu_partial(s) || prior) { n = get_node(s, slab_nid(slab)); /* @@ -4163,6 +4134,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, */ spin_lock_irqsave(&n->list_lock, flags); + on_node_partial = slab_test_node_partial(slab); } } @@ -4179,9 +4151,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * activity can be necessary. */ stat(s, FREE_FROZEN); - } else if (new.frozen) { + } else if (kmem_cache_has_cpu_partial(s) && !prior) { /* - * If we just froze the slab then put it onto the + * If we started with a full slab then put it onto the * per cpu partial list. */ put_cpu_partial(s, slab, 1); @@ -4191,6 +4163,15 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; } + /* + * This slab was partially empty but not on the per-node partial list, + * in which case we shouldn't manipulate its list, just return. + */ + if (prior && !on_node_partial) { + spin_unlock_irqrestore(&n->list_lock, flags); + return; + } + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; @@ -5446,6 +5427,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) if (free == slab->objects) { list_move(&slab->slab_list, &discard); + slab_clear_node_partial(slab); n->nr_partial--; dec_slabs_node(s, node, slab->objects); } else if (free <= SHRINK_PROMOTE_MAX) |