diff options
author | Alexei Starovoitov <[email protected]> | 2024-01-03 21:08:26 -0800 |
---|---|---|
committer | Alexei Starovoitov <[email protected]> | 2024-01-03 21:08:27 -0800 |
commit | f8506c5734902ebda5c7b4778859b46d0a2ae5f3 (patch) | |
tree | 33e7bd54cb6f7ee386fa8abca5ab0ae9cea74d64 /kernel/bpf/memalloc.c | |
parent | 417fa6d163df6f13fb2cfad5132eff354c8a472e (diff) | |
parent | adc8c4549d9e74d2359c217d2478b18ecdd15c91 (diff) |
Merge branch 'bpf-reduce-memory-usage-for-bpf_global_percpu_ma'
Yonghong Song says:
====================
bpf: Reduce memory usage for bpf_global_percpu_ma
Currently when a bpf program intends to allocate memory for percpu kptr,
the verifier will call bpf_mem_alloc_init() to prefill all supported
unit sizes and this caused memory consumption very big for large number
of cpus. For example, for 128-cpu system, the total memory consumption
with initial prefill is ~175MB. Things will become worse for systems
with even more cpus.
Patch 1 avoids unnecessary extra percpu memory allocation.
Patch 2 adds objcg to bpf_mem_alloc at init stage so objcg can be
associated with root cgroup and objcg can be passed to later
bpf_mem_alloc_percpu_unit_init().
Patch 3 addresses memory consumption issue by avoiding to prefill
with all unit sizes, i.e. only prefilling with user specified size.
Patch 4 further reduces memory consumption by limiting the
number of prefill entries for percpu memory allocation.
Patch 5 has much smaller low/high watermarks for percpu allocation
to reduce memory consumption.
Patch 6 rejects percpu memory allocation with bpf_global_percpu_ma
when allocation size is greater than 512 bytes.
Patch 7 fixed test_bpf_ma test due to Patch 5.
Patch 8 added one test to show the verification failure log message.
Changelogs:
v5 -> v6:
. Change bpf_mem_alloc_percpu_init() to add objcg as one of parameters.
For bpf_global_percpu_ma, the objcg is NULL, corresponding root memcg.
v4 -> v5:
. Do not do bpf_global_percpu_ma initialization at init stage, instead
doing initialization when the verifier knows it is going to be used
by bpf prog.
. Using much smaller low/high watermarks for percpu allocation.
v3 -> v4:
. Add objcg to bpf_mem_alloc during init stage.
. Initialize objcg at init stage but use it in bpf_mem_alloc_percpu_unit_init().
. Remove check_obj_size() in bpf_mem_alloc_percpu_unit_init().
v2 -> v3:
. Clear the bpf_mem_cache if prefill fails.
. Change test_bpf_ma percpu allocation tests to use bucket_size
as allocation size instead of bucket_size - 8.
. Remove __GFP_ZERO flag from __alloc_percpu_gfp() call.
v1 -> v2:
. Avoid unnecessary extra percpu memory allocation.
. Add a separate function to do bpf_global_percpu_ma initialization
. promote.
. Promote function static 'sizes' array to file static.
. Add comments to explain to refill only one item for percpu alloc.
====================
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
Diffstat (limited to 'kernel/bpf/memalloc.c')
-rw-r--r-- | kernel/bpf/memalloc.c | 93 |
1 files changed, 81 insertions, 12 deletions
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index aa0fbf000a12..460c8f38fed6 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -121,6 +121,8 @@ struct bpf_mem_caches { struct bpf_mem_cache cache[NUM_CACHES]; }; +static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + static struct llist_node notrace *__llist_del_first(struct llist_head *head) { struct llist_node *entry, *next; @@ -462,11 +464,17 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c) * consume ~ 11 Kbyte per cpu. * Typical case will be between 11K and 116K closer to 11K. * bpf progs can and should share bpf_mem_cache when possible. + * + * Percpu allocation is typically rare. To avoid potential unnecessary large + * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1. */ static void init_refill_work(struct bpf_mem_cache *c) { init_irq_work(&c->refill_work, bpf_mem_refill); - if (c->unit_size <= 256) { + if (c->percpu_size) { + c->low_watermark = 1; + c->high_watermark = 3; + } else if (c->unit_size <= 256) { c->low_watermark = 32; c->high_watermark = 96; } else { @@ -483,11 +491,16 @@ static void init_refill_work(struct bpf_mem_cache *c) static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) { - /* To avoid consuming memory assume that 1st run of bpf - * prog won't be doing more than 4 map_update_elem from - * irq disabled region + int cnt = 1; + + /* To avoid consuming memory, for non-percpu allocation, assume that + * 1st run of bpf prog won't be doing more than 4 map_update_elem from + * irq disabled region if unit size is less than or equal to 256. + * For all other cases, let us just do one allocation. */ - alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false); + if (!c->percpu_size && c->unit_size <= 256) + cnt = 4; + alloc_bulk(c, cnt, cpu_to_node(cpu), false); } /* When size != 0 bpf_mem_cache for each cpu. @@ -499,12 +512,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { - static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; struct obj_cgroup *objcg = NULL; int cpu, i, unit_size, percpu_size = 0; + if (percpu && size == 0) + return -EINVAL; + /* room for llist_node and per-cpu pointer */ if (percpu) percpu_size = LLIST_NODE_SZ + sizeof(void *); @@ -523,6 +538,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) if (memcg_bpf_enabled()) objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; + for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); c->unit_size = unit_size; @@ -542,6 +559,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) #ifdef CONFIG_MEMCG_KMEM objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; for_each_possible_cpu(cpu) { cc = per_cpu_ptr(pcc, cpu); for (i = 0; i < NUM_CACHES; i++) { @@ -560,6 +578,56 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) return 0; } +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) +{ + struct bpf_mem_caches __percpu *pcc; + + pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL); + if (!pcc) + return -ENOMEM; + + ma->caches = pcc; + ma->objcg = objcg; + ma->percpu = true; + return 0; +} + +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) +{ + struct bpf_mem_caches *cc, __percpu *pcc; + int cpu, i, unit_size, percpu_size; + struct obj_cgroup *objcg; + struct bpf_mem_cache *c; + + i = bpf_mem_cache_idx(size); + if (i < 0) + return -EINVAL; + + /* room for llist_node and per-cpu pointer */ + percpu_size = LLIST_NODE_SZ + sizeof(void *); + + unit_size = sizes[i]; + objcg = ma->objcg; + pcc = ma->caches; + + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(pcc, cpu); + c = &cc->cache[i]; + if (cpu == 0 && c->unit_size) + break; + + c->unit_size = unit_size; + c->objcg = objcg; + c->percpu_size = percpu_size; + c->tgt = c; + + init_refill_work(c); + prefill_mem_cache(c, cpu); + } + + return 0; +} + static void drain_mem_cache(struct bpf_mem_cache *c) { bool percpu = !!c->percpu_size; @@ -691,9 +759,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } - /* objcg is the same across cpus */ - if (c->objcg) - obj_cgroup_put(c->objcg); + if (ma->objcg) + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { @@ -709,8 +776,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } - if (c->objcg) - obj_cgroup_put(c->objcg); + if (ma->objcg) + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } } @@ -833,7 +900,9 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) if (!size) return NULL; - idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ); + if (!ma->percpu) + size += LLIST_NODE_SZ; + idx = bpf_mem_cache_idx(size); if (idx < 0) return NULL; |