diff options
author | Alexei Starovoitov <[email protected]> | 2023-09-05 17:47:54 -0700 |
---|---|---|
committer | Alexei Starovoitov <[email protected]> | 2023-09-08 08:42:19 -0700 |
commit | 35897c3c5264469cbd45e44d23c165a08cfb6b3c (patch) | |
tree | eefb00f136148f550bbad6d8b82e4fe75b4215c5 | |
parent | 1e4a6d975e5cd114509aa447750d68d295a501a7 (diff) | |
parent | 29c11aa8082b6dbef2cffbcd5e81be27e9b50a5b (diff) |
Merge branch 'bpf-enable-irq-after-irq_work_raise-completes'
Hou Tao says:
====================
bpf: Enable IRQ after irq_work_raise() completes
From: Hou Tao <[email protected]>
Hi,
The patchset aims to fix the problem that bpf_mem_alloc() may return
NULL unexpectedly when multiple bpf_mem_alloc() are invoked concurrently
under process context and there is still free memory available. The
problem was found when doing stress test for qp-trie but the same
problem also exists for bpf_obj_new() as demonstrated in patch #3.
As pointed out by Alexei, the patchset can only fix ENOMEM problem for
normal process context and can not fix the problem for irq-disabled
context or RT-enabled kernel.
Patch #1 fixes the race between unit_alloc() and unit_alloc(). Patch #2
fixes the race between unit_alloc() and unit_free(). And patch #3 adds
a selftest for the problem. The major change compared with v1 is using
local_irq_{save,restore)() pair to disable and enable preemption
instead of preempt_{disable,enable}_notrace pair. The main reason is to
prevent potential overhead from __preempt_schedule_notrace(). I also
run htab_mem benchmark and hash_map_perf on a 8-CPUs KVM VM to compare
the performance between local_irq_{save,restore} and
preempt_{disable,enable}_notrace(), but the results are similar as shown
below:
(1) use preempt_{disable,enable}_notrace()
[root@hello bpf]# ./map_perf_test 4 8
0:hash_map_perf kmalloc 652179 events per sec
1:hash_map_perf kmalloc 651880 events per sec
2:hash_map_perf kmalloc 651382 events per sec
3:hash_map_perf kmalloc 650791 events per sec
5:hash_map_perf kmalloc 650140 events per sec
6:hash_map_perf kmalloc 652773 events per sec
7:hash_map_perf kmalloc 652751 events per sec
4:hash_map_perf kmalloc 648199 events per sec
[root@hello bpf]# ./benchs/run_bench_htab_mem.sh
normal bpf ma
=============
overwrite per-prod-op: 110.82 ± 0.02k/s, avg mem: 2.00 ± 0.00MiB, peak mem: 2.73MiB
batch_add_batch_del per-prod-op: 89.79 ± 0.75k/s, avg mem: 1.68 ± 0.38MiB, peak mem: 2.73MiB
add_del_on_diff_cpu per-prod-op: 17.83 ± 0.07k/s, avg mem: 25.68 ± 2.92MiB, peak mem: 35.10MiB
(2) use local_irq_{save,restore}
[root@hello bpf]# ./map_perf_test 4 8
0:hash_map_perf kmalloc 656299 events per sec
1:hash_map_perf kmalloc 656397 events per sec
2:hash_map_perf kmalloc 656046 events per sec
3:hash_map_perf kmalloc 655723 events per sec
5:hash_map_perf kmalloc 655221 events per sec
4:hash_map_perf kmalloc 654617 events per sec
6:hash_map_perf kmalloc 650269 events per sec
7:hash_map_perf kmalloc 653665 events per sec
[root@hello bpf]# ./benchs/run_bench_htab_mem.sh
normal bpf ma
=============
overwrite per-prod-op: 116.10 ± 0.02k/s, avg mem: 2.00 ± 0.00MiB, peak mem: 2.74MiB
batch_add_batch_del per-prod-op: 88.76 ± 0.61k/s, avg mem: 1.94 ± 0.33MiB, peak mem: 2.74MiB
add_del_on_diff_cpu per-prod-op: 18.12 ± 0.08k/s, avg mem: 25.10 ± 2.70MiB, peak mem: 34.78MiB
As ususal comments are always welcome.
Change Log:
v2:
* Use local_irq_save to disable preemption instead of using
preempt_{disable,enable}_notrace pair to prevent potential overhead
v1: https://lore.kernel.org/bpf/[email protected]/
====================
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
-rw-r--r-- | kernel/bpf/memalloc.c | 16 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/prog_tests/preempted_bpf_ma_op.c | 89 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/progs/preempted_bpf_ma_op.c | 106 |
3 files changed, 208 insertions, 3 deletions
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index cb60445de98a..961df89d45f1 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -732,12 +732,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c) } } local_dec(&c->active); - local_irq_restore(flags); WARN_ON(cnt < 0); if (cnt < c->low_watermark) irq_work_raise(c); + /* Enable IRQ after the enqueue of irq work completes, so irq work + * will run after IRQ is enabled and free_llist may be refilled by + * irq work before other task preempts current task. + */ + local_irq_restore(flags); + return llnode; } @@ -773,11 +778,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) llist_add(llnode, &c->free_llist_extra); } local_dec(&c->active); - local_irq_restore(flags); if (cnt > c->high_watermark) /* free few objects from current cpu into global kmalloc pool */ irq_work_raise(c); + /* Enable IRQ after irq_work_raise() completes, otherwise when current + * task is preempted by task which does unit_alloc(), unit_alloc() may + * return NULL unexpectedly because irq work is already pending but can + * not been triggered and free_llist can not be refilled timely. + */ + local_irq_restore(flags); } static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) @@ -795,10 +805,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) llist_add(llnode, &c->free_llist_extra_rcu); } local_dec(&c->active); - local_irq_restore(flags); if (!atomic_read(&c->call_rcu_in_progress)) irq_work_raise(c); + local_irq_restore(flags); } /* Called from BPF program or from sys_bpf syscall. diff --git a/tools/testing/selftests/bpf/prog_tests/preempted_bpf_ma_op.c b/tools/testing/selftests/bpf/prog_tests/preempted_bpf_ma_op.c new file mode 100644 index 000000000000..3a2ec3923fca --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/preempted_bpf_ma_op.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2023. Huawei Technologies Co., Ltd */ +#define _GNU_SOURCE +#include <sched.h> +#include <pthread.h> +#include <stdbool.h> +#include <test_progs.h> + +#include "preempted_bpf_ma_op.skel.h" + +#define ALLOC_THREAD_NR 4 +#define ALLOC_LOOP_NR 512 + +struct alloc_ctx { + /* output */ + int run_err; + /* input */ + int fd; + bool *nomem_err; +}; + +static void *run_alloc_prog(void *data) +{ + struct alloc_ctx *ctx = data; + cpu_set_t cpu_set; + int i; + + CPU_ZERO(&cpu_set); + CPU_SET(0, &cpu_set); + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set); + + for (i = 0; i < ALLOC_LOOP_NR && !*ctx->nomem_err; i++) { + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err; + + err = bpf_prog_test_run_opts(ctx->fd, &topts); + ctx->run_err |= err | topts.retval; + } + + return NULL; +} + +void test_preempted_bpf_ma_op(void) +{ + struct alloc_ctx ctx[ALLOC_THREAD_NR]; + struct preempted_bpf_ma_op *skel; + pthread_t tid[ALLOC_THREAD_NR]; + int i, err; + + skel = preempted_bpf_ma_op__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + err = preempted_bpf_ma_op__attach(skel); + if (!ASSERT_OK(err, "attach")) + goto out; + + for (i = 0; i < ARRAY_SIZE(ctx); i++) { + struct bpf_program *prog; + char name[8]; + + snprintf(name, sizeof(name), "test%d", i); + prog = bpf_object__find_program_by_name(skel->obj, name); + if (!ASSERT_OK_PTR(prog, "no test prog")) + goto out; + + ctx[i].run_err = 0; + ctx[i].fd = bpf_program__fd(prog); + ctx[i].nomem_err = &skel->bss->nomem_err; + } + + memset(tid, 0, sizeof(tid)); + for (i = 0; i < ARRAY_SIZE(tid); i++) { + err = pthread_create(&tid[i], NULL, run_alloc_prog, &ctx[i]); + if (!ASSERT_OK(err, "pthread_create")) + break; + } + + for (i = 0; i < ARRAY_SIZE(tid); i++) { + if (!tid[i]) + break; + pthread_join(tid[i], NULL); + ASSERT_EQ(ctx[i].run_err, 0, "run prog err"); + } + + ASSERT_FALSE(skel->bss->nomem_err, "ENOMEM"); +out: + preempted_bpf_ma_op__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/preempted_bpf_ma_op.c b/tools/testing/selftests/bpf/progs/preempted_bpf_ma_op.c new file mode 100644 index 000000000000..55907ef961bf --- /dev/null +++ b/tools/testing/selftests/bpf/progs/preempted_bpf_ma_op.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2023. Huawei Technologies Co., Ltd */ +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_helpers.h> + +#include "bpf_experimental.h" + +struct bin_data { + char data[256]; + struct bpf_spin_lock lock; +}; + +struct map_value { + struct bin_data __kptr * data; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 2048); +} array SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +bool nomem_err = false; + +static int del_array(unsigned int i, int *from) +{ + struct map_value *value; + struct bin_data *old; + + value = bpf_map_lookup_elem(&array, from); + if (!value) + return 1; + + old = bpf_kptr_xchg(&value->data, NULL); + if (old) + bpf_obj_drop(old); + + (*from)++; + return 0; +} + +static int add_array(unsigned int i, int *from) +{ + struct bin_data *old, *new; + struct map_value *value; + + value = bpf_map_lookup_elem(&array, from); + if (!value) + return 1; + + new = bpf_obj_new(typeof(*new)); + if (!new) { + nomem_err = true; + return 1; + } + + old = bpf_kptr_xchg(&value->data, new); + if (old) + bpf_obj_drop(old); + + (*from)++; + return 0; +} + +static void del_then_add_array(int from) +{ + int i; + + i = from; + bpf_loop(512, del_array, &i, 0); + + i = from; + bpf_loop(512, add_array, &i, 0); +} + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG2(test0, int, a) +{ + del_then_add_array(0); + return 0; +} + +SEC("fentry/bpf_fentry_test2") +int BPF_PROG2(test1, int, a, u64, b) +{ + del_then_add_array(512); + return 0; +} + +SEC("fentry/bpf_fentry_test3") +int BPF_PROG2(test2, char, a, int, b, u64, c) +{ + del_then_add_array(1024); + return 0; +} + +SEC("fentry/bpf_fentry_test4") +int BPF_PROG2(test3, void *, a, char, b, int, c, u64, d) +{ + del_then_add_array(1536); + return 0; +} |