diff options
author | Daniel Borkmann <daniel@iogearbox.net> | 2020-08-28 21:20:33 +0200 |
---|---|---|
committer | Daniel Borkmann <daniel@iogearbox.net> | 2020-08-28 21:20:38 +0200 |
commit | 10496f261ed30592c6a7f8315f6b5ec055db624a (patch) | |
tree | 0751db49ba3afe3976061f9c6fd7ed2c44b6a5fe | |
parent | d557ea39a5f894630c403b78703ac92b08b7dd62 (diff) | |
parent | e68a144547fc7a956952260539cb7b8bb9afbcc0 (diff) |
Merge branch 'bpf-sleepable'
Alexei Starovoitov says:
====================
v2->v3:
- switched to minimal allowlist approach. Essentially that means that syscall
entry, few btrfs allow_error_inject functions, should_fail_bio(), and two LSM
hooks: file_mprotect and bprm_committed_creds are the only hooks that allow
attaching of sleepable BPF programs. When comprehensive analysis of LSM hooks
will be done this allowlist will be extended.
- added patch 1 that fixes prototypes of two mm functions to reliably work with
error injection. It's also necessary for resolve_btfids tool to recognize
these two funcs, but that's secondary.
v1->v2:
- split fmod_ret fix into separate patch
- added denylist
v1:
This patch set introduces the minimal viable support for sleepable bpf programs.
In this patch only fentry/fexit/fmod_ret and lsm progs can be sleepable.
Only array and pre-allocated hash and lru maps allowed.
Here is 'perf report' difference of sleepable vs non-sleepable:
3.86% bench [k] __srcu_read_unlock
3.22% bench [k] __srcu_read_lock
0.92% bench [k] bpf_prog_740d4210cdcd99a3_bench_trigger_fentry_sleep
0.50% bench [k] bpf_trampoline_10297
0.26% bench [k] __bpf_prog_exit_sleepable
0.21% bench [k] __bpf_prog_enter_sleepable
vs
0.88% bench [k] bpf_prog_740d4210cdcd99a3_bench_trigger_fentry
0.84% bench [k] bpf_trampoline_10297
0.13% bench [k] __bpf_prog_enter
0.12% bench [k] __bpf_prog_exit
vs
0.79% bench [k] bpf_prog_740d4210cdcd99a3_bench_trigger_fentry_sleep
0.72% bench [k] bpf_trampoline_10381
0.31% bench [k] __bpf_prog_exit_sleepable
0.29% bench [k] __bpf_prog_enter_sleepable
Sleepable vs non-sleepable program invocation overhead is only marginally higher
due to rcu_trace. srcu approach is much slower.
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
-rw-r--r-- | arch/x86/net/bpf_jit_comp.c | 32 | ||||
-rw-r--r-- | include/linux/bpf.h | 4 | ||||
-rw-r--r-- | include/uapi/linux/bpf.h | 16 | ||||
-rw-r--r-- | init/Kconfig | 1 | ||||
-rw-r--r-- | kernel/bpf/arraymap.c | 1 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 12 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 22 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 13 | ||||
-rw-r--r-- | kernel/bpf/trampoline.c | 28 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 81 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 8 | ||||
-rw-r--r-- | mm/page_alloc.c | 2 | ||||
-rw-r--r-- | tools/include/uapi/linux/bpf.h | 16 | ||||
-rw-r--r-- | tools/lib/bpf/libbpf.c | 25 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/bench.c | 2 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/benchs/bench_trigger.c | 17 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/prog_tests/test_lsm.c | 9 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/progs/lsm.c | 66 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/progs/trigger_bench.c | 7 |
20 files changed, 331 insertions, 33 deletions
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 42b6709e6dc7..7d9ea7b41c71 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1379,10 +1379,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, u8 *prog = *pprog; int cnt = 0; - if (emit_call(&prog, __bpf_prog_enter, prog)) - return -EINVAL; - /* remember prog start time returned by __bpf_prog_enter */ - emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + if (p->aux->sleepable) { + if (emit_call(&prog, __bpf_prog_enter_sleepable, prog)) + return -EINVAL; + } else { + if (emit_call(&prog, __bpf_prog_enter, prog)) + return -EINVAL; + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + } /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); @@ -1402,13 +1407,18 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, - (u32) (long) p); - /* arg2: mov rsi, rbx <- start time in nsec */ - emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); - if (emit_call(&prog, __bpf_prog_exit, prog)) - return -EINVAL; + if (p->aux->sleepable) { + if (emit_call(&prog, __bpf_prog_exit_sleepable, prog)) + return -EINVAL; + } else { + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, + (u32) (long) p); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, __bpf_prog_exit, prog)) + return -EINVAL; + } *pprog = prog; return 0; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dbba82a80087..c6d9f2c444f4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -539,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(void); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); +void notrace __bpf_prog_enter_sleepable(void); +void notrace __bpf_prog_exit_sleepable(void); struct bpf_ksym { unsigned long start; @@ -734,6 +736,7 @@ struct bpf_prog_aux { bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; + bool sleepable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; @@ -1781,6 +1784,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; +extern const struct bpf_func_proto bpf_copy_from_user_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ef7af384f5ee..a613750d5515 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -346,6 +346,14 @@ enum bpf_link_type { /* The verifier internal test flag. Behavior is undefined */ #define BPF_F_TEST_STATE_FREQ (1U << 3) +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * @@ -3561,6 +3569,13 @@ union bpf_attr { * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of copy_from_user(). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3711,6 +3726,7 @@ union bpf_attr { FN(inode_storage_get), \ FN(inode_storage_delete), \ FN(d_path), \ + FN(copy_from_user), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/init/Kconfig b/init/Kconfig index fc10f7ede5f6..6ecc00e130ff 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1691,6 +1691,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK + select TASKS_TRACE_RCU default n help Enable the bpf() system call that allows to manipulate eBPF diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d851ebbcf302..e046fb7d17cd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -10,6 +10,7 @@ #include <linux/filter.h> #include <linux/perf_event.h> #include <uapi/linux/btf.h> +#include <linux/rcupdate_trace.h> #include "map_in_map.h" diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ad80f45774e7..fe0e06284d33 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -9,6 +9,7 @@ #include <linux/rculist_nulls.h> #include <linux/random.h> #include <uapi/linux/btf.h> +#include <linux/rcupdate_trace.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -577,8 +578,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - /* Must be called with rcu_read_lock. */ - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -941,7 +941,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1032,7 +1032,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1220,7 +1220,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1252,7 +1252,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index be43ab3e619f..5cc7425ee476 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -601,6 +601,28 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, + const void __user *, user_ptr) +{ + int ret = copy_from_user(dst, user_ptr, size); + + if (unlikely(ret)) { + memset(dst, 0, size); + ret = -EFAULT; + } + + return ret; +} + +const struct bpf_func_proto bpf_copy_from_user_proto = { + .func = bpf_copy_from_user, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b86b1155b748..4108ef3b828b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -29,6 +29,7 @@ #include <linux/bpf_lsm.h> #include <linux/poll.h> #include <linux/bpf-netns.h> +#include <linux/rcupdate_trace.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -1731,10 +1732,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) btf_put(prog->aux->btf); bpf_prog_free_linfo(prog); - if (deferred) - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); - else + if (deferred) { + if (prog->aux->sleepable) + call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); + else + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } else { __bpf_prog_put_rcu(&prog->aux->rcu); + } } static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) @@ -2104,6 +2109,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | + BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32)) return -EINVAL; @@ -2159,6 +2165,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) } prog->aux->offload_requested = !!attr->prog_ifindex; + prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; err = security_bpf_prog_alloc(prog->aux); if (err) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 9be85aa4ec5f..c2b76545153c 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -7,6 +7,8 @@ #include <linux/rbtree_latch.h> #include <linux/perf_event.h> #include <linux/btf.h> +#include <linux/rcupdate_trace.h> +#include <linux/rcupdate_wait.h> /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -210,9 +212,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) * updates to trampoline would change the code from underneath the * preempted task. Hence wait for tasks to voluntarily schedule or go * to userspace. + * The same trampoline can hold both sleepable and non-sleepable progs. + * synchronize_rcu_tasks_trace() is needed to make sure all sleepable + * programs finish executing. + * Wait for these two grace periods together. */ - - synchronize_rcu_tasks(); + synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace); err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, &tr->func.model, flags, tprogs, @@ -344,7 +349,14 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; bpf_image_ksym_del(&tr->ksym); - /* wait for tasks to get out of trampoline before freeing it */ + /* This code will be executed when all bpf progs (both sleepable and + * non-sleepable) went through + * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred(). + * Hence no need for another synchronize_rcu_tasks_trace() here, + * but synchronize_rcu_tasks() is still needed, since trampoline + * may not have had any sleepable programs and we need to wait + * for tasks to get out of trampoline code before freeing it. + */ synchronize_rcu_tasks(); bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); @@ -394,6 +406,16 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) rcu_read_unlock(); } +void notrace __bpf_prog_enter_sleepable(void) +{ + rcu_read_lock_trace(); +} + +void notrace __bpf_prog_exit_sleepable(void) +{ + rcu_read_unlock_trace(); +} + int __weak arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f5a9f51cc03..3ebfdb7bd427 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,7 @@ #include <linux/ctype.h> #include <linux/error-injection.h> #include <linux/bpf_lsm.h> +#include <linux/btf_ids.h> #include "disasm.h" @@ -9367,6 +9368,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } + if (prog->aux->sleepable) + switch (map->map_type) { + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_ARRAY: + if (!is_preallocated_map(map)) { + verbose(env, + "Sleepable programs can only use preallocated hash maps\n"); + return -EINVAL; + } + break; + default: + verbose(env, + "Sleepable programs can only use array and hash maps\n"); + return -EINVAL; + } + return 0; } @@ -10985,6 +11003,36 @@ static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) return -EINVAL; } +/* non exhaustive list of sleepable bpf_lsm_*() functions */ +BTF_SET_START(btf_sleepable_lsm_hooks) +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_lsm_file_mprotect) +BTF_ID(func, bpf_lsm_bprm_committed_creds) +#endif +BTF_SET_END(btf_sleepable_lsm_hooks) + +static int check_sleepable_lsm_hook(u32 btf_id) +{ + return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id); +} + +/* list of non-sleepable functions that are otherwise on + * ALLOW_ERROR_INJECTION list + */ +BTF_SET_START(btf_non_sleepable_error_inject) +/* Three functions below can be called from sleepable and non-sleepable context. + * Assume non-sleepable from bpf safety point of view. + */ +BTF_ID(func, __add_to_page_cache_locked) +BTF_ID(func, should_fail_alloc_page) +BTF_ID(func, should_failslab) +BTF_SET_END(btf_non_sleepable_error_inject) + +static int check_non_sleepable_error_inject(u32 btf_id) +{ + return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id); +} + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -11002,6 +11050,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) long addr; u64 key; + if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM) { + verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + return -EINVAL; + } + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) return check_struct_ops_btf_id(env); @@ -11210,13 +11264,36 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } } - if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + if (prog->aux->sleepable) { + ret = -EINVAL; + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + /* fentry/fexit/fmod_ret progs can be sleepable only if they are + * attached to ALLOW_ERROR_INJECTION and are not in denylist. + */ + if (!check_non_sleepable_error_inject(btf_id) && + within_error_injection_list(addr)) + ret = 0; + break; + case BPF_PROG_TYPE_LSM: + /* LSM progs check that they are attached to bpf_lsm_*() funcs. + * Only some of them are sleepable. + */ + if (check_sleepable_lsm_hook(btf_id)) + ret = 0; + break; + default: + break; + } + if (ret) + verbose(env, "%s is not sleepable\n", + prog->aux->attach_func_name); + } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { ret = check_attach_modify_return(prog, addr); if (ret) verbose(env, "%s() is not modifiable\n", prog->aux->attach_func_name); } - if (ret) goto out; tr->func.addr = (void *)addr; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d973d891f2e2..b2a5380eb187 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1228,6 +1228,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_jiffies64_proto; case BPF_FUNC_get_task_stack: return &bpf_get_task_stack_proto; + case BPF_FUNC_copy_from_user: + return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; default: return NULL; } diff --git a/mm/filemap.c b/mm/filemap.c index 1aaea26556cc..054d93a86f8a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -827,10 +827,10 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); -static int __add_to_page_cache_locked(struct page *page, - struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask, - void **shadowp) +noinline int __add_to_page_cache_locked(struct page *page, + struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, + void **shadowp) { XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0e2bab486fea..cd8d8f0326fc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3477,7 +3477,7 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ -static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return __should_fail_alloc_page(gfp_mask, order); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ef7af384f5ee..a613750d5515 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -346,6 +346,14 @@ enum bpf_link_type { /* The verifier internal test flag. Behavior is undefined */ #define BPF_F_TEST_STATE_FREQ (1U << 3) +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * @@ -3561,6 +3569,13 @@ union bpf_attr { * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of copy_from_user(). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3711,6 +3726,7 @@ union bpf_attr { FN(inode_storage_get), \ FN(inode_storage_delete), \ FN(d_path), \ + FN(copy_from_user), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8cdb2528482e..b688aadf09c5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -208,6 +208,7 @@ struct bpf_sec_def { bool is_exp_attach_type_optional; bool is_attachable; bool is_attach_btf; + bool is_sleepable; attach_fn_t attach_fn; }; @@ -6291,6 +6292,8 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, /* couldn't guess, but user might manually specify */ continue; + if (prog->sec_def->is_sleepable) + prog->prog_flags |= BPF_F_SLEEPABLE; bpf_program__set_type(prog, prog->sec_def->prog_type); bpf_program__set_expected_attach_type(prog, prog->sec_def->expected_attach_type); @@ -7559,6 +7562,21 @@ static const struct bpf_sec_def section_defs[] = { .expected_attach_type = BPF_TRACE_FEXIT, .is_attach_btf = true, .attach_fn = attach_trace), + SEC_DEF("fentry.s/", TRACING, + .expected_attach_type = BPF_TRACE_FENTRY, + .is_attach_btf = true, + .is_sleepable = true, + .attach_fn = attach_trace), + SEC_DEF("fmod_ret.s/", TRACING, + .expected_attach_type = BPF_MODIFY_RETURN, + .is_attach_btf = true, + .is_sleepable = true, + .attach_fn = attach_trace), + SEC_DEF("fexit.s/", TRACING, + .expected_attach_type = BPF_TRACE_FEXIT, + .is_attach_btf = true, + .is_sleepable = true, + .attach_fn = attach_trace), SEC_DEF("freplace/", EXT, .is_attach_btf = true, .attach_fn = attach_trace), @@ -7566,6 +7584,11 @@ static const struct bpf_sec_def section_defs[] = { .is_attach_btf = true, .expected_attach_type = BPF_LSM_MAC, .attach_fn = attach_lsm), + SEC_DEF("lsm.s/", LSM, + .is_attach_btf = true, + .is_sleepable = true, + .expected_attach_type = BPF_LSM_MAC, + .attach_fn = attach_lsm), SEC_DEF("iter/", TRACING, .expected_attach_type = BPF_TRACE_ITER, .is_attach_btf = true, @@ -8288,7 +8311,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, prog->prog_ifindex = attr->ifindex; prog->log_level = attr->log_level; - prog->prog_flags = attr->prog_flags; + prog->prog_flags |= attr->prog_flags; if (!first_prog) first_prog = prog; } diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 944ad4721c83..1a427685a8a8 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -317,6 +317,7 @@ extern const struct bench bench_trig_tp; extern const struct bench bench_trig_rawtp; extern const struct bench bench_trig_kprobe; extern const struct bench bench_trig_fentry; +extern const struct bench bench_trig_fentry_sleep; extern const struct bench bench_trig_fmodret; extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; @@ -338,6 +339,7 @@ static const struct bench *benchs[] = { &bench_trig_rawtp, &bench_trig_kprobe, &bench_trig_fentry, + &bench_trig_fentry_sleep, &bench_trig_fmodret, &bench_rb_libbpf, &bench_rb_custom, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 49c22832f216..2a0b6c9885a4 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -90,6 +90,12 @@ static void trigger_fentry_setup() attach_bpf(ctx.skel->progs.bench_trigger_fentry); } +static void trigger_fentry_sleep_setup() +{ + setup_ctx(); + attach_bpf(ctx.skel->progs.bench_trigger_fentry_sleep); +} + static void trigger_fmodret_setup() { setup_ctx(); @@ -155,6 +161,17 @@ const struct bench bench_trig_fentry = { .report_final = hits_drops_report_final, }; +const struct bench bench_trig_fentry_sleep = { + .name = "trig-fentry-sleep", + .validate = trigger_validate, + .setup = trigger_fentry_sleep_setup, + .producer_thread = trigger_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + const struct bench bench_trig_fmodret = { .name = "trig-fmodret", .validate = trigger_validate, diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index b17eb2045c1d..6ab29226c99b 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -10,6 +10,7 @@ #include <unistd.h> #include <malloc.h> #include <stdlib.h> +#include <unistd.h> #include "lsm.skel.h" @@ -55,6 +56,7 @@ void test_test_lsm(void) { struct lsm *skel = NULL; int err, duration = 0; + int buf = 1234; skel = lsm__open_and_load(); if (CHECK(!skel, "skel_load", "lsm skeleton failed\n")) @@ -81,6 +83,13 @@ void test_test_lsm(void) CHECK(skel->bss->mprotect_count != 1, "mprotect_count", "mprotect_count = %d\n", skel->bss->mprotect_count); + syscall(__NR_setdomainname, &buf, -2L); + syscall(__NR_setdomainname, 0, -3L); + syscall(__NR_setdomainname, ~0L, -4L); + + CHECK(skel->bss->copy_test != 3, "copy_test", + "copy_test = %d\n", skel->bss->copy_test); + close_prog: lsm__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c index b4598d4bc4f7..49fa6ca99755 100644 --- a/tools/testing/selftests/bpf/progs/lsm.c +++ b/tools/testing/selftests/bpf/progs/lsm.c @@ -9,16 +9,41 @@ #include <bpf/bpf_tracing.h> #include <errno.h> +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} hash SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} lru_hash SEC(".maps"); + char _license[] SEC("license") = "GPL"; int monitored_pid = 0; int mprotect_count = 0; int bprm_count = 0; -SEC("lsm/file_mprotect") +SEC("lsm.s/file_mprotect") int BPF_PROG(test_int_hook, struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot, int ret) { + char args[64]; + __u32 key = 0; + __u64 *value; + if (ret != 0) return ret; @@ -28,6 +53,18 @@ int BPF_PROG(test_int_hook, struct vm_area_struct *vma, is_stack = (vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack); + bpf_copy_from_user(args, sizeof(args), (void *)vma->vm_mm->arg_start); + + value = bpf_map_lookup_elem(&array, &key); + if (value) + *value = 0; + value = bpf_map_lookup_elem(&hash, &key); + if (value) + *value = 0; + value = bpf_map_lookup_elem(&lru_hash, &key); + if (value) + *value = 0; + if (is_stack && monitored_pid == pid) { mprotect_count++; ret = -EPERM; @@ -36,7 +73,7 @@ int BPF_PROG(test_int_hook, struct vm_area_struct *vma, return ret; } -SEC("lsm/bprm_committed_creds") +SEC("lsm.s/bprm_committed_creds") int BPF_PROG(test_void_hook, struct linux_binprm *bprm) { __u32 pid = bpf_get_current_pid_tgid() >> 32; @@ -46,3 +83,28 @@ int BPF_PROG(test_void_hook, struct linux_binprm *bprm) return 0; } +SEC("lsm/task_free") /* lsm/ is ok, lsm.s/ fails */ +int BPF_PROG(test_task_free, struct task_struct *task) +{ + return 0; +} + +int copy_test = 0; + +SEC("fentry.s/__x64_sys_setdomainname") +int BPF_PROG(test_sys_setdomainname, struct pt_regs *regs) +{ + void *ptr = (void *)PT_REGS_PARM1(regs); + int len = PT_REGS_PARM2(regs); + int buf = 0; + long ret; + + ret = bpf_copy_from_user(&buf, sizeof(buf), ptr); + if (len == -2 && ret == 0 && buf == 1234) + copy_test++; + if (len == -3 && ret == -EFAULT) + copy_test++; + if (len == -4 && ret == -EFAULT) + copy_test++; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 8b36b6640e7e..9a4d09590b3d 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -39,6 +39,13 @@ int bench_trigger_fentry(void *ctx) return 0; } +SEC("fentry.s/__x64_sys_getpgid") +int bench_trigger_fentry_sleep(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} + SEC("fmod_ret/__x64_sys_getpgid") int bench_trigger_fmodret(void *ctx) { |