aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Borkmann <daniel@iogearbox.net>2020-08-28 21:20:33 +0200
committerDaniel Borkmann <daniel@iogearbox.net>2020-08-28 21:20:38 +0200
commit10496f261ed30592c6a7f8315f6b5ec055db624a (patch)
tree0751db49ba3afe3976061f9c6fd7ed2c44b6a5fe
parentd557ea39a5f894630c403b78703ac92b08b7dd62 (diff)
parente68a144547fc7a956952260539cb7b8bb9afbcc0 (diff)
Merge branch 'bpf-sleepable'
Alexei Starovoitov says: ==================== v2->v3: - switched to minimal allowlist approach. Essentially that means that syscall entry, few btrfs allow_error_inject functions, should_fail_bio(), and two LSM hooks: file_mprotect and bprm_committed_creds are the only hooks that allow attaching of sleepable BPF programs. When comprehensive analysis of LSM hooks will be done this allowlist will be extended. - added patch 1 that fixes prototypes of two mm functions to reliably work with error injection. It's also necessary for resolve_btfids tool to recognize these two funcs, but that's secondary. v1->v2: - split fmod_ret fix into separate patch - added denylist v1: This patch set introduces the minimal viable support for sleepable bpf programs. In this patch only fentry/fexit/fmod_ret and lsm progs can be sleepable. Only array and pre-allocated hash and lru maps allowed. Here is 'perf report' difference of sleepable vs non-sleepable: 3.86% bench [k] __srcu_read_unlock 3.22% bench [k] __srcu_read_lock 0.92% bench [k] bpf_prog_740d4210cdcd99a3_bench_trigger_fentry_sleep 0.50% bench [k] bpf_trampoline_10297 0.26% bench [k] __bpf_prog_exit_sleepable 0.21% bench [k] __bpf_prog_enter_sleepable vs 0.88% bench [k] bpf_prog_740d4210cdcd99a3_bench_trigger_fentry 0.84% bench [k] bpf_trampoline_10297 0.13% bench [k] __bpf_prog_enter 0.12% bench [k] __bpf_prog_exit vs 0.79% bench [k] bpf_prog_740d4210cdcd99a3_bench_trigger_fentry_sleep 0.72% bench [k] bpf_trampoline_10381 0.31% bench [k] __bpf_prog_exit_sleepable 0.29% bench [k] __bpf_prog_enter_sleepable Sleepable vs non-sleepable program invocation overhead is only marginally higher due to rcu_trace. srcu approach is much slower. ==================== Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
-rw-r--r--arch/x86/net/bpf_jit_comp.c32
-rw-r--r--include/linux/bpf.h4
-rw-r--r--include/uapi/linux/bpf.h16
-rw-r--r--init/Kconfig1
-rw-r--r--kernel/bpf/arraymap.c1
-rw-r--r--kernel/bpf/hashtab.c12
-rw-r--r--kernel/bpf/helpers.c22
-rw-r--r--kernel/bpf/syscall.c13
-rw-r--r--kernel/bpf/trampoline.c28
-rw-r--r--kernel/bpf/verifier.c81
-rw-r--r--kernel/trace/bpf_trace.c2
-rw-r--r--mm/filemap.c8
-rw-r--r--mm/page_alloc.c2
-rw-r--r--tools/include/uapi/linux/bpf.h16
-rw-r--r--tools/lib/bpf/libbpf.c25
-rw-r--r--tools/testing/selftests/bpf/bench.c2
-rw-r--r--tools/testing/selftests/bpf/benchs/bench_trigger.c17
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_lsm.c9
-rw-r--r--tools/testing/selftests/bpf/progs/lsm.c66
-rw-r--r--tools/testing/selftests/bpf/progs/trigger_bench.c7
20 files changed, 331 insertions, 33 deletions
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 42b6709e6dc7..7d9ea7b41c71 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1379,10 +1379,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
u8 *prog = *pprog;
int cnt = 0;
- if (emit_call(&prog, __bpf_prog_enter, prog))
- return -EINVAL;
- /* remember prog start time returned by __bpf_prog_enter */
- emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+ if (p->aux->sleepable) {
+ if (emit_call(&prog, __bpf_prog_enter_sleepable, prog))
+ return -EINVAL;
+ } else {
+ if (emit_call(&prog, __bpf_prog_enter, prog))
+ return -EINVAL;
+ /* remember prog start time returned by __bpf_prog_enter */
+ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+ }
/* arg1: lea rdi, [rbp - stack_size] */
EMIT4(0x48, 0x8D, 0x7D, -stack_size);
@@ -1402,13 +1407,18 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
if (mod_ret)
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
- /* arg1: mov rdi, progs[i] */
- emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
- (u32) (long) p);
- /* arg2: mov rsi, rbx <- start time in nsec */
- emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
- if (emit_call(&prog, __bpf_prog_exit, prog))
- return -EINVAL;
+ if (p->aux->sleepable) {
+ if (emit_call(&prog, __bpf_prog_exit_sleepable, prog))
+ return -EINVAL;
+ } else {
+ /* arg1: mov rdi, progs[i] */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
+ (u32) (long) p);
+ /* arg2: mov rsi, rbx <- start time in nsec */
+ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+ if (emit_call(&prog, __bpf_prog_exit, prog))
+ return -EINVAL;
+ }
*pprog = prog;
return 0;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index dbba82a80087..c6d9f2c444f4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -539,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
/* these two functions are called from generated trampoline */
u64 notrace __bpf_prog_enter(void);
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
+void notrace __bpf_prog_enter_sleepable(void);
+void notrace __bpf_prog_exit_sleepable(void);
struct bpf_ksym {
unsigned long start;
@@ -734,6 +736,7 @@ struct bpf_prog_aux {
bool offload_requested;
bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
bool func_proto_unreliable;
+ bool sleepable;
enum bpf_tramp_prog_type trampoline_prog_type;
struct bpf_trampoline *trampoline;
struct hlist_node tramp_hlist;
@@ -1781,6 +1784,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto;
+extern const struct bpf_func_proto bpf_copy_from_user_proto;
const struct bpf_func_proto *bpf_tracing_func_proto(
enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ef7af384f5ee..a613750d5515 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -346,6 +346,14 @@ enum bpf_link_type {
/* The verifier internal test flag. Behavior is undefined */
#define BPF_F_TEST_STATE_FREQ (1U << 3)
+/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
+ * restrict map and helper usage for such programs. Sleepable BPF programs can
+ * only be attached to hooks where kernel execution context allows sleeping.
+ * Such programs are allowed to use helpers that may sleep like
+ * bpf_copy_from_user().
+ */
+#define BPF_F_SLEEPABLE (1U << 4)
+
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
*
@@ -3561,6 +3569,13 @@ union bpf_attr {
* On success, the strictly positive length of the string,
* including the trailing NUL character. On error, a negative
* value.
+ *
+ * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
+ * Description
+ * Read *size* bytes from user space address *user_ptr* and store
+ * the data in *dst*. This is a wrapper of copy_from_user().
+ * Return
+ * 0 on success, or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3711,6 +3726,7 @@ union bpf_attr {
FN(inode_storage_get), \
FN(inode_storage_delete), \
FN(d_path), \
+ FN(copy_from_user), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/init/Kconfig b/init/Kconfig
index fc10f7ede5f6..6ecc00e130ff 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1691,6 +1691,7 @@ config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
+ select TASKS_TRACE_RCU
default n
help
Enable the bpf() system call that allows to manipulate eBPF
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index d851ebbcf302..e046fb7d17cd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -10,6 +10,7 @@
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
#include "map_in_map.h"
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index ad80f45774e7..fe0e06284d33 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -9,6 +9,7 @@
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
@@ -577,8 +578,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
u32 hash, key_size;
- /* Must be called with rcu_read_lock. */
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -941,7 +941,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -1032,7 +1032,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -1220,7 +1220,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret = -ENOENT;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
@@ -1252,7 +1252,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
u32 hash, key_size;
int ret = -ENOENT;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());
key_size = map->key_size;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index be43ab3e619f..5cc7425ee476 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -601,6 +601,28 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
+ const void __user *, user_ptr)
+{
+ int ret = copy_from_user(dst, user_ptr, size);
+
+ if (unlikely(ret)) {
+ memset(dst, 0, size);
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_proto = {
+ .func = bpf_copy_from_user,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b86b1155b748..4108ef3b828b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -29,6 +29,7 @@
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
#include <linux/bpf-netns.h>
+#include <linux/rcupdate_trace.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -1731,10 +1732,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
btf_put(prog->aux->btf);
bpf_prog_free_linfo(prog);
- if (deferred)
- call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
- else
+ if (deferred) {
+ if (prog->aux->sleepable)
+ call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
+ else
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ } else {
__bpf_prog_put_rcu(&prog->aux->rcu);
+ }
}
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
@@ -2104,6 +2109,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
+ BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32))
return -EINVAL;
@@ -2159,6 +2165,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
}
prog->aux->offload_requested = !!attr->prog_ifindex;
+ prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
err = security_bpf_prog_alloc(prog->aux);
if (err)
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 9be85aa4ec5f..c2b76545153c 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -7,6 +7,8 @@
#include <linux/rbtree_latch.h>
#include <linux/perf_event.h>
#include <linux/btf.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -210,9 +212,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
* updates to trampoline would change the code from underneath the
* preempted task. Hence wait for tasks to voluntarily schedule or go
* to userspace.
+ * The same trampoline can hold both sleepable and non-sleepable progs.
+ * synchronize_rcu_tasks_trace() is needed to make sure all sleepable
+ * programs finish executing.
+ * Wait for these two grace periods together.
*/
-
- synchronize_rcu_tasks();
+ synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);
err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
&tr->func.model, flags, tprogs,
@@ -344,7 +349,14 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
goto out;
bpf_image_ksym_del(&tr->ksym);
- /* wait for tasks to get out of trampoline before freeing it */
+ /* This code will be executed when all bpf progs (both sleepable and
+ * non-sleepable) went through
+ * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred().
+ * Hence no need for another synchronize_rcu_tasks_trace() here,
+ * but synchronize_rcu_tasks() is still needed, since trampoline
+ * may not have had any sleepable programs and we need to wait
+ * for tasks to get out of trampoline code before freeing it.
+ */
synchronize_rcu_tasks();
bpf_jit_free_exec(tr->image);
hlist_del(&tr->hlist);
@@ -394,6 +406,16 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
rcu_read_unlock();
}
+void notrace __bpf_prog_enter_sleepable(void)
+{
+ rcu_read_lock_trace();
+}
+
+void notrace __bpf_prog_exit_sleepable(void)
+{
+ rcu_read_unlock_trace();
+}
+
int __weak
arch_prepare_bpf_trampoline(void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6f5a9f51cc03..3ebfdb7bd427 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21,6 +21,7 @@
#include <linux/ctype.h>
#include <linux/error-injection.h>
#include <linux/bpf_lsm.h>
+#include <linux/btf_ids.h>
#include "disasm.h"
@@ -9367,6 +9368,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
+ if (prog->aux->sleepable)
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_LRU_HASH:
+ case BPF_MAP_TYPE_ARRAY:
+ if (!is_preallocated_map(map)) {
+ verbose(env,
+ "Sleepable programs can only use preallocated hash maps\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ verbose(env,
+ "Sleepable programs can only use array and hash maps\n");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -10985,6 +11003,36 @@ static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr)
return -EINVAL;
}
+/* non exhaustive list of sleepable bpf_lsm_*() functions */
+BTF_SET_START(btf_sleepable_lsm_hooks)
+#ifdef CONFIG_BPF_LSM
+BTF_ID(func, bpf_lsm_file_mprotect)
+BTF_ID(func, bpf_lsm_bprm_committed_creds)
+#endif
+BTF_SET_END(btf_sleepable_lsm_hooks)
+
+static int check_sleepable_lsm_hook(u32 btf_id)
+{
+ return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id);
+}
+
+/* list of non-sleepable functions that are otherwise on
+ * ALLOW_ERROR_INJECTION list
+ */
+BTF_SET_START(btf_non_sleepable_error_inject)
+/* Three functions below can be called from sleepable and non-sleepable context.
+ * Assume non-sleepable from bpf safety point of view.
+ */
+BTF_ID(func, __add_to_page_cache_locked)
+BTF_ID(func, should_fail_alloc_page)
+BTF_ID(func, should_failslab)
+BTF_SET_END(btf_non_sleepable_error_inject)
+
+static int check_non_sleepable_error_inject(u32 btf_id)
+{
+ return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
+}
+
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
@@ -11002,6 +11050,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
long addr;
u64 key;
+ if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_LSM) {
+ verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
+ return -EINVAL;
+ }
+
if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
return check_struct_ops_btf_id(env);
@@ -11210,13 +11264,36 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
}
}
- if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
+ if (prog->aux->sleepable) {
+ ret = -EINVAL;
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ /* fentry/fexit/fmod_ret progs can be sleepable only if they are
+ * attached to ALLOW_ERROR_INJECTION and are not in denylist.
+ */
+ if (!check_non_sleepable_error_inject(btf_id) &&
+ within_error_injection_list(addr))
+ ret = 0;
+ break;
+ case BPF_PROG_TYPE_LSM:
+ /* LSM progs check that they are attached to bpf_lsm_*() funcs.
+ * Only some of them are sleepable.
+ */
+ if (check_sleepable_lsm_hook(btf_id))
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+ if (ret)
+ verbose(env, "%s is not sleepable\n",
+ prog->aux->attach_func_name);
+ } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
ret = check_attach_modify_return(prog, addr);
if (ret)
verbose(env, "%s() is not modifiable\n",
prog->aux->attach_func_name);
}
-
if (ret)
goto out;
tr->func.addr = (void *)addr;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d973d891f2e2..b2a5380eb187 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1228,6 +1228,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_jiffies64_proto;
case BPF_FUNC_get_task_stack:
return &bpf_get_task_stack_proto;
+ case BPF_FUNC_copy_from_user:
+ return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
default:
return NULL;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 1aaea26556cc..054d93a86f8a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -827,10 +827,10 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
-static int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
- void **shadowp)
+noinline int __add_to_page_cache_locked(struct page *page,
+ struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask,
+ void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0e2bab486fea..cd8d8f0326fc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3477,7 +3477,7 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
-static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ef7af384f5ee..a613750d5515 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -346,6 +346,14 @@ enum bpf_link_type {
/* The verifier internal test flag. Behavior is undefined */
#define BPF_F_TEST_STATE_FREQ (1U << 3)
+/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
+ * restrict map and helper usage for such programs. Sleepable BPF programs can
+ * only be attached to hooks where kernel execution context allows sleeping.
+ * Such programs are allowed to use helpers that may sleep like
+ * bpf_copy_from_user().
+ */
+#define BPF_F_SLEEPABLE (1U << 4)
+
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
*
@@ -3561,6 +3569,13 @@ union bpf_attr {
* On success, the strictly positive length of the string,
* including the trailing NUL character. On error, a negative
* value.
+ *
+ * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
+ * Description
+ * Read *size* bytes from user space address *user_ptr* and store
+ * the data in *dst*. This is a wrapper of copy_from_user().
+ * Return
+ * 0 on success, or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3711,6 +3726,7 @@ union bpf_attr {
FN(inode_storage_get), \
FN(inode_storage_delete), \
FN(d_path), \
+ FN(copy_from_user), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 8cdb2528482e..b688aadf09c5 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -208,6 +208,7 @@ struct bpf_sec_def {
bool is_exp_attach_type_optional;
bool is_attachable;
bool is_attach_btf;
+ bool is_sleepable;
attach_fn_t attach_fn;
};
@@ -6291,6 +6292,8 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz,
/* couldn't guess, but user might manually specify */
continue;
+ if (prog->sec_def->is_sleepable)
+ prog->prog_flags |= BPF_F_SLEEPABLE;
bpf_program__set_type(prog, prog->sec_def->prog_type);
bpf_program__set_expected_attach_type(prog,
prog->sec_def->expected_attach_type);
@@ -7559,6 +7562,21 @@ static const struct bpf_sec_def section_defs[] = {
.expected_attach_type = BPF_TRACE_FEXIT,
.is_attach_btf = true,
.attach_fn = attach_trace),
+ SEC_DEF("fentry.s/", TRACING,
+ .expected_attach_type = BPF_TRACE_FENTRY,
+ .is_attach_btf = true,
+ .is_sleepable = true,
+ .attach_fn = attach_trace),
+ SEC_DEF("fmod_ret.s/", TRACING,
+ .expected_attach_type = BPF_MODIFY_RETURN,
+ .is_attach_btf = true,
+ .is_sleepable = true,
+ .attach_fn = attach_trace),
+ SEC_DEF("fexit.s/", TRACING,
+ .expected_attach_type = BPF_TRACE_FEXIT,
+ .is_attach_btf = true,
+ .is_sleepable = true,
+ .attach_fn = attach_trace),
SEC_DEF("freplace/", EXT,
.is_attach_btf = true,
.attach_fn = attach_trace),
@@ -7566,6 +7584,11 @@ static const struct bpf_sec_def section_defs[] = {
.is_attach_btf = true,
.expected_attach_type = BPF_LSM_MAC,
.attach_fn = attach_lsm),
+ SEC_DEF("lsm.s/", LSM,
+ .is_attach_btf = true,
+ .is_sleepable = true,
+ .expected_attach_type = BPF_LSM_MAC,
+ .attach_fn = attach_lsm),
SEC_DEF("iter/", TRACING,
.expected_attach_type = BPF_TRACE_ITER,
.is_attach_btf = true,
@@ -8288,7 +8311,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
prog->prog_ifindex = attr->ifindex;
prog->log_level = attr->log_level;
- prog->prog_flags = attr->prog_flags;
+ prog->prog_flags |= attr->prog_flags;
if (!first_prog)
first_prog = prog;
}
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 944ad4721c83..1a427685a8a8 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -317,6 +317,7 @@ extern const struct bench bench_trig_tp;
extern const struct bench bench_trig_rawtp;
extern const struct bench bench_trig_kprobe;
extern const struct bench bench_trig_fentry;
+extern const struct bench bench_trig_fentry_sleep;
extern const struct bench bench_trig_fmodret;
extern const struct bench bench_rb_libbpf;
extern const struct bench bench_rb_custom;
@@ -338,6 +339,7 @@ static const struct bench *benchs[] = {
&bench_trig_rawtp,
&bench_trig_kprobe,
&bench_trig_fentry,
+ &bench_trig_fentry_sleep,
&bench_trig_fmodret,
&bench_rb_libbpf,
&bench_rb_custom,
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 49c22832f216..2a0b6c9885a4 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -90,6 +90,12 @@ static void trigger_fentry_setup()
attach_bpf(ctx.skel->progs.bench_trigger_fentry);
}
+static void trigger_fentry_sleep_setup()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.bench_trigger_fentry_sleep);
+}
+
static void trigger_fmodret_setup()
{
setup_ctx();
@@ -155,6 +161,17 @@ const struct bench bench_trig_fentry = {
.report_final = hits_drops_report_final,
};
+const struct bench bench_trig_fentry_sleep = {
+ .name = "trig-fentry-sleep",
+ .validate = trigger_validate,
+ .setup = trigger_fentry_sleep_setup,
+ .producer_thread = trigger_producer,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
const struct bench bench_trig_fmodret = {
.name = "trig-fmodret",
.validate = trigger_validate,
diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c
index b17eb2045c1d..6ab29226c99b 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c
@@ -10,6 +10,7 @@
#include <unistd.h>
#include <malloc.h>
#include <stdlib.h>
+#include <unistd.h>
#include "lsm.skel.h"
@@ -55,6 +56,7 @@ void test_test_lsm(void)
{
struct lsm *skel = NULL;
int err, duration = 0;
+ int buf = 1234;
skel = lsm__open_and_load();
if (CHECK(!skel, "skel_load", "lsm skeleton failed\n"))
@@ -81,6 +83,13 @@ void test_test_lsm(void)
CHECK(skel->bss->mprotect_count != 1, "mprotect_count",
"mprotect_count = %d\n", skel->bss->mprotect_count);
+ syscall(__NR_setdomainname, &buf, -2L);
+ syscall(__NR_setdomainname, 0, -3L);
+ syscall(__NR_setdomainname, ~0L, -4L);
+
+ CHECK(skel->bss->copy_test != 3, "copy_test",
+ "copy_test = %d\n", skel->bss->copy_test);
+
close_prog:
lsm__destroy(skel);
}
diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c
index b4598d4bc4f7..49fa6ca99755 100644
--- a/tools/testing/selftests/bpf/progs/lsm.c
+++ b/tools/testing/selftests/bpf/progs/lsm.c
@@ -9,16 +9,41 @@
#include <bpf/bpf_tracing.h>
#include <errno.h>
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} array SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} hash SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_LRU_HASH);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} lru_hash SEC(".maps");
+
char _license[] SEC("license") = "GPL";
int monitored_pid = 0;
int mprotect_count = 0;
int bprm_count = 0;
-SEC("lsm/file_mprotect")
+SEC("lsm.s/file_mprotect")
int BPF_PROG(test_int_hook, struct vm_area_struct *vma,
unsigned long reqprot, unsigned long prot, int ret)
{
+ char args[64];
+ __u32 key = 0;
+ __u64 *value;
+
if (ret != 0)
return ret;
@@ -28,6 +53,18 @@ int BPF_PROG(test_int_hook, struct vm_area_struct *vma,
is_stack = (vma->vm_start <= vma->vm_mm->start_stack &&
vma->vm_end >= vma->vm_mm->start_stack);
+ bpf_copy_from_user(args, sizeof(args), (void *)vma->vm_mm->arg_start);
+
+ value = bpf_map_lookup_elem(&array, &key);
+ if (value)
+ *value = 0;
+ value = bpf_map_lookup_elem(&hash, &key);
+ if (value)
+ *value = 0;
+ value = bpf_map_lookup_elem(&lru_hash, &key);
+ if (value)
+ *value = 0;
+
if (is_stack && monitored_pid == pid) {
mprotect_count++;
ret = -EPERM;
@@ -36,7 +73,7 @@ int BPF_PROG(test_int_hook, struct vm_area_struct *vma,
return ret;
}
-SEC("lsm/bprm_committed_creds")
+SEC("lsm.s/bprm_committed_creds")
int BPF_PROG(test_void_hook, struct linux_binprm *bprm)
{
__u32 pid = bpf_get_current_pid_tgid() >> 32;
@@ -46,3 +83,28 @@ int BPF_PROG(test_void_hook, struct linux_binprm *bprm)
return 0;
}
+SEC("lsm/task_free") /* lsm/ is ok, lsm.s/ fails */
+int BPF_PROG(test_task_free, struct task_struct *task)
+{
+ return 0;
+}
+
+int copy_test = 0;
+
+SEC("fentry.s/__x64_sys_setdomainname")
+int BPF_PROG(test_sys_setdomainname, struct pt_regs *regs)
+{
+ void *ptr = (void *)PT_REGS_PARM1(regs);
+ int len = PT_REGS_PARM2(regs);
+ int buf = 0;
+ long ret;
+
+ ret = bpf_copy_from_user(&buf, sizeof(buf), ptr);
+ if (len == -2 && ret == 0 && buf == 1234)
+ copy_test++;
+ if (len == -3 && ret == -EFAULT)
+ copy_test++;
+ if (len == -4 && ret == -EFAULT)
+ copy_test++;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 8b36b6640e7e..9a4d09590b3d 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -39,6 +39,13 @@ int bench_trigger_fentry(void *ctx)
return 0;
}
+SEC("fentry.s/__x64_sys_getpgid")
+int bench_trigger_fentry_sleep(void *ctx)
+{
+ __sync_add_and_fetch(&hits, 1);
+ return 0;
+}
+
SEC("fmod_ret/__x64_sys_getpgid")
int bench_trigger_fmodret(void *ctx)
{