From 77cd0d7b3f257fd0e3096b4fdcff1a7d38e99e10 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 14 Aug 2019 09:27:17 +0200 Subject: xsk: add support for need_wakeup flag in AF_XDP rings This commit adds support for a new flag called need_wakeup in the AF_XDP Tx and fill rings. When this flag is set, it means that the application has to explicitly wake up the kernel Rx (for the bit in the fill ring) or kernel Tx (for the bit in the Tx ring) processing by issuing a syscall. Poll() can wake up both depending on the flags submitted and sendto() will wake up Tx processing only. The main reason for introducing this new flag is to be able to efficiently support the case when the application and driver are executing on the same core. Previously, the driver was just busy-spinning on the fill ring if it ran out of buffers in the HW and there were none on the fill ring. This approach works when the application is running on another core, as it can replenish the fill ring while the driver is busy-spinning. However, it is a poor approach if both of them are running on the same core, as the probability of the fill ring getting more entries while the driver is busy-spinning is zero. With this new feature, the driver instead sets the need_wakeup flag and returns to the application. The application can then replenish the fill queue and explicitly wake up the Rx processing in the kernel using the syscall poll(). For Tx, the flag is only set to one if the driver has no outstanding Tx completion interrupts. If it has some, the flag is zero as it will be woken up by a completion interrupt anyway. As a nice side effect, this new flag also improves the performance of the case where application and driver are running on two different cores, as it reduces the number of syscalls to the kernel: the kernel tells user space when it needs to be woken up by a syscall, which eliminates many of them. This flag needs some simple driver support. If the driver does not support it, the Rx flag is always zero and the Tx flag is always one. This makes any application relying on this feature default to the old behaviour of not requiring any syscalls in the Rx path and always having to call sendto() in the Tx path. For backwards compatibility reasons, this feature has to be explicitly turned on using a new bind flag (XDP_USE_NEED_WAKEUP). I recommend that you always turn it on, as it has so far always had a positive performance impact. The name and inspiration for the flag have been taken from io_uring by Jens Axboe. Details about this feature in io_uring can be found in http://kernel.dk/io_uring.pdf, section 8.3. Signed-off-by: Magnus Karlsson Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index faaa5ca2a117..62b80d57b72a 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -16,6 +16,15 @@ #define XDP_SHARED_UMEM (1 << 0) #define XDP_COPY (1 << 1) /* Force copy-mode */ #define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ +/* If this option is set, the driver might go to sleep and in that case + * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be + * set. If it is set, the application needs to explicitly wake up the + * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are + * running the driver and the application on the same core, you should + * use this option so that the kernel will yield to the user space + * application. + */ +#define XDP_USE_NEED_WAKEUP (1 << 3) struct sockaddr_xdp { __u16 sxdp_family; @@ -25,10 +34,14 @@ struct sockaddr_xdp { __u32 sxdp_shared_umem_fd; }; +/* XDP_RING flags */ +#define XDP_RING_NEED_WAKEUP (1 << 0) + struct xdp_ring_offset { __u64 producer; __u64 consumer; __u64 desc; + __u64 flags; }; struct xdp_mmap_offsets { -- cgit
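A minimal user-space sketch of the intended wakeup pattern. The app_ring struct, its flags pointer (derived from the new xdp_ring_offset.flags field) and xsk_fd are application-defined placeholders; only the XDP_RING_NEED_WAKEUP test and the poll()/sendto() kicks come from this patch.

#include <poll.h>
#include <sys/socket.h>
#include <linux/types.h>
#include <linux/if_xdp.h>

/* Application-defined view of an mmap'ed AF_XDP ring. */
struct app_ring {
	__u32 *flags;
};

static void kick_kernel_if_needed(int xsk_fd, struct app_ring *fill,
				  struct app_ring *tx)
{
	if (*fill->flags & XDP_RING_NEED_WAKEUP) {
		/* Wake up Rx processing after replenishing the fill ring. */
		struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN };

		poll(&pfd, 1, 0);
	}

	if (*tx->flags & XDP_RING_NEED_WAKEUP)
		/* sendto() wakes up Tx processing only. */
		sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
}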
From 8f51dfc73bf181f2304e1498f55d5f452e060cbe Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 14 Aug 2019 10:37:49 -0700 Subject: bpf: support cloning sk storage on accept() Add a new helper, bpf_sk_storage_clone, which optionally clones sk storage, and call it from sk_clone_lock(). Cc: Martin KaFai Lau Cc: Yonghong Song Acked-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/net/bpf_sk_storage.h | 10 +++++ include/uapi/linux/bpf.h | 3 ++ net/core/bpf_sk_storage.c | 104 +++++++++++++++++++++++++++++++++++++++++-- net/core/sock.c | 9 ++-- 4 files changed, 120 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index b9dcb02e756b..8e4f831d2e52 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +#ifdef CONFIG_BPF_SYSCALL +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); +#else +static inline int bpf_sk_storage_clone(const struct sock *sk, + struct sock *newsk) +{ + return 0; +} +#endif + #endif /* _BPF_SK_STORAGE_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4393bd4b2419..0ef594ac3899 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -337,6 +337,9 @@ enum bpf_attach_type { #define BPF_F_RDONLY_PROG (1U << 7) #define BPF_F_WRONLY_PROG (1U << 8) +/* Clone map from listener for newly accepted socket */ +#define BPF_F_CLONE (1U << 9) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 94c7f77ecb6b..da5639a5bd3b 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -12,6 +12,9 @@ static atomic_t cache_idx; +#define SK_STORAGE_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_CLONE) + struct bucket { struct hlist_head list; raw_spinlock_t lock; @@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem) kfree_rcu(sk_storage, rcu); } -/* sk_storage->lock must be held and sk_storage->list cannot be empty */ static void __selem_link_sk(struct bpf_sk_storage *sk_storage, struct bpf_sk_storage_elem *selem) { @@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map) return 0; } -/* Called by __sk_destruct() */ +/* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { struct bpf_sk_storage_elem *selem; @@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_sk_storage_map *)map; + /* Note that this map might be concurrently cloned from + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone + * RCU read section to finish before proceeding. New RCU + * read sections should be prevented via bpf_map_inc_not_zero. + */ synchronize_rcu(); /* bpf prog and the userspace can no longer access this map @@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) { - if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries || + if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || + !(attr->map_flags & BPF_F_NO_PREALLOC) || + attr->max_entries || attr->key_size != sizeof(int) || !attr->value_size || /* Enforce BTF for userspace sk dumping */ !attr->btf_key_type_id || !attr->btf_value_type_id) @@ -739,6 +748,95 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key) return err; } +static struct bpf_sk_storage_elem * +bpf_sk_storage_clone_elem(struct sock *newsk, + struct bpf_sk_storage_map *smap, + struct bpf_sk_storage_elem *selem) +{ + struct bpf_sk_storage_elem *copy_selem; + + copy_selem = selem_alloc(smap, newsk, NULL, true); + if (!copy_selem) + return NULL; + + if (map_value_has_spin_lock(&smap->map)) + copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data, true); + else + copy_map_value(&smap->map, SDATA(copy_selem)->data, + SDATA(selem)->data); + + return copy_selem; +} + +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) +{ + struct bpf_sk_storage *new_sk_storage = NULL; + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + int ret = 0; + + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + + if (!sk_storage || hlist_empty(&sk_storage->list)) + goto out; + + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + struct bpf_sk_storage_elem *copy_selem; + struct bpf_sk_storage_map *smap; + struct bpf_map *map; + + smap = rcu_dereference(SDATA(selem)->smap); + if (!(smap->map.map_flags & BPF_F_CLONE)) + continue; + + /* Note that for lockless listeners adding a new element + * here can race with cleanup in bpf_sk_storage_map_free. + * Try to grab the map refcnt to make sure that it's still + * alive and prevent concurrent removal. + */ + map = bpf_map_inc_not_zero(&smap->map, false); + if (IS_ERR(map)) + continue; + + copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem); + if (!copy_selem) { + ret = -ENOMEM; + bpf_map_put(map); + goto out; + } + + if (new_sk_storage) { + selem_link_map(smap, copy_selem); + __selem_link_sk(new_sk_storage, copy_selem); + } else { + ret = sk_storage_alloc(newsk, smap, copy_selem); + if (ret) { + kfree(copy_selem); + atomic_sub(smap->elem_size, + &newsk->sk_omem_alloc); + bpf_map_put(map); + goto out; + } + + new_sk_storage = rcu_dereference(copy_selem->sk_storage); + } + bpf_map_put(map); + } + +out: + rcu_read_unlock(); + + /* In case of an error, don't free anything explicitly here; the + * caller is responsible for calling bpf_sk_storage_free. + */ + + return ret; +} + BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { diff --git a/net/core/sock.c b/net/core/sock.c index d57b0cc995a0..f5e801a9cea4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) goto out; } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); -#ifdef CONFIG_BPF_SYSCALL - RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); -#endif + + if (bpf_sk_storage_clone(sk, newsk)) { + sk_free_unlock_clone(newsk); + newsk = NULL; + goto out; + } newsk->sk_err = 0; newsk->sk_err_soft = 0; -- cgit
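A hedged BPF-side sketch of opting in to cloning, assuming libbpf's BTF-defined map syntax (bpf/bpf_helpers.h); the map name and value type are illustrative. Note that BPF_F_NO_PREALLOC stays mandatory per the alloc_check above, and BTF key/value type info is required for this map type.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct conn_stats {
	__u64 accepted_ts;
};

/* BPF_F_CLONE makes storage set on a listener propagate to sockets
 * cloned from it on accept(). A program would typically populate it
 * on the listener with bpf_sk_storage_get(&sk_stats, sk, NULL,
 * BPF_SK_STORAGE_GET_F_CREATE). */
struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
	__type(key, int);
	__type(value, struct conn_stats);
} sk_stats SEC(".maps");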
From 1b9ed84ecf268904d89edf2908426a8eb3b5a4ba Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 20 Aug 2019 10:31:50 +0100 Subject: bpf: add new BPF_BTF_GET_NEXT_ID syscall command Add a new command for the bpf() system call: BPF_BTF_GET_NEXT_ID is used to cycle through all BTF objects loaded on the system. The motivation is to be able to inspect (list) all BTF objects present on the system. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 4 ++-- kernel/bpf/syscall.c | 4 ++++ 4 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 15ae49862b82..5b9d22338606 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -24,6 +24,9 @@ struct seq_file; struct btf; struct btf_type; +extern struct idr btf_idr; +extern spinlock_t btf_idr_lock; + /* map is generic key/value storage optionally accessible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0ef594ac3899..8aa6126f0b6e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -106,6 +106,7 @@ enum bpf_cmd { BPF_TASK_FD_QUERY, BPF_MAP_LOOKUP_AND_DELETE_ELEM, BPF_MAP_FREEZE, + BPF_BTF_GET_NEXT_ID, }; enum bpf_map_type { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b403dc18486..adb3adcebe3c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -195,8 +195,8 @@ i < btf_type_vlen(struct_type); \ i++, member++) -static DEFINE_IDR(btf_idr); -static DEFINE_SPINLOCK(btf_idr_lock); +DEFINE_IDR(btf_idr); +DEFINE_SPINLOCK(btf_idr_lock); struct btf { void *data; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf8052b016e7..c0f62fd67c6b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2884,6 +2884,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &map_idr, &map_idr_lock); break; + case BPF_BTF_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &btf_idr, &btf_idr_lock); + break; case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; -- cgit
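A hedged user-space sketch of walking all loaded BTF IDs with the new command via the raw bpf(2) syscall (error handling trimmed to the terminating -ENOENT):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

int main(void)
{
	union bpf_attr attr;
	__u32 id = 0;

	memset(&attr, 0, sizeof(attr));
	/* Each call fills next_id with the ID after start_id; the kernel
	 * returns -ENOENT once the IDR is exhausted. */
	for (;;) {
		attr.start_id = id;
		if (syscall(__NR_bpf, BPF_BTF_GET_NEXT_ID, &attr,
			    sizeof(attr)))
			break;
		id = attr.next_id;
		printf("btf id: %u\n", id);
	}
	return 0;
}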
From 8050a395112db5bedb7caa6cb673e711a3c04cd9 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 21 Aug 2019 00:08:58 +0100 Subject: bpf: fix 'struct pt_reg' typo in documentation There is no 'struct pt_reg'. Signed-off-by: Peter Wu Reviewed-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8aa6126f0b6e..267544e140be 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1018,7 +1018,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1080,7 +1080,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags) + * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1729,7 +1729,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_override_return(struct pt_reg *regs, u64 rc) + * int bpf_override_return(struct pt_regs *regs, u64 rc) * Description * Used for error injection, this helper uses kprobes to override * the return value of the probed function, and to set it to *rc*. -- cgit
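For context on the corrected signatures, a hedged BPF-side sketch of bpf_perf_event_output() from a kprobe, where ctx really is a struct pt_regs pointer. The map definition assumes libbpf's BTF-defined map syntax; the event struct and probe target are illustrative.

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>

struct event {
	__u32 pid;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(__u32));
} events SEC(".maps");

SEC("kprobe/do_sys_open")
int probe(struct pt_regs *ctx)
{
	struct event e = { .pid = bpf_get_current_pid_tgid() >> 32 };

	/* Emit one record to the perf buffer of the current CPU. */
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
			      &e, sizeof(e));
	return 0;
}

char LICENSE[] SEC("license") = "GPL";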
Signed-off-by: Peter Wu Reviewed-by: Quentin Monnet Signed-off-by: Alexei Starovoitov -- cgit From 55c33dfbeb831eb3ab7cc1a3e295b0d4d57f23a3 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 21 Aug 2019 00:08:59 +0100 Subject: bpf: clarify when bpf_trace_printk discards lines I opened /sys/kernel/tracing/trace once and kept reading from it. bpf_trace_printk somehow did not seem to work: no entries were appended to that trace file. It turns out that tracing is disabled while that file is open. Save the next person some time and document this. The trace file is described in Documentation/trace/ftrace.rst; however, the implication "tracing is disabled" did not immediately translate to "bpf_trace_printk silently discards entries". Signed-off-by: Peter Wu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 267544e140be..b5889257cc33 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -580,6 +580,8 @@ union bpf_attr { * limited to five). * * Each time the helper is called, it appends a line to the trace. + * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is + * open; use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. * The format of the trace is customizable, and the exact output * one will get depends on the options set in * *\/sys/kernel/debug/tracing/trace_options* (see also the -- cgit
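A minimal plain-C sketch of consuming bpf_trace_printk output the way the updated documentation recommends, by reading trace_pipe instead of holding trace open:

#include <stdio.h>

int main(void)
{
	/* Unlike trace, trace_pipe does not disable tracing while open,
	 * so bpf_trace_printk lines are not discarded. */
	FILE *f = fopen("/sys/kernel/debug/tracing/trace_pipe", "r");
	char line[4096];

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}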
From 10d274e880eb208ec6a76261a9f8f8155020f771 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 22 Aug 2019 22:52:12 -0700 Subject: bpf: introduce verifier internal test flag Introduce the BPF_F_TEST_STATE_FREQ flag to stress test the verifier's parentage chain and state pruning. Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + include/uapi/linux/bpf.h | 3 +++ kernel/bpf/syscall.c | 1 + kernel/bpf/verifier.c | 5 ++++- 4 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5fe99f322b1c..26a6d58ca78c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -355,6 +355,7 @@ struct bpf_verifier_env { struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ + bool test_state_freq; /* test verifier with different pruning frequency */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b5889257cc33..5d2fb183ee2d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -285,6 +285,9 @@ enum bpf_attach_type { */ #define BPF_F_TEST_RND_HI32 (1U << 2) +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_STATE_FREQ (1U << 3) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c0f62fd67c6b..ca60eafa6922 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1629,6 +1629,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | + BPF_F_TEST_STATE_FREQ | BPF_F_TEST_RND_HI32)) return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 10c0ff93f52b..f340cfe53c6e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7222,7 +7222,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - bool add_new_state = false; + bool add_new_state = env->test_state_freq ? true : false; cur->last_insn_idx = env->prev_insn_idx; if (!env->insn_aux_data[insn_idx].prune_point) @@ -9262,6 +9262,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->allow_ptr_leaks = is_priv; + if (is_priv) + env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; -- cgit
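A hedged sketch of passing the flag at load time via the raw bpf(2) syscall, using a trivial two-instruction program; a 64-bit userland and a privileged caller are assumed, since the verifier only honors the flag for is_priv:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

int main(void)
{
	/* r0 = 0; exit -- the smallest accepted program. */
	struct bpf_insn insns[2] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0 },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
	attr.insns = (unsigned long)insns;
	attr.insn_cnt = 2;
	attr.license = (unsigned long)"GPL";
	/* Force frequent verifier state checkpoints for stress testing. */
	attr.prog_flags = BPF_F_TEST_STATE_FREQ;

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)) < 0;
}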
From c05cd3645814724bdeb32a2b4d953b12bdea5f8c Mon Sep 17 00:00:00 2001 From: Kevin Laatz Date: Tue, 27 Aug 2019 02:25:22 +0000 Subject: xsk: add support to allow unaligned chunk placement Currently, addresses are chunk size aligned. This means we are very restricted in terms of where we can place chunks within the umem. For example, if we have a chunk size of 2k, then our chunks can only be placed at 0, 2k, 4k, 6k, 8k... and so on (i.e. every 2k starting from 0). This patch introduces the ability to use unaligned chunks. With these changes, we are no longer bound to placing chunks at a 2k (or whatever your chunk size is) interval. Since we are no longer dealing with aligned chunks, they can now cross page boundaries. Checks for page contiguity have been added in order to keep track of which pages are followed by a physically contiguous page. Signed-off-by: Kevin Laatz Signed-off-by: Ciara Loftus Signed-off-by: Bruce Richardson Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 75 ++++++++++++++++++++++++++++++++++-- include/uapi/linux/if_xdp.h | 9 +++++ net/xdp/xdp_umem.c | 19 ++++++--- net/xdp/xsk.c | 94 ++++++++++++++++++++++++++++++++++++--------- net/xdp/xsk_diag.c | 2 +- net/xdp/xsk_queue.h | 70 +++++++++++++++++++++++++++++---- 6 files changed, 233 insertions(+), 36 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index f023b9940d64..c9398ce7960f 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -16,6 +16,13 @@ struct net_device; struct xsk_queue; +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT) + struct xdp_umem_page { void *addr; dma_addr_t dma; @@ -27,8 +34,12 @@ struct xdp_umem_fq_reuse { u64 handles[]; }; -/* Flags for the umem flags field. */ -#define XDP_UMEM_USES_NEED_WAKEUP (1 << 0) +/* Flags for the umem flags field. + * + * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public + * flags. See include/uapi/linux/if_xdp.h. + */ +#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1) struct xdp_umem { struct xsk_queue *fq; @@ -124,14 +135,36 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, int xsk_map_inc(struct xsk_map *map); void xsk_map_put(struct xsk_map *map); +static inline u64 xsk_umem_extract_addr(u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static inline u64 xsk_umem_extract_offset(u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static inline u64 xsk_umem_add_offset_to_addr(u64 addr) +{ + return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr); +} + static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { - return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); + unsigned long page_addr; + + addr = xsk_umem_add_offset_to_addr(addr); + page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; + + return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); } static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) { - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); + addr = xsk_umem_add_offset_to_addr(addr); + + return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); } /* Reuse-queue aware version of FILL queue helpers */ @@ -172,6 +205,19 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) rq->handles[rq->length++] = addr; } + +/* Handle the offset appropriately depending on aligned or unaligned mode. + * For unaligned mode, we store the offset in the upper 16-bits of the address. + * For aligned mode, we simply add the offset to the address.
+ */ +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, + u64 offset) +{ + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) + return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); + else + return address + offset; +} #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -241,6 +287,21 @@ static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, return NULL; } +static inline u64 xsk_umem_extract_addr(u64 addr) +{ + return 0; +} + +static inline u64 xsk_umem_extract_offset(u64 addr) +{ + return 0; +} + +static inline u64 xsk_umem_add_offset_to_addr(u64 addr) +{ + return 0; +} + static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { return NULL; @@ -290,6 +351,12 @@ static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) return false; } +static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, + u64 offset) +{ + return 0; +} + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 62b80d57b72a..be328c59389d 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -26,6 +26,9 @@ */ #define XDP_USE_NEED_WAKEUP (1 << 3) +/* Flags for xsk_umem_config flags */ +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) + struct sockaddr_xdp { __u16 sxdp_family; __u16 sxdp_flags; @@ -66,6 +69,7 @@ struct xdp_umem_reg { __u64 len; /* Length of packet data area */ __u32 chunk_size; __u32 headroom; + __u32 flags; }; struct xdp_statistics { @@ -87,6 +91,11 @@ struct xdp_options { #define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL +/* Masks for unaligned chunks mode */ +#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48 +#define XSK_UNALIGNED_BUF_ADDR_MASK \ + ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) + /* Rx/Tx descriptor */ struct xdp_desc { __u64 addr; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 2d65779282a1..e997b263a0dd 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -340,6 +340,7 @@ static int xdp_umem_account_pages(struct xdp_umem *umem) static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { + bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; u32 chunk_size = mr->chunk_size, headroom = mr->headroom; unsigned int chunks, chunks_per_page; u64 addr = mr->addr, size = mr->len; @@ -355,7 +356,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; } - if (!is_power_of_2(chunk_size)) + if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG | + XDP_UMEM_USES_NEED_WAKEUP)) + return -EINVAL; + + if (!unaligned_chunks && !is_power_of_2(chunk_size)) return -EINVAL; if (!PAGE_ALIGNED(addr)) { @@ -372,9 +377,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (chunks == 0) return -EINVAL; - chunks_per_page = PAGE_SIZE / chunk_size; - if (chunks < chunks_per_page || chunks % chunks_per_page) - return -EINVAL; + if (!unaligned_chunks) { + chunks_per_page = PAGE_SIZE / chunk_size; + if (chunks < chunks_per_page || chunks % chunks_per_page) + return -EINVAL; + } headroom = ALIGN(headroom, 64); @@ -383,13 +390,15 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) return -EINVAL; umem->address = (unsigned long)addr; - umem->chunk_mask = ~((u64)chunk_size - 1); + umem->chunk_mask = unaligned_chunks ? 
XSK_UNALIGNED_BUF_ADDR_MASK + : ~((u64)chunk_size - 1); umem->size = size; umem->headroom = headroom; umem->chunk_size_nohr = chunk_size - headroom; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; + umem->flags = mr->flags; INIT_LIST_HEAD(&umem->xsk_list); spin_lock_init(&umem->xsk_list_lock); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ee4428a892fa..187fd157fcff 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL(xsk_umem_has_addrs); u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { - return xskq_peek_addr(umem->fq, addr); + return xskq_peek_addr(umem->fq, addr, umem); } EXPORT_SYMBOL(xsk_umem_peek_addr); @@ -115,21 +115,43 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); +/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for + * each page. This is only required in copy mode. + */ +static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, + u32 len, u32 metalen) +{ + void *to_buf = xdp_umem_get_data(umem, addr); + + addr = xsk_umem_add_offset_to_addr(addr); + if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) { + void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; + u64 page_start = addr & ~(PAGE_SIZE - 1); + u64 first_len = PAGE_SIZE - (addr - page_start); + + memcpy(to_buf, from_buf, first_len + metalen); + memcpy(next_pg_addr, from_buf + first_len, len - first_len); + + return; + } + + memcpy(to_buf, from_buf, len + metalen); +} + static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *to_buf, *from_buf; + u64 offset = xs->umem->headroom; + u64 addr, memcpy_addr; + void *from_buf; u32 metalen; - u64 addr; int err; - if (!xskq_peek_addr(xs->umem->fq, &addr) || + if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { xs->rx_dropped++; return -ENOSPC; } - addr += xs->umem->headroom; - if (unlikely(xdp_data_meta_unsupported(xdp))) { from_buf = xdp->data; metalen = 0; @@ -138,9 +160,11 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) metalen = xdp->data - xdp->data_meta; } - to_buf = xdp_umem_get_data(xs->umem, addr); - memcpy(to_buf, from_buf, len + metalen); - addr += metalen; + memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); + __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); + + offset += metalen; + addr = xsk_umem_adjust_offset(xs->umem, addr, offset); err = xskq_produce_batch_desc(xs->rx, addr, len); if (!err) { xskq_discard_addr(xs->umem->fq); @@ -185,6 +209,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 metalen = xdp->data - xdp->data_meta; u32 len = xdp->data_end - xdp->data; + u64 offset = xs->umem->headroom; void *buffer; u64 addr; int err; @@ -196,17 +221,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) goto out_unlock; } - if (!xskq_peek_addr(xs->umem->fq, &addr) || + if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) || len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { err = -ENOSPC; goto out_drop; } - addr += xs->umem->headroom; - + addr = xsk_umem_adjust_offset(xs->umem, addr, offset); buffer = xdp_umem_get_data(xs->umem, addr); memcpy(buffer, xdp->data_meta, len + metalen); - addr += metalen; + + addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); err = xskq_produce_batch_desc(xs->rx, addr, len); if (err) goto out_drop; @@ -250,7 +275,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) 
rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_list, list) { - if (!xskq_peek_desc(xs->tx, desc)) + if (!xskq_peek_desc(xs->tx, desc, umem)) continue; if (xskq_produce_addr_lazy(umem->cq, desc->addr)) @@ -304,7 +329,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; - while (xskq_peek_desc(xs->tx, &desc)) { + while (xskq_peek_desc(xs->tx, &desc, xs->umem)) { char *buffer; u64 addr; u32 len; @@ -333,7 +358,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, skb->dev = xs->dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)addr; + skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); @@ -526,6 +551,24 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd) return sock; } +/* Check if umem pages are contiguous. + * If zero-copy mode, use the DMA address to do the page contiguity check + * For all other modes we use addr (kernel virtual address) + * Store the result in the low bits of addr. + */ +static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags) +{ + struct xdp_umem_page *pgs = umem->pages; + int i, is_contig; + + for (i = 0; i < umem->npgs - 1; i++) { + is_contig = (flags & XDP_ZEROCOPY) ? + (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) : + (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr); + pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT; + } +} + static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; @@ -616,6 +659,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) goto out_unlock; + + xsk_check_page_contiguity(xs->umem, flags); } xs->dev = dev; @@ -636,6 +681,13 @@ out_release: return err; } +struct xdp_umem_reg_v1 { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; +}; + static int xsk_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -673,10 +725,16 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, } case XDP_UMEM_REG: { - struct xdp_umem_reg mr; + size_t mr_size = sizeof(struct xdp_umem_reg); + struct xdp_umem_reg mr = {}; struct xdp_umem *umem; - if (copy_from_user(&mr, optval, sizeof(mr))) + if (optlen < sizeof(struct xdp_umem_reg_v1)) + return -EINVAL; + else if (optlen < sizeof(mr)) + mr_size = sizeof(struct xdp_umem_reg_v1); + + if (copy_from_user(&mr, optval, mr_size)) return -EFAULT; mutex_lock(&xs->mutex); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index d5e06c8e0cbf..9986a759fe06 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.id = umem->id; du.size = umem->size; du.num_pages = umem->npgs; - du.chunk_size = (__u32)(~umem->chunk_mask + 1); + du.chunk_size = umem->chunk_size_nohr + umem->headroom; du.headroom = umem->headroom; du.ifindex = umem->dev ? 
umem->dev->ifindex : 0; du.queue_id = umem->queue_id; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index dd9e985c2461..eddae4688862 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -134,6 +134,17 @@ static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt) /* UMEM queue */ +static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr, + u64 length) +{ + bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; + bool next_pg_contig = + (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr & + XSK_NEXT_PG_CONTIG_MASK; + + return cross_pg && !next_pg_contig; +} + static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) { if (addr >= q->size) { @@ -144,23 +155,51 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr) return true; } -static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr) +static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr, + u64 length, + struct xdp_umem *umem) +{ + u64 base_addr = xsk_umem_extract_addr(addr); + + addr = xsk_umem_add_offset_to_addr(addr); + if (base_addr >= q->size || addr >= q->size || + xskq_crosses_non_contig_pg(umem, addr, length)) { + q->invalid_descs++; + return false; + } + + return true; +} + +static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { while (q->cons_tail != q->cons_head) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask; + + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { + if (xskq_is_valid_addr_unaligned(q, *addr, + umem->chunk_size_nohr, + umem)) + return addr; + goto out; + } + if (xskq_is_valid_addr(q, *addr)) return addr; +out: q->cons_tail++; } return NULL; } -static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr) +static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr, + struct xdp_umem *umem) { if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ @@ -171,7 +210,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr) smp_rmb(); } - return xskq_validate_addr(q, addr); + return xskq_validate_addr(q, addr, umem); } static inline void xskq_discard_addr(struct xsk_queue *q) @@ -230,8 +269,21 @@ static inline int xskq_reserve_addr(struct xsk_queue *q) /* Rx/Tx queue */ -static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) +static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, + struct xdp_umem *umem) { + if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { + if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem)) + return false; + + if (d->len > umem->chunk_size_nohr || d->options) { + q->invalid_descs++; + return false; + } + + return true; + } + if (!xskq_is_valid_addr(q, d->addr)) return false; @@ -245,14 +297,15 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) } static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, - struct xdp_desc *desc) + struct xdp_desc *desc, + struct xdp_umem *umem) { while (q->cons_tail != q->cons_head) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; unsigned int idx = q->cons_tail & q->ring_mask; *desc = READ_ONCE(ring->desc[idx]); - if (xskq_is_valid_desc(q, desc)) + if (xskq_is_valid_desc(q, desc, umem)) return desc; q->cons_tail++; @@ -262,7 +315,8 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, } static inline struct xdp_desc 
*xskq_peek_desc(struct xsk_queue *q, - struct xdp_desc *desc) + struct xdp_desc *desc, + struct xdp_umem *umem) { if (q->cons_tail == q->cons_head) { smp_mb(); /* D, matches A */ @@ -273,7 +327,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, smp_rmb(); /* C, matches B */ } - return xskq_validate_desc(q, desc); + return xskq_validate_desc(q, desc, umem); } static inline void xskq_discard_desc(struct xsk_queue *q) -- cgit
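To close, a hedged user-space sketch of the unaligned-chunk address encoding introduced by the last patch above. It mirrors the new kernel helpers xsk_umem_extract_addr(), xsk_umem_extract_offset() and xsk_umem_add_offset_to_addr() using only the uapi masks from if_xdp.h; the descriptor value is illustrative.

#include <stdio.h>
#include <linux/types.h>
#include <linux/if_xdp.h>

int main(void)
{
	/* In unaligned mode a descriptor address carries the chunk base in
	 * the low 48 bits and the offset in the upper 16 bits. */
	__u64 desc_addr = (16ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) | 0x3100;
	__u64 base = desc_addr & XSK_UNALIGNED_BUF_ADDR_MASK;
	__u64 offset = desc_addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;

	/* Absolute position within the umem, as the kernel computes it. */
	printf("base=0x%llx offset=%llu abs=0x%llx\n",
	       (unsigned long long)base, (unsigned long long)offset,
	       (unsigned long long)(base + offset));
	return 0;
}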