aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/bpf/kfuncs.rst17
-rw-r--r--include/linux/bpf.h95
-rw-r--r--include/linux/bpf_verifier.h3
-rw-r--r--include/linux/filter.h46
-rw-r--r--include/uapi/linux/bpf.h18
-rw-r--r--kernel/bpf/btf.c22
-rw-r--r--kernel/bpf/helpers.c221
-rw-r--r--kernel/bpf/verifier.c415
-rw-r--r--net/core/filter.c108
-rw-r--r--tools/include/uapi/linux/bpf.h18
-rw-r--r--tools/testing/selftests/bpf/DENYLIST.s390x2
-rw-r--r--tools/testing/selftests/bpf/bpf_kfuncs.h38
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cls_redirect.c25
-rw-r--r--tools/testing/selftests/bpf/prog_tests/dynptr.c74
-rw-r--r--tools/testing/selftests/bpf/prog_tests/l4lb_all.c2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c93
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_attach.c11
-rw-r--r--tools/testing/selftests/bpf/progs/dynptr_fail.c287
-rw-r--r--tools/testing/selftests/bpf/progs/dynptr_success.c55
-rw-r--r--tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c980
-rw-r--r--tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c487
-rw-r--r--tools/testing/selftests/bpf/progs/test_parse_tcp_hdr_opt.c119
-rw-r--r--tools/testing/selftests/bpf/progs/test_parse_tcp_hdr_opt_dynptr.c114
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_dynptr.c257
-rw-r--r--tools/testing/selftests/bpf/test_tcp_hdr_options.h1
25 files changed, 3322 insertions, 186 deletions
diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 226313747be5..9a78533d25ac 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -100,6 +100,23 @@ Hence, whenever a constant scalar argument is accepted by a kfunc which is not a
size parameter, and the value of the constant matters for program safety, __k
suffix should be used.
+2.2.2 __uninit Annotation
+--------------------
+
+This annotation is used to indicate that the argument will be treated as
+uninitialized.
+
+An example is given below::
+
+ __bpf_kfunc int bpf_dynptr_from_skb(..., struct bpf_dynptr_kern *ptr__uninit)
+ {
+ ...
+ }
+
+Here, the dynptr will be treated as an uninitialized dynptr. Without this
+annotation, the verifier will reject the program if the dynptr passed in is
+not initialized.
+
.. _BPF_kfunc_nodef:
2.3 Using an existing kernel function
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 520b238abd5a..23ec684e660d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -607,11 +607,18 @@ enum bpf_type_flag {
*/
NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS),
+ /* DYNPTR points to sk_buff */
+ DYNPTR_TYPE_SKB = BIT(15 + BPF_BASE_TYPE_BITS),
+
+ /* DYNPTR points to xdp_buff */
+ DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS),
+
__BPF_TYPE_FLAG_MAX,
__BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1,
};
-#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF)
+#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
+ | DYNPTR_TYPE_XDP)
/* Max number of base types. */
#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS)
@@ -1124,6 +1131,37 @@ static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
return bpf_func(ctx, insnsi);
}
+/* the implementation of the opaque uapi struct bpf_dynptr */
+struct bpf_dynptr_kern {
+ void *data;
+ /* Size represents the number of usable bytes of dynptr data.
+ * If for example the offset is at 4 for a local dynptr whose data is
+ * of type u64, the number of usable bytes is 4.
+ *
+ * The upper 8 bits are reserved. It is as follows:
+ * Bits 0 - 23 = size
+ * Bits 24 - 30 = dynptr type
+ * Bit 31 = whether dynptr is read-only
+ */
+ u32 size;
+ u32 offset;
+} __aligned(8);
+
+enum bpf_dynptr_type {
+ BPF_DYNPTR_TYPE_INVALID,
+ /* Points to memory that is local to the bpf program */
+ BPF_DYNPTR_TYPE_LOCAL,
+ /* Underlying data is a ringbuf record */
+ BPF_DYNPTR_TYPE_RINGBUF,
+ /* Underlying data is a sk_buff */
+ BPF_DYNPTR_TYPE_SKB,
+ /* Underlying data is a xdp_buff */
+ BPF_DYNPTR_TYPE_XDP,
+};
+
+int bpf_dynptr_check_size(u32 size);
+u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr);
+
#ifdef CONFIG_BPF_JIT
int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
@@ -2266,6 +2304,11 @@ static inline bool has_current_bpf_ctx(void)
}
void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog);
+
+void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+ enum bpf_dynptr_type type, u32 offset, u32 size);
+void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
{
@@ -2495,6 +2538,19 @@ static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog)
static inline void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
}
+
+static inline void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+ enum bpf_dynptr_type type, u32 offset, u32 size)
+{
+}
+
+static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
+{
+}
+
+static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+}
#endif /* CONFIG_BPF_SYSCALL */
void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
@@ -2801,6 +2857,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
struct bpf_insn *insn_buf,
struct bpf_prog *prog,
u32 *target_size);
+int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
+ struct bpf_dynptr_kern *ptr);
#else
static inline bool bpf_sock_common_is_valid_access(int off, int size,
enum bpf_access_type type,
@@ -2822,6 +2880,11 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
{
return 0;
}
+static inline int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
+ struct bpf_dynptr_kern *ptr)
+{
+ return -EOPNOTSUPP;
+}
#endif
#ifdef CONFIG_INET
@@ -2913,36 +2976,6 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
u32 num_args, struct bpf_bprintf_data *data);
void bpf_bprintf_cleanup(struct bpf_bprintf_data *data);
-/* the implementation of the opaque uapi struct bpf_dynptr */
-struct bpf_dynptr_kern {
- void *data;
- /* Size represents the number of usable bytes of dynptr data.
- * If for example the offset is at 4 for a local dynptr whose data is
- * of type u64, the number of usable bytes is 4.
- *
- * The upper 8 bits are reserved. It is as follows:
- * Bits 0 - 23 = size
- * Bits 24 - 30 = dynptr type
- * Bit 31 = whether dynptr is read-only
- */
- u32 size;
- u32 offset;
-} __aligned(8);
-
-enum bpf_dynptr_type {
- BPF_DYNPTR_TYPE_INVALID,
- /* Points to memory that is local to the bpf program */
- BPF_DYNPTR_TYPE_LOCAL,
- /* Underlying data is a kernel-produced ringbuf record */
- BPF_DYNPTR_TYPE_RINGBUF,
-};
-
-void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
- enum bpf_dynptr_type type, u32 offset, u32 size);
-void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
-int bpf_dynptr_check_size(u32 size);
-u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr);
-
#ifdef CONFIG_BPF_LSM
void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
void bpf_cgroup_atype_put(int cgroup_atype);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index cf1bb1cf4a7b..b26ff2a8f63b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -616,9 +616,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
enum bpf_arg_type arg_type);
int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
u32 regno, u32 mem_size);
-struct bpf_call_arg_meta;
-int process_dynptr_func(struct bpf_verifier_env *env, int regno,
- enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta);
/* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1727898f1641..efa5d4a1677e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1542,4 +1542,50 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index
return XDP_REDIRECT;
}
+#ifdef CONFIG_NET
+int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
+int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
+ u32 len, u64 flags);
+int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
+int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
+void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len);
+void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+ void *buf, unsigned long len, bool flush);
+#else /* CONFIG_NET */
+static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
+ void *to, u32 len)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset,
+ const void *from, u32 len, u64 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset,
+ void *buf, u32 len)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset,
+ void *buf, u32 len)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+{
+ return NULL;
+}
+
+static inline void *bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf,
+ unsigned long len, bool flush)
+{
+ return NULL;
+}
+#endif /* CONFIG_NET */
+
#endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 62ce1f5d1b1d..c9699304aed2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5325,11 +5325,22 @@ union bpf_attr {
* Description
* Write *len* bytes from *src* into *dst*, starting from *offset*
* into *dst*.
- * *flags* is currently unused.
+ *
+ * *flags* must be 0 except for skb-type dynptrs.
+ *
+ * For skb-type dynptrs:
+ * * All data slices of the dynptr are automatically
+ * invalidated after **bpf_dynptr_write**\ (). This is
+ * because writing may pull the skb and change the
+ * underlying packet buffer.
+ *
+ * * For *flags*, please see the flags accepted by
+ * **bpf_skb_store_bytes**\ ().
* Return
* 0 on success, -E2BIG if *offset* + *len* exceeds the length
* of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- * is a read-only dynptr or if *flags* is not 0.
+ * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
+ * other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
*
* void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
* Description
@@ -5337,6 +5348,9 @@ union bpf_attr {
*
* *len* must be a statically known value. The returned data slice
* is invalidated whenever the dynptr is invalidated.
+ *
+ * skb and xdp type dynptrs may not use bpf_dynptr_data. They should
+ * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
* Return
* Pointer to the underlying dynptr data, NULL if the dynptr is
* read-only, if the dynptr is invalid, or if the offset and length
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index fa22ec79ac0e..ef2d8969ed1f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -207,6 +207,11 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_TRACING,
BTF_KFUNC_HOOK_SYSCALL,
BTF_KFUNC_HOOK_FMODRET,
+ BTF_KFUNC_HOOK_CGROUP_SKB,
+ BTF_KFUNC_HOOK_SCHED_ACT,
+ BTF_KFUNC_HOOK_SK_SKB,
+ BTF_KFUNC_HOOK_SOCKET_FILTER,
+ BTF_KFUNC_HOOK_LWT,
BTF_KFUNC_HOOK_MAX,
};
@@ -5683,6 +5688,10 @@ again:
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
+ if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
+ return ctx_type;
+ if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ return ctx_type;
if (strcmp(ctx_tname, tname)) {
/* bpf_user_pt_regs_t is a typedef, so resolve it to
* underlying struct and check name again
@@ -7704,6 +7713,19 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_TRACING;
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ return BTF_KFUNC_HOOK_CGROUP_SKB;
+ case BPF_PROG_TYPE_SCHED_ACT:
+ return BTF_KFUNC_HOOK_SCHED_ACT;
+ case BPF_PROG_TYPE_SK_SKB:
+ return BTF_KFUNC_HOOK_SK_SKB;
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ return BTF_KFUNC_HOOK_SOCKET_FILTER;
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ return BTF_KFUNC_HOOK_LWT;
default:
return BTF_KFUNC_HOOK_MAX;
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index abdcc52f90a6..648b29e78b84 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1420,11 +1420,21 @@ static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
return ptr->size & DYNPTR_RDONLY_BIT;
}
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+ ptr->size |= DYNPTR_RDONLY_BIT;
+}
+
static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
{
ptr->size |= type << DYNPTR_TYPE_SHIFT;
}
+static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
+{
+ return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
+}
+
u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_SIZE_MASK;
@@ -1497,6 +1507,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
u32, offset, u64, flags)
{
+ enum bpf_dynptr_type type;
int err;
if (!src->data || flags)
@@ -1506,13 +1517,25 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
if (err)
return err;
- /* Source and destination may possibly overlap, hence use memmove to
- * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
- * pointing to overlapping PTR_TO_MAP_VALUE regions.
- */
- memmove(dst, src->data + src->offset + offset, len);
+ type = bpf_dynptr_get_type(src);
- return 0;
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst, src->data + src->offset + offset, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_XDP:
+ return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
}
static const struct bpf_func_proto bpf_dynptr_read_proto = {
@@ -1529,22 +1552,40 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
u32, len, u64, flags)
{
+ enum bpf_dynptr_type type;
int err;
- if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
+ if (!dst->data || bpf_dynptr_is_rdonly(dst))
return -EINVAL;
err = bpf_dynptr_check_off_len(dst, offset, len);
if (err)
return err;
- /* Source and destination may possibly overlap, hence use memmove to
- * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
- * pointing to overlapping PTR_TO_MAP_VALUE regions.
- */
- memmove(dst->data + dst->offset + offset, src, len);
+ type = bpf_dynptr_get_type(dst);
- return 0;
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ if (flags)
+ return -EINVAL;
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst->data + dst->offset + offset, src, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
+ flags);
+ case BPF_DYNPTR_TYPE_XDP:
+ if (flags)
+ return -EINVAL;
+ return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
}
static const struct bpf_func_proto bpf_dynptr_write_proto = {
@@ -1560,6 +1601,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
{
+ enum bpf_dynptr_type type;
int err;
if (!ptr->data)
@@ -1572,7 +1614,20 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
if (bpf_dynptr_is_rdonly(ptr))
return 0;
- return (unsigned long)(ptr->data + ptr->offset + offset);
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return (unsigned long)(ptr->data + ptr->offset + offset);
+ case BPF_DYNPTR_TYPE_SKB:
+ case BPF_DYNPTR_TYPE_XDP:
+ /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
+ return 0;
+ default:
+ WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
+ return 0;
+ }
}
static const struct bpf_func_proto bpf_dynptr_data_proto = {
@@ -2138,6 +2193,142 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
return p;
}
+/**
+ * bpf_dynptr_slice - Obtain a read-only pointer to the dynptr data.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If the intention is to write to the data slice, please use
+ * bpf_dynptr_slice_rdwr.
+ *
+ * The user must check that the returned pointer is not null before using it.
+ *
+ * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * @returns: NULL if the call failed (eg invalid dynptr), pointer to a read-only
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+ void *buffer, u32 buffer__szk)
+{
+ enum bpf_dynptr_type type;
+ u32 len = buffer__szk;
+ int err;
+
+ if (!ptr->data)
+ return 0;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return 0;
+
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return ptr->data + ptr->offset + offset;
+ case BPF_DYNPTR_TYPE_SKB:
+ return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+ case BPF_DYNPTR_TYPE_XDP:
+ {
+ void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
+ if (xdp_ptr)
+ return xdp_ptr;
+
+ bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
+ return buffer;
+ }
+ default:
+ WARN_ONCE(true, "unknown dynptr type %d\n", type);
+ return 0;
+ }
+}
+
+/**
+ * bpf_dynptr_slice_rdwr - Obtain a writable pointer to the dynptr data.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * The returned pointer is writable and may point to either directly the dynptr
+ * data at the requested offset or to the buffer if unable to obtain a direct
+ * data pointer to (example: the requested slice is to the paged area of an skb
+ * packet). In the case where the returned pointer is to the buffer, the user
+ * is responsible for persisting writes through calling bpf_dynptr_write(). This
+ * usually looks something like this pattern:
+ *
+ * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
+ * if (!eth)
+ * return TC_ACT_SHOT;
+ *
+ * // mutate eth header //
+ *
+ * if (eth == buffer)
+ * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
+ *
+ * Please note that, as in the example above, the user must check that the
+ * returned pointer is not null before using it.
+ *
+ * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * @returns: NULL if the call failed (eg invalid dynptr), pointer to a
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+ void *buffer, u32 buffer__szk)
+{
+ if (!ptr->data || bpf_dynptr_is_rdonly(ptr))
+ return 0;
+
+ /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
+ *
+ * For skb-type dynptrs, it is safe to write into the returned pointer
+ * if the bpf program allows skb data writes. There are two possiblities
+ * that may occur when calling bpf_dynptr_slice_rdwr:
+ *
+ * 1) The requested slice is in the head of the skb. In this case, the
+ * returned pointer is directly to skb data, and if the skb is cloned, the
+ * verifier will have uncloned it (see bpf_unclone_prologue()) already.
+ * The pointer can be directly written into.
+ *
+ * 2) Some portion of the requested slice is in the paged buffer area.
+ * In this case, the requested data will be copied out into the buffer
+ * and the returned pointer will be a pointer to the buffer. The skb
+ * will not be pulled. To persist the write, the user will need to call
+ * bpf_dynptr_write(), which will pull the skb and commit the write.
+ *
+ * Similarly for xdp programs, if the requested slice is not across xdp
+ * fragments, then a direct pointer will be returned, otherwise the data
+ * will be copied out into the buffer and the user will need to call
+ * bpf_dynptr_write() to commit changes.
+ */
+ return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk);
+}
+
__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
{
return obj;
@@ -2207,6 +2398,8 @@ BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
BTF_ID_FLAGS(func, bpf_rdonly_cast)
BTF_ID_FLAGS(func, bpf_rcu_read_lock)
BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
BTF_SET8_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5cb8b623f639..a856896e835a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -268,7 +268,6 @@ struct bpf_call_arg_meta {
u32 ret_btf_id;
u32 subprogno;
struct btf_field *kptr_field;
- u8 uninit_dynptr_regno;
};
struct btf *btf_vmlinux;
@@ -751,11 +750,31 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
return BPF_DYNPTR_TYPE_LOCAL;
case DYNPTR_TYPE_RINGBUF:
return BPF_DYNPTR_TYPE_RINGBUF;
+ case DYNPTR_TYPE_SKB:
+ return BPF_DYNPTR_TYPE_SKB;
+ case DYNPTR_TYPE_XDP:
+ return BPF_DYNPTR_TYPE_XDP;
default:
return BPF_DYNPTR_TYPE_INVALID;
}
}
+static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return DYNPTR_TYPE_LOCAL;
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return DYNPTR_TYPE_RINGBUF;
+ case BPF_DYNPTR_TYPE_SKB:
+ return DYNPTR_TYPE_SKB;
+ case BPF_DYNPTR_TYPE_XDP:
+ return DYNPTR_TYPE_XDP;
+ default:
+ return 0;
+ }
+}
+
static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
{
return type == BPF_DYNPTR_TYPE_RINGBUF;
@@ -959,39 +978,49 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
return 0;
}
-static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- int spi)
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
+ int spi;
+
if (reg->type == CONST_PTR_TO_DYNPTR)
return false;
- /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
- * will do check_mem_access to check and update stack bounds later, so
- * return true for that case.
+ spi = dynptr_get_spi(env, reg);
+
+ /* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an
+ * error because this just means the stack state hasn't been updated yet.
+ * We will do check_mem_access to check and update stack bounds later.
*/
- if (spi < 0)
- return spi == -ERANGE;
- /* We allow overwriting existing unreferenced STACK_DYNPTR slots, see
- * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to
- * ensure dynptr objects at the slots we are touching are completely
- * destructed before we reinitialize them for a new one. For referenced
- * ones, destroy_if_dynptr_stack_slot returns an error early instead of
- * delaying it until the end where the user will get "Unreleased
+ if (spi < 0 && spi != -ERANGE)
+ return false;
+
+ /* We don't need to check if the stack slots are marked by previous
+ * dynptr initializations because we allow overwriting existing unreferenced
+ * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls
+ * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are
+ * touching are completely destructed before we reinitialize them for a new
+ * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early
+ * instead of delaying it until the end where the user will get "Unreleased
* reference" error.
*/
return true;
}
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- int spi)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
- int i;
+ int i, spi;
- /* This already represents first slot of initialized bpf_dynptr */
+ /* This already represents first slot of initialized bpf_dynptr.
+ *
+ * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+ * check_func_arg_reg_off's logic, so we don't need to check its
+ * offset and alignment.
+ */
if (reg->type == CONST_PTR_TO_DYNPTR)
return true;
+ spi = dynptr_get_spi(env, reg);
if (spi < 0)
return false;
if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
@@ -1668,6 +1697,12 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
reg->type == PTR_TO_PACKET_END;
}
+static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
+{
+ return base_type(reg->type) == PTR_TO_MEM &&
+ (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP);
+}
+
/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
enum bpf_reg_type which)
@@ -6215,11 +6250,11 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
* Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
* type, and declare it as 'const struct bpf_dynptr *' in their prototype.
*/
-int process_dynptr_func(struct bpf_verifier_env *env, int regno,
- enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
+static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
+ enum bpf_arg_type arg_type)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- int spi = 0;
+ int err;
/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
* ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
@@ -6228,15 +6263,6 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
return -EFAULT;
}
- /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
- * check_func_arg_reg_off's logic. We only need to check offset
- * and its alignment for PTR_TO_STACK.
- */
- if (reg->type == PTR_TO_STACK) {
- spi = dynptr_get_spi(env, reg);
- if (spi < 0 && spi != -ERANGE)
- return spi;
- }
/* MEM_UNINIT - Points to memory that is an appropriate candidate for
* constructing a mutable bpf_dynptr object.
@@ -6254,30 +6280,30 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
* to.
*/
if (arg_type & MEM_UNINIT) {
- if (!is_dynptr_reg_valid_uninit(env, reg, spi)) {
+ int i;
+
+ if (!is_dynptr_reg_valid_uninit(env, reg)) {
verbose(env, "Dynptr has to be an uninitialized dynptr\n");
return -EINVAL;
}
- /* We only support one dynptr being uninitialized at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- if (meta->uninit_dynptr_regno) {
- verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
- return -EFAULT;
+ /* we write BPF_DW bits (8 bytes) at a time */
+ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false);
+ if (err)
+ return err;
}
- meta->uninit_dynptr_regno = regno;
+ err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx);
} else /* MEM_RDONLY and None case from above */ {
- int err;
-
/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
return -EINVAL;
}
- if (!is_dynptr_reg_valid_init(env, reg, spi)) {
+ if (!is_dynptr_reg_valid_init(env, reg)) {
verbose(env,
"Expected an initialized dynptr as arg #%d\n",
regno);
@@ -6295,6 +6321,12 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
case DYNPTR_TYPE_RINGBUF:
err_extra = "ringbuf";
break;
+ case DYNPTR_TYPE_SKB:
+ err_extra = "skb ";
+ break;
+ case DYNPTR_TYPE_XDP:
+ err_extra = "xdp ";
+ break;
default:
err_extra = "<unknown>";
break;
@@ -6306,10 +6338,8 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
}
err = mark_dynptr_read(env, reg);
- if (err)
- return err;
}
- return 0;
+ return err;
}
static bool arg_type_is_mem_size(enum bpf_arg_type type)
@@ -6691,6 +6721,28 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
}
}
+static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
+ const struct bpf_func_proto *fn,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_reg_state *state = NULL;
+ int i;
+
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ if (arg_type_is_dynptr(fn->arg_type[i])) {
+ if (state) {
+ verbose(env, "verifier internal error: multiple dynptr args\n");
+ return NULL;
+ }
+ state = &regs[BPF_REG_1 + i];
+ }
+
+ if (!state)
+ verbose(env, "verifier internal error: no dynptr arg found\n");
+
+ return state;
+}
+
static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
@@ -6717,9 +6769,28 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state
return state->stack[spi].spilled_ptr.ref_obj_id;
}
+static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->dynptr.type;
+
+ spi = __get_spi(reg->off);
+ if (spi < 0) {
+ verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
+
+ return state->stack[spi].spilled_ptr.dynptr.type;
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
- const struct bpf_func_proto *fn)
+ const struct bpf_func_proto *fn,
+ int insn_idx)
{
u32 regno = BPF_REG_1 + arg;
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
@@ -6932,7 +7003,7 @@ skip_type_check:
err = check_mem_size_reg(env, reg, regno, true, meta);
break;
case ARG_PTR_TO_DYNPTR:
- err = process_dynptr_func(env, regno, arg_type, meta);
+ err = process_dynptr_func(env, regno, insn_idx, arg_type);
if (err)
return err;
break;
@@ -7380,6 +7451,9 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
* are now invalid, so turn them into unknown SCALAR_VALUE.
+ *
+ * This also applies to dynptr slices belonging to skb and xdp dynptrs,
+ * since these slices point to packet data.
*/
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
@@ -7387,7 +7461,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
struct bpf_reg_state *reg;
bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
- if (reg_is_pkt_pointer_any(reg))
+ if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg))
mark_reg_invalid(env, reg);
}));
}
@@ -8218,7 +8292,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
- err = check_func_arg(env, i, &meta, fn);
+ err = check_func_arg(env, i, &meta, fn, insn_idx);
if (err)
return err;
}
@@ -8243,30 +8317,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs = cur_regs(env);
- /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
- * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr
- * is safe to do directly.
- */
- if (meta.uninit_dynptr_regno) {
- if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) {
- verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n");
- return -EFAULT;
- }
- /* we write BPF_DW bits (8 bytes) at a time */
- for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
- err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno,
- i, BPF_DW, BPF_WRITE, -1, false);
- if (err)
- return err;
- }
-
- err = mark_stack_slots_dynptr(env, &regs[meta.uninit_dynptr_regno],
- fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1],
- insn_idx);
- if (err)
- return err;
- }
-
if (meta.release_regno) {
err = -EINVAL;
/* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
@@ -8351,43 +8401,62 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
break;
case BPF_FUNC_dynptr_data:
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
- if (arg_type_is_dynptr(fn->arg_type[i])) {
- struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
- int id, ref_obj_id;
+ {
+ struct bpf_reg_state *reg;
+ int id, ref_obj_id;
- if (meta.dynptr_id) {
- verbose(env, "verifier internal error: meta.dynptr_id already set\n");
- return -EFAULT;
- }
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
+ return -EFAULT;
- if (meta.ref_obj_id) {
- verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
- return -EFAULT;
- }
- id = dynptr_id(env, reg);
- if (id < 0) {
- verbose(env, "verifier internal error: failed to obtain dynptr id\n");
- return id;
- }
+ if (meta.dynptr_id) {
+ verbose(env, "verifier internal error: meta.dynptr_id already set\n");
+ return -EFAULT;
+ }
+ if (meta.ref_obj_id) {
+ verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
+ return -EFAULT;
+ }
- ref_obj_id = dynptr_ref_obj_id(env, reg);
- if (ref_obj_id < 0) {
- verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
- return ref_obj_id;
- }
+ id = dynptr_id(env, reg);
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
- meta.dynptr_id = id;
- meta.ref_obj_id = ref_obj_id;
- break;
- }
+ ref_obj_id = dynptr_ref_obj_id(env, reg);
+ if (ref_obj_id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
+ return ref_obj_id;
}
- if (i == MAX_BPF_FUNC_REG_ARGS) {
- verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n");
+
+ meta.dynptr_id = id;
+ meta.ref_obj_id = ref_obj_id;
+
+ break;
+ }
+ case BPF_FUNC_dynptr_write:
+ {
+ enum bpf_dynptr_type dynptr_type;
+ struct bpf_reg_state *reg;
+
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
return -EFAULT;
- }
+
+ dynptr_type = dynptr_get_type(env, reg);
+ if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
+ return -EFAULT;
+
+ if (dynptr_type == BPF_DYNPTR_TYPE_SKB)
+ /* this will trigger clear_all_pkt_pointers(), which will
+ * invalidate all dynptr slices associated with the skb
+ */
+ changes_data = true;
+
break;
+ }
case BPF_FUNC_user_ringbuf_drain:
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_user_ringbuf_callback_state);
@@ -8644,6 +8713,11 @@ struct bpf_kfunc_call_arg_meta {
struct {
struct btf_field *field;
} arg_rbtree_root;
+ struct {
+ enum bpf_dynptr_type type;
+ u32 id;
+ } initialized_dynptr;
+ u64 mem_size;
};
static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
@@ -8717,6 +8791,19 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
return __kfunc_param_match_suffix(btf, arg, "__sz");
}
+static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return __kfunc_param_match_suffix(btf, arg, "__szk");
+}
+
static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
{
return __kfunc_param_match_suffix(btf, arg, "__k");
@@ -8732,6 +8819,11 @@ static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param
return __kfunc_param_match_suffix(btf, arg, "__alloc");
}
+static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__uninit");
+}
+
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
const struct btf_param *arg,
const char *name)
@@ -8898,6 +8990,10 @@ enum special_kfunc_type {
KF_bpf_rbtree_remove,
KF_bpf_rbtree_add,
KF_bpf_rbtree_first,
+ KF_bpf_dynptr_from_skb,
+ KF_bpf_dynptr_from_xdp,
+ KF_bpf_dynptr_slice,
+ KF_bpf_dynptr_slice_rdwr,
};
BTF_SET_START(special_kfunc_set)
@@ -8912,6 +9008,10 @@ BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add)
BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -8928,6 +9028,10 @@ BTF_ID(func, bpf_rcu_read_unlock)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add)
BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -9007,7 +9111,10 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CALLBACK;
- if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))
+
+ if (argno + 1 < nargs &&
+ (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+ is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
arg_mem_size = true;
/* This is the catch all argument type of register types supported by
@@ -9475,7 +9582,8 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
&meta->arg_rbtree_root.field);
}
-static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta)
+static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ int insn_idx)
{
const char *func_name = meta->func_name, *ref_tname;
const struct btf *btf = meta->btf;
@@ -9666,16 +9774,43 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return ret;
break;
case KF_ARG_PTR_TO_DYNPTR:
+ {
+ enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
+
if (reg->type != PTR_TO_STACK &&
reg->type != CONST_PTR_TO_DYNPTR) {
verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
return -EINVAL;
}
- ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL);
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ dynptr_arg_type |= MEM_RDONLY;
+
+ if (is_kfunc_arg_uninit(btf, &args[i]))
+ dynptr_arg_type |= MEM_UNINIT;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb])
+ dynptr_arg_type |= DYNPTR_TYPE_SKB;
+ else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp])
+ dynptr_arg_type |= DYNPTR_TYPE_XDP;
+
+ ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
if (ret < 0)
return ret;
+
+ if (!(dynptr_arg_type & MEM_UNINIT)) {
+ int id = dynptr_id(env, reg);
+
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
+ meta->initialized_dynptr.id = id;
+ meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+ }
+
break;
+ }
case KF_ARG_PTR_TO_LIST_HEAD:
if (reg->type != PTR_TO_MAP_VALUE &&
reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
@@ -9769,14 +9904,33 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return ret;
break;
case KF_ARG_PTR_TO_MEM_SIZE:
- ret = check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1);
+ {
+ struct bpf_reg_state *size_reg = &regs[regno + 1];
+ const struct btf_param *size_arg = &args[i + 1];
+
+ ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
if (ret < 0) {
verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
return ret;
}
- /* Skip next '__sz' argument */
+
+ if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
+ if (meta->arg_constant.found) {
+ verbose(env, "verifier internal error: only one constant argument permitted\n");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(size_reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno + 1);
+ return -EINVAL;
+ }
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = size_reg->var_off.value;
+ }
+
+ /* Skip next '__sz' or '__szk' argument */
i++;
break;
+ }
case KF_ARG_PTR_TO_CALLBACK:
meta->subprogno = reg->subprogno;
break;
@@ -9880,7 +10034,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
/* Check the arguments */
- err = check_kfunc_args(env, &meta);
+ err = check_kfunc_args(env, &meta, insn_idx);
if (err < 0)
return err;
/* In case of release function, we get register number of refcounted
@@ -10011,6 +10165,42 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+ meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+ enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+
+ if (!meta.arg_constant.found) {
+ verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
+ return -EFAULT;
+ }
+
+ regs[BPF_REG_0].mem_size = meta.arg_constant.value;
+
+ /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+ regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+ } else {
+ /* this will set env->seen_direct_write to true */
+ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+ verbose(env, "the prog does not allow writes to packet data\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!meta.initialized_dynptr.id) {
+ verbose(env, "verifier internal error: no dynptr id\n");
+ return -EFAULT;
+ }
+ regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
+
+ /* we don't need to set BPF_REG_0's ref obj id
+ * because packet slices are not refcounted (see
+ * dynptr_type_refcounted)
+ */
} else {
verbose(env, "kernel function %s unhandled dynamic return type\n",
meta.func_name);
@@ -16345,6 +16535,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
*cnt = 1;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ bool seen_direct_write = env->seen_direct_write;
+ bool is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
+
+ if (is_rdonly)
+ insn->imm = BPF_CALL_IMM(bpf_dynptr_from_skb_rdonly);
+
+ /* restore env->seen_direct_write to its original value, since
+ * may_access_direct_pkt_data mutates it
+ */
+ env->seen_direct_write = seen_direct_write;
}
return 0;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 1d6f165923bf..8f3124e06133 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1721,6 +1721,12 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.arg5_type = ARG_ANYTHING,
};
+int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
+ u32 len, u64 flags)
+{
+ return ____bpf_skb_store_bytes(skb, offset, from, len, flags);
+}
+
BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
void *, to, u32, len)
{
@@ -1751,6 +1757,11 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.arg4_type = ARG_CONST_SIZE,
};
+int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+{
+ return ____bpf_skb_load_bytes(skb, offset, to, len);
+}
+
BPF_CALL_4(bpf_flow_dissector_load_bytes,
const struct bpf_flow_dissector *, ctx, u32, offset,
void *, to, u32, len)
@@ -3828,7 +3839,7 @@ static const struct bpf_func_proto sk_skb_change_head_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
+BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
{
return xdp_get_buff_len(xdp);
}
@@ -3883,8 +3894,8 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};
-static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
- void *buf, unsigned long len, bool flush)
+void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+ void *buf, unsigned long len, bool flush)
{
unsigned long ptr_len, ptr_off = 0;
skb_frag_t *next_frag, *end_frag;
@@ -3930,7 +3941,7 @@ static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
}
}
-static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
{
struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
u32 size = xdp->data_end - xdp->data;
@@ -3988,6 +3999,11 @@ static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
.arg4_type = ARG_CONST_SIZE,
};
+int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
+{
+ return ____bpf_xdp_load_bytes(xdp, offset, buf, len);
+}
+
BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
void *, buf, u32, len)
{
@@ -4015,6 +4031,11 @@ static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
.arg4_type = ARG_CONST_SIZE,
};
+int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
+{
+ return ____bpf_xdp_store_bytes(xdp, offset, buf, len);
+}
+
static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
{
struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
@@ -11621,3 +11642,82 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
return func;
}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags,
+ struct bpf_dynptr_kern *ptr__uninit)
+{
+ if (flags) {
+ bpf_dynptr_set_null(ptr__uninit);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr__uninit, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags,
+ struct bpf_dynptr_kern *ptr__uninit)
+{
+ if (flags) {
+ bpf_dynptr_set_null(ptr__uninit);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr__uninit, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));
+
+ return 0;
+}
+__diag_pop();
+
+int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
+ struct bpf_dynptr_kern *ptr__uninit)
+{
+ int err;
+
+ err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);
+ if (err)
+ return err;
+
+ bpf_dynptr_set_rdonly(ptr__uninit);
+
+ return 0;
+}
+
+BTF_SET8_START(bpf_kfunc_check_set_skb)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
+BTF_SET8_END(bpf_kfunc_check_set_skb)
+
+BTF_SET8_START(bpf_kfunc_check_set_xdp)
+BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
+BTF_SET8_END(bpf_kfunc_check_set_xdp)
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_skb,
+};
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_xdp,
+};
+
+static int __init bpf_kfunc_init(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
+}
+late_initcall(bpf_kfunc_init);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 62ce1f5d1b1d..c9699304aed2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5325,11 +5325,22 @@ union bpf_attr {
* Description
* Write *len* bytes from *src* into *dst*, starting from *offset*
* into *dst*.
- * *flags* is currently unused.
+ *
+ * *flags* must be 0 except for skb-type dynptrs.
+ *
+ * For skb-type dynptrs:
+ * * All data slices of the dynptr are automatically
+ * invalidated after **bpf_dynptr_write**\ (). This is
+ * because writing may pull the skb and change the
+ * underlying packet buffer.
+ *
+ * * For *flags*, please see the flags accepted by
+ * **bpf_skb_store_bytes**\ ().
* Return
* 0 on success, -E2BIG if *offset* + *len* exceeds the length
* of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- * is a read-only dynptr or if *flags* is not 0.
+ * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
+ * other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
*
* void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
* Description
@@ -5337,6 +5348,9 @@ union bpf_attr {
*
* *len* must be a statically known value. The returned data slice
* is invalidated whenever the dynptr is invalidated.
+ *
+ * skb and xdp type dynptrs may not use bpf_dynptr_data. They should
+ * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
* Return
* Pointer to the underlying dynptr data, NULL if the dynptr is
* read-only, if the dynptr is invalid, or if the offset and length
diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index b89eb87034e4..a02a085e7f32 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -4,6 +4,8 @@ bloom_filter_map # failed to find kernel BTF type ID of
bpf_cookie # failed to open_and_load program: -524 (trampoline)
bpf_loop # attaches to __x64_sys_nanosleep
cgrp_local_storage # prog_attach unexpected error: -524 (trampoline)
+dynptr/test_dynptr_skb_data
+dynptr/test_skb_readonly
fexit_sleep # fexit_skel_load fexit skeleton failed (trampoline)
get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace)
kprobe_multi_bench_attach # bpf_program__attach_kprobe_multi_opts unexpected error: -95
diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h
new file mode 100644
index 000000000000..8c993ec8ceea
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_kfuncs.h
@@ -0,0 +1,38 @@
+#ifndef __BPF_KFUNCS__
+#define __BPF_KFUNCS__
+
+/* Description
+ * Initializes an skb-type dynptr
+ * Returns
+ * Error code
+ */
+extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags,
+ struct bpf_dynptr *ptr__uninit) __ksym;
+
+/* Description
+ * Initializes an xdp-type dynptr
+ * Returns
+ * Error code
+ */
+extern int bpf_dynptr_from_xdp(struct xdp_md *xdp, __u64 flags,
+ struct bpf_dynptr *ptr__uninit) __ksym;
+
+/* Description
+ * Obtain a read-only pointer to the dynptr's data
+ * Returns
+ * Either a direct pointer to the dynptr data or a pointer to the user-provided
+ * buffer if unable to obtain a direct pointer
+ */
+extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u32 offset,
+ void *buffer, __u32 buffer__szk) __ksym;
+
+/* Description
+ * Obtain a read-write pointer to the dynptr's data
+ * Returns
+ * Either a direct pointer to the dynptr data or a pointer to the user-provided
+ * buffer if unable to obtain a direct pointer
+ */
+extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u32 offset,
+ void *buffer, __u32 buffer__szk) __ksym;
+
+#endif
diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
index 224f016b0a53..2a55f717fc07 100644
--- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
@@ -13,6 +13,7 @@
#include "progs/test_cls_redirect.h"
#include "test_cls_redirect.skel.h"
+#include "test_cls_redirect_dynptr.skel.h"
#include "test_cls_redirect_subprogs.skel.h"
#define ENCAP_IP INADDR_LOOPBACK
@@ -446,6 +447,28 @@ cleanup:
close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0]));
}
+static void test_cls_redirect_dynptr(void)
+{
+ struct test_cls_redirect_dynptr *skel;
+ int err;
+
+ skel = test_cls_redirect_dynptr__open();
+ if (!ASSERT_OK_PTR(skel, "skel_open"))
+ return;
+
+ skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
+ skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
+
+ err = test_cls_redirect_dynptr__load(skel);
+ if (!ASSERT_OK(err, "skel_load"))
+ goto cleanup;
+
+ test_cls_redirect_common(skel->progs.cls_redirect);
+
+cleanup:
+ test_cls_redirect_dynptr__destroy(skel);
+}
+
static void test_cls_redirect_inlined(void)
{
struct test_cls_redirect *skel;
@@ -496,4 +519,6 @@ void test_cls_redirect(void)
test_cls_redirect_inlined();
if (test__start_subtest("cls_redirect_subprogs"))
test_cls_redirect_subprogs();
+ if (test__start_subtest("cls_redirect_dynptr"))
+ test_cls_redirect_dynptr();
}
diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
index b99264ec0d9c..d176c34a7d2e 100644
--- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
@@ -2,20 +2,32 @@
/* Copyright (c) 2022 Facebook */
#include <test_progs.h>
+#include <network_helpers.h>
#include "dynptr_fail.skel.h"
#include "dynptr_success.skel.h"
-static const char * const success_tests[] = {
- "test_read_write",
- "test_data_slice",
- "test_ringbuf",
+enum test_setup_type {
+ SETUP_SYSCALL_SLEEP,
+ SETUP_SKB_PROG,
};
-static void verify_success(const char *prog_name)
+static struct {
+ const char *prog_name;
+ enum test_setup_type type;
+} success_tests[] = {
+ {"test_read_write", SETUP_SYSCALL_SLEEP},
+ {"test_dynptr_data", SETUP_SYSCALL_SLEEP},
+ {"test_ringbuf", SETUP_SYSCALL_SLEEP},
+ {"test_skb_readonly", SETUP_SKB_PROG},
+ {"test_dynptr_skb_data", SETUP_SKB_PROG},
+};
+
+static void verify_success(const char *prog_name, enum test_setup_type setup_type)
{
struct dynptr_success *skel;
struct bpf_program *prog;
struct bpf_link *link;
+ int err;
skel = dynptr_success__open();
if (!ASSERT_OK_PTR(skel, "dynptr_success__open"))
@@ -23,23 +35,53 @@ static void verify_success(const char *prog_name)
skel->bss->pid = getpid();
- dynptr_success__load(skel);
- if (!ASSERT_OK_PTR(skel, "dynptr_success__load"))
- goto cleanup;
-
prog = bpf_object__find_program_by_name(skel->obj, prog_name);
if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
goto cleanup;
- link = bpf_program__attach(prog);
- if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
+ bpf_program__set_autoload(prog, true);
+
+ err = dynptr_success__load(skel);
+ if (!ASSERT_OK(err, "dynptr_success__load"))
goto cleanup;
- usleep(1);
+ switch (setup_type) {
+ case SETUP_SYSCALL_SLEEP:
+ link = bpf_program__attach(prog);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
+ goto cleanup;
- ASSERT_EQ(skel->bss->err, 0, "err");
+ usleep(1);
+
+ bpf_link__destroy(link);
+ break;
+ case SETUP_SKB_PROG:
+ {
+ int prog_fd;
+ char buf[64];
+
+ LIBBPF_OPTS(bpf_test_run_opts, topts,
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .data_out = buf,
+ .data_size_out = sizeof(buf),
+ .repeat = 1,
+ );
- bpf_link__destroy(link);
+ prog_fd = bpf_program__fd(prog);
+ if (!ASSERT_GE(prog_fd, 0, "prog_fd"))
+ goto cleanup;
+
+ err = bpf_prog_test_run_opts(prog_fd, &topts);
+
+ if (!ASSERT_OK(err, "test_run"))
+ goto cleanup;
+
+ break;
+ }
+ }
+
+ ASSERT_EQ(skel->bss->err, 0, "err");
cleanup:
dynptr_success__destroy(skel);
@@ -50,10 +92,10 @@ void test_dynptr(void)
int i;
for (i = 0; i < ARRAY_SIZE(success_tests); i++) {
- if (!test__start_subtest(success_tests[i]))
+ if (!test__start_subtest(success_tests[i].prog_name))
continue;
- verify_success(success_tests[i]);
+ verify_success(success_tests[i].prog_name, success_tests[i].type);
}
RUN_TESTS(dynptr_fail);
diff --git a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
index 9c1a18573ffd..1eab286b14fe 100644
--- a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
+++ b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
@@ -93,4 +93,6 @@ void test_l4lb_all(void)
test_l4lb("test_l4lb.bpf.o");
if (test__start_subtest("l4lb_noinline"))
test_l4lb("test_l4lb_noinline.bpf.o");
+ if (test__start_subtest("l4lb_noinline_dynptr"))
+ test_l4lb("test_l4lb_noinline_dynptr.bpf.o");
}
diff --git a/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c b/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c
new file mode 100644
index 000000000000..daa952711d8f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/parse_tcp_hdr_opt.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "test_parse_tcp_hdr_opt.skel.h"
+#include "test_parse_tcp_hdr_opt_dynptr.skel.h"
+#include "test_tcp_hdr_options.h"
+
+struct test_pkt {
+ struct ipv6_packet pk6_v6;
+ u8 options[16];
+} __packed;
+
+struct test_pkt pkt = {
+ .pk6_v6.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+ .pk6_v6.iph.nexthdr = IPPROTO_TCP,
+ .pk6_v6.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+ .pk6_v6.tcp.urg_ptr = 123,
+ .pk6_v6.tcp.doff = 9, /* 16 bytes of options */
+
+ .options = {
+ TCPOPT_MSS, 4, 0x05, 0xB4, TCPOPT_NOP, TCPOPT_NOP,
+ 0, 6, 0xBB, 0xBB, 0xBB, 0xBB, TCPOPT_EOL
+ },
+};
+
+static void test_parse_opt(void)
+{
+ struct test_parse_tcp_hdr_opt *skel;
+ struct bpf_program *prog;
+ char buf[128];
+ int err;
+
+ LIBBPF_OPTS(bpf_test_run_opts, topts,
+ .data_in = &pkt,
+ .data_size_in = sizeof(pkt),
+ .data_out = buf,
+ .data_size_out = sizeof(buf),
+ .repeat = 3,
+ );
+
+ skel = test_parse_tcp_hdr_opt__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+ return;
+
+ pkt.options[6] = skel->rodata->tcp_hdr_opt_kind_tpr;
+ prog = skel->progs.xdp_ingress_v6;
+
+ err = bpf_prog_test_run_opts(bpf_program__fd(prog), &topts);
+ ASSERT_OK(err, "ipv6 test_run");
+ ASSERT_EQ(topts.retval, XDP_PASS, "ipv6 test_run retval");
+ ASSERT_EQ(skel->bss->server_id, 0xBBBBBBBB, "server id");
+
+ test_parse_tcp_hdr_opt__destroy(skel);
+}
+
+static void test_parse_opt_dynptr(void)
+{
+ struct test_parse_tcp_hdr_opt_dynptr *skel;
+ struct bpf_program *prog;
+ char buf[128];
+ int err;
+
+ LIBBPF_OPTS(bpf_test_run_opts, topts,
+ .data_in = &pkt,
+ .data_size_in = sizeof(pkt),
+ .data_out = buf,
+ .data_size_out = sizeof(buf),
+ .repeat = 3,
+ );
+
+ skel = test_parse_tcp_hdr_opt_dynptr__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+ return;
+
+ pkt.options[6] = skel->rodata->tcp_hdr_opt_kind_tpr;
+ prog = skel->progs.xdp_ingress_v6;
+
+ err = bpf_prog_test_run_opts(bpf_program__fd(prog), &topts);
+ ASSERT_OK(err, "ipv6 test_run");
+ ASSERT_EQ(topts.retval, XDP_PASS, "ipv6 test_run retval");
+ ASSERT_EQ(skel->bss->server_id, 0xBBBBBBBB, "server id");
+
+ test_parse_tcp_hdr_opt_dynptr__destroy(skel);
+}
+
+void test_parse_tcp_hdr_opt(void)
+{
+ if (test__start_subtest("parse_tcp_hdr_opt"))
+ test_parse_opt();
+ if (test__start_subtest("parse_tcp_hdr_opt_dynptr"))
+ test_parse_opt_dynptr();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
index d4cd9f873c14..fa3cac5488f5 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
@@ -4,11 +4,10 @@
#define IFINDEX_LO 1
#define XDP_FLAGS_REPLACE (1U << 4)
-void serial_test_xdp_attach(void)
+static void test_xdp_attach(const char *file)
{
__u32 duration = 0, id1, id2, id0 = 0, len;
struct bpf_object *obj1, *obj2, *obj3;
- const char *file = "./test_xdp.bpf.o";
struct bpf_prog_info info = {};
int err, fd1, fd2, fd3;
LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
@@ -85,3 +84,11 @@ out_2:
out_1:
bpf_object__close(obj1);
}
+
+void serial_test_xdp_attach(void)
+{
+ if (test__start_subtest("xdp_attach"))
+ test_xdp_attach("./test_xdp.bpf.o");
+ if (test__start_subtest("xdp_attach_dynptr"))
+ test_xdp_attach("./test_xdp_dynptr.bpf.o");
+}
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index aa5b69354b91..20ce920d891d 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -5,7 +5,9 @@
#include <string.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
+#include <linux/if_ether.h>
#include "bpf_misc.h"
+#include "bpf_kfuncs.h"
char _license[] SEC("license") = "GPL";
@@ -244,6 +246,27 @@ done:
return 0;
}
+/* A data slice can't be accessed out of bounds */
+SEC("?tc")
+__failure __msg("value is outside of the allowed memory range")
+int data_slice_out_of_bounds_skb(struct __sk_buff *skb)
+{
+ struct bpf_dynptr ptr;
+ struct ethhdr *hdr;
+ char buffer[sizeof(*hdr)] = {};
+
+ bpf_dynptr_from_skb(skb, 0, &ptr);
+
+ hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
+ if (!hdr)
+ return SK_DROP;
+
+ /* this should fail */
+ *(__u8*)(hdr + 1) = 1;
+
+ return SK_PASS;
+}
+
SEC("?raw_tp")
__failure __msg("value is outside of the allowed memory range")
int data_slice_out_of_bounds_map_value(void *ctx)
@@ -399,7 +422,6 @@ int invalid_helper2(void *ctx)
/* this should fail */
bpf_dynptr_read(read_data, sizeof(read_data), (void *)&ptr + 8, 0, 0);
-
return 0;
}
@@ -1044,6 +1066,193 @@ int dynptr_read_into_slot(void *ctx)
return 0;
}
+/* bpf_dynptr_slice()s are read-only and cannot be written to */
+SEC("?tc")
+__failure __msg("R0 cannot write into rdonly_mem")
+int skb_invalid_slice_write(struct __sk_buff *skb)
+{
+ struct bpf_dynptr ptr;
+ struct ethhdr *hdr;
+ char buffer[sizeof(*hdr)] = {};
+
+ bpf_dynptr_from_skb(skb, 0, &ptr);
+
+ hdr = bpf_dynptr_slice(&ptr, 0, buffer, sizeof(buffer));
+ if (!hdr)
+ return SK_DROP;
+
+ /* this should fail */
+ hdr->h_proto = 1;
+
+ return SK_PASS;
+}
+
+/* The read-only data slice is invalidated whenever a helper changes packet data */
+SEC("?tc")
+__failure __msg("invalid mem access 'scalar'")
+int skb_invalid_data_slice1(struct __sk_buff *skb)
+{
+ struct bpf_dynptr ptr;
+ struct ethhdr *hdr;
+ char buffer[sizeof(*hdr)] = {};
+
+ bpf_dynptr_from_skb(skb, 0, &ptr);
+
+ hdr = bpf_dynptr_slice(&ptr, 0, buffer, sizeof(buffer));
+ if (!hdr)
+ return SK_DROP;
+
+ val = hdr->h_proto;
+
+ if (bpf_skb_pull_data(skb, skb->len))
+ return SK_DROP;
+
+ /* this should fail */
+ val = hdr->h_proto;
+
+ return SK_PASS;
+}
+
+/* The read-write data slice is invalidated whenever a helper changes packet data */
+SEC("?tc")
+__failure __msg("invalid mem access 'scalar'")
+int skb_invalid_data_slice2(struct __sk_buff *skb)
+{
+ struct bpf_dynptr ptr;
+ struct ethhdr *hdr;
+ char buffer[sizeof(*hdr)] = {};
+
+ bpf_dynptr_from_skb(skb, 0, &ptr);
+
+ hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
+ if (!hdr)
+ return SK_DROP;
+
+ hdr->h_proto = 123;
+
+ if (bpf_skb_pull_data(skb, skb->len))
+ return SK_DROP;
+
+ /* this should fail */
+ hdr->h_proto = 1;
+
+ return SK_PASS;
+}
+
+/* The read-only data slice is invalidated whenever bpf_dynptr_write() is called */
+SEC("?tc")
+__failure __msg("invalid mem access 'scalar'")
+int skb_invalid_data_slice3(struct __sk_buff *skb)
+{
+ char write_data[64] = "hello there, world!!";
+ struct bpf_dynptr ptr;
+ struct ethhdr *hdr;
+ char buffer[sizeof(*hdr)] = {};
+
+ bpf_dynptr_from_skb(skb, 0, &ptr);
+
+ hdr = bpf_dynptr_slice(&ptr, 0, buffer, sizeof(buffer));
+ if (!hdr)
+ return SK_DROP;
+
+ val = hdr->h_proto;
+
+ bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
+
+ /* this should fail */
+ val = hdr->h_proto;
+
+ return SK_PASS;
+}
+
+/* The read-write data slice is invalidated whenever bpf_dynptr_write() is called */
+SEC("?tc")
+__failure __msg("invalid mem access 'scalar'")
+int skb_invalid_data_slice4(struct __sk_buff *skb)
+{
+ char write_data[64] = "hello there, world!!";
+ struct bpf_dynptr ptr;
+ struct ethhdr *hdr;
+ char buffer[sizeof(*hdr)] = {};
+
+ bpf_dynptr_from_skb(skb, 0, &ptr);
+ hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
+ if (!hdr)
+ return SK_DROP;
+
+ hdr->h_proto = 123;
+
+ bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
+
+ /* this should fail */
+ hdr->h_proto = 1;
+
+ return SK_PASS;
+}
+
+/* The read-only data slice is invalidated whenever a helper changes packet data */
+SEC("?xdp")
+__failure __msg("invalid mem access 'scalar'")
+int xdp_invalid_data_slice1(struct xdp_md *xdp)
+{
+ struct bpf_dynptr ptr;
+ struct ethhdr *hdr;
+ char buffer[sizeof(*hdr)] = {};
+
+ bpf_dynptr_from_xdp(xdp, 0, &ptr);
+ hdr = bpf_dynptr_slice(&ptr, 0, buffer, sizeof(buffer));
+ if (!hdr)
+ return SK_DROP;
+
+ val = hdr->h_proto;
+
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(*hdr)))
+ return XDP_DROP;
+
+ /* this should fail */
+ val = hdr->h_proto;
+
+ return XDP_PASS;
+}
+
+/* The read-write data slice is invalidated whenever a helper changes packet data */
+SEC("?xdp")
+__failure __msg("invalid mem access 'scalar'")
+int xdp_invalid_data_slice2(struct xdp_md *xdp)
+{
+ struct bpf_dynptr ptr;
+ struct ethhdr *hdr;
+ char buffer[sizeof(*hdr)] = {};
+
+ bpf_dynptr_from_xdp(xdp, 0, &ptr);
+ hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
+ if (!hdr)
+ return SK_DROP;
+
+ hdr->h_proto = 9;
+
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(*hdr)))
+ return XDP_DROP;
+
+ /* this should fail */
+ hdr->h_proto = 1;
+
+ return XDP_PASS;
+}
+
+/* Only supported prog type can create skb-type dynptrs */
+SEC("?raw_tp")
+__failure __msg("calling kernel function bpf_dynptr_from_skb is not allowed")
+int skb_invalid_ctx(void *ctx)
+{
+ struct bpf_dynptr ptr;
+
+ /* this should fail */
+ bpf_dynptr_from_skb(ctx, 0, &ptr);
+
+ return 0;
+}
+
/* Reject writes to dynptr slot for uninit arg */
SEC("?raw_tp")
__failure __msg("potential write to dynptr at off=-16")
@@ -1061,6 +1270,61 @@ int uninit_write_into_slot(void *ctx)
return 0;
}
+/* Only supported prog type can create xdp-type dynptrs */
+SEC("?raw_tp")
+__failure __msg("calling kernel function bpf_dynptr_from_xdp is not allowed")
+int xdp_invalid_ctx(void *ctx)
+{
+ struct bpf_dynptr ptr;
+
+ /* this should fail */
+ bpf_dynptr_from_xdp(ctx, 0, &ptr);
+
+ return 0;
+}
+
+__u32 hdr_size = sizeof(struct ethhdr);
+/* Can't pass in variable-sized len to bpf_dynptr_slice */
+SEC("?tc")
+__failure __msg("unbounded memory access")
+int dynptr_slice_var_len1(struct __sk_buff *skb)
+{
+ struct bpf_dynptr ptr;
+ struct ethhdr *hdr;
+ char buffer[sizeof(*hdr)] = {};
+
+ bpf_dynptr_from_skb(skb, 0, &ptr);
+
+ /* this should fail */
+ hdr = bpf_dynptr_slice(&ptr, 0, buffer, hdr_size);
+ if (!hdr)
+ return SK_DROP;
+
+ return SK_PASS;
+}
+
+/* Can't pass in variable-sized len to bpf_dynptr_slice */
+SEC("?tc")
+__failure __msg("must be a known constant")
+int dynptr_slice_var_len2(struct __sk_buff *skb)
+{
+ char buffer[sizeof(struct ethhdr)] = {};
+ struct bpf_dynptr ptr;
+ struct ethhdr *hdr;
+
+ bpf_dynptr_from_skb(skb, 0, &ptr);
+
+ if (hdr_size <= sizeof(buffer)) {
+ /* this should fail */
+ hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, hdr_size);
+ if (!hdr)
+ return SK_DROP;
+ hdr->h_proto = 12;
+ }
+
+ return SK_PASS;
+}
+
static int callback(__u32 index, void *data)
{
*(__u32 *)data = 123;
@@ -1092,3 +1356,24 @@ int invalid_data_slices(void *ctx)
return 0;
}
+
+/* Program types that don't allow writes to packet data should fail if
+ * bpf_dynptr_slice_rdwr is called
+ */
+SEC("cgroup_skb/ingress")
+__failure __msg("the prog does not allow writes to packet data")
+int invalid_slice_rdwr_rdonly(struct __sk_buff *skb)
+{
+ char buffer[sizeof(struct ethhdr)] = {};
+ struct bpf_dynptr ptr;
+ struct ethhdr *hdr;
+
+ bpf_dynptr_from_skb(skb, 0, &ptr);
+
+ /* this should fail since cgroup_skb doesn't allow
+ * changing packet data
+ */
+ hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index 35db7c6c1fc7..c8358a7c7924 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -5,6 +5,7 @@
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
+#include "bpf_kfuncs.h"
#include "errno.h"
char _license[] SEC("license") = "GPL";
@@ -30,7 +31,7 @@ struct {
__type(value, __u32);
} array_map SEC(".maps");
-SEC("tp/syscalls/sys_enter_nanosleep")
+SEC("?tp/syscalls/sys_enter_nanosleep")
int test_read_write(void *ctx)
{
char write_data[64] = "hello there, world!!";
@@ -61,8 +62,8 @@ int test_read_write(void *ctx)
return 0;
}
-SEC("tp/syscalls/sys_enter_nanosleep")
-int test_data_slice(void *ctx)
+SEC("?tp/syscalls/sys_enter_nanosleep")
+int test_dynptr_data(void *ctx)
{
__u32 key = 0, val = 235, *map_val;
struct bpf_dynptr ptr;
@@ -131,7 +132,7 @@ static int ringbuf_callback(__u32 index, void *data)
return 0;
}
-SEC("tp/syscalls/sys_enter_nanosleep")
+SEC("?tp/syscalls/sys_enter_nanosleep")
int test_ringbuf(void *ctx)
{
struct bpf_dynptr ptr;
@@ -163,3 +164,49 @@ done:
bpf_ringbuf_discard_dynptr(&ptr, 0);
return 0;
}
+
+SEC("?cgroup_skb/egress")
+int test_skb_readonly(struct __sk_buff *skb)
+{
+ __u8 write_data[2] = {1, 2};
+ struct bpf_dynptr ptr;
+ __u64 *data;
+ int ret;
+
+ if (bpf_dynptr_from_skb(skb, 0, &ptr)) {
+ err = 1;
+ return 1;
+ }
+
+ /* since cgroup skbs are read only, writes should fail */
+ ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
+ if (ret != -EINVAL) {
+ err = 2;
+ return 1;
+ }
+
+ return 1;
+}
+
+SEC("?cgroup_skb/egress")
+int test_dynptr_skb_data(struct __sk_buff *skb)
+{
+ __u8 write_data[2] = {1, 2};
+ struct bpf_dynptr ptr;
+ __u64 *data;
+ int ret;
+
+ if (bpf_dynptr_from_skb(skb, 0, &ptr)) {
+ err = 1;
+ return 1;
+ }
+
+ /* This should return NULL. Must use bpf_dynptr_slice API */
+ data = bpf_dynptr_data(&ptr, 0, 1);
+ if (data) {
+ err = 2;
+ return 1;
+ }
+
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c
new file mode 100644
index 000000000000..f45a7095de7a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c
@@ -0,0 +1,980 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// Copyright (c) 2019, 2020 Cloudflare
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <linux/bpf.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#include "test_cls_redirect.h"
+#include "bpf_kfuncs.h"
+
+#define offsetofend(TYPE, MEMBER) \
+ (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))
+
+#define IP_OFFSET_MASK (0x1FFF)
+#define IP_MF (0x2000)
+
+char _license[] SEC("license") = "Dual BSD/GPL";
+
+/**
+ * Destination port and IP used for UDP encapsulation.
+ */
+volatile const __be16 ENCAPSULATION_PORT;
+volatile const __be32 ENCAPSULATION_IP;
+
+typedef struct {
+ uint64_t processed_packets_total;
+ uint64_t l3_protocol_packets_total_ipv4;
+ uint64_t l3_protocol_packets_total_ipv6;
+ uint64_t l4_protocol_packets_total_tcp;
+ uint64_t l4_protocol_packets_total_udp;
+ uint64_t accepted_packets_total_syn;
+ uint64_t accepted_packets_total_syn_cookies;
+ uint64_t accepted_packets_total_last_hop;
+ uint64_t accepted_packets_total_icmp_echo_request;
+ uint64_t accepted_packets_total_established;
+ uint64_t forwarded_packets_total_gue;
+ uint64_t forwarded_packets_total_gre;
+
+ uint64_t errors_total_unknown_l3_proto;
+ uint64_t errors_total_unknown_l4_proto;
+ uint64_t errors_total_malformed_ip;
+ uint64_t errors_total_fragmented_ip;
+ uint64_t errors_total_malformed_icmp;
+ uint64_t errors_total_unwanted_icmp;
+ uint64_t errors_total_malformed_icmp_pkt_too_big;
+ uint64_t errors_total_malformed_tcp;
+ uint64_t errors_total_malformed_udp;
+ uint64_t errors_total_icmp_echo_replies;
+ uint64_t errors_total_malformed_encapsulation;
+ uint64_t errors_total_encap_adjust_failed;
+ uint64_t errors_total_encap_buffer_too_small;
+ uint64_t errors_total_redirect_loop;
+ uint64_t errors_total_encap_mtu_violate;
+} metrics_t;
+
+typedef enum {
+ INVALID = 0,
+ UNKNOWN,
+ ECHO_REQUEST,
+ SYN,
+ SYN_COOKIE,
+ ESTABLISHED,
+} verdict_t;
+
+typedef struct {
+ uint16_t src, dst;
+} flow_ports_t;
+
+_Static_assert(
+ sizeof(flow_ports_t) !=
+ offsetofend(struct bpf_sock_tuple, ipv4.dport) -
+ offsetof(struct bpf_sock_tuple, ipv4.sport) - 1,
+ "flow_ports_t must match sport and dport in struct bpf_sock_tuple");
+_Static_assert(
+ sizeof(flow_ports_t) !=
+ offsetofend(struct bpf_sock_tuple, ipv6.dport) -
+ offsetof(struct bpf_sock_tuple, ipv6.sport) - 1,
+ "flow_ports_t must match sport and dport in struct bpf_sock_tuple");
+
+struct iphdr_info {
+ void *hdr;
+ __u64 len;
+};
+
+typedef int ret_t;
+
+/* This is a bit of a hack. We need a return value which allows us to
+ * indicate that the regular flow of the program should continue,
+ * while allowing functions to use XDP_PASS and XDP_DROP, etc.
+ */
+static const ret_t CONTINUE_PROCESSING = -1;
+
+/* Convenience macro to call functions which return ret_t.
+ */
+#define MAYBE_RETURN(x) \
+ do { \
+ ret_t __ret = x; \
+ if (__ret != CONTINUE_PROCESSING) \
+ return __ret; \
+ } while (0)
+
+static bool ipv4_is_fragment(const struct iphdr *ip)
+{
+ uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
+ return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
+}
+
+static int pkt_parse_ipv4(struct bpf_dynptr *dynptr, __u64 *offset, struct iphdr *iphdr)
+{
+ if (bpf_dynptr_read(iphdr, sizeof(*iphdr), dynptr, *offset, 0))
+ return -1;
+
+ *offset += sizeof(*iphdr);
+
+ if (iphdr->ihl < 5)
+ return -1;
+
+ /* skip ipv4 options */
+ *offset += (iphdr->ihl - 5) * 4;
+
+ return 0;
+}
+
+/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
+static bool pkt_parse_icmp_l4_ports(struct bpf_dynptr *dynptr, __u64 *offset, flow_ports_t *ports)
+{
+ if (bpf_dynptr_read(ports, sizeof(*ports), dynptr, *offset, 0))
+ return false;
+
+ *offset += sizeof(*ports);
+
+ /* Ports in the L4 headers are reversed, since we are parsing an ICMP
+ * payload which is going towards the eyeball.
+ */
+ uint16_t dst = ports->src;
+ ports->src = ports->dst;
+ ports->dst = dst;
+ return true;
+}
+
+static uint16_t pkt_checksum_fold(uint32_t csum)
+{
+ /* The highest reasonable value for an IPv4 header
+ * checksum requires two folds, so we just do that always.
+ */
+ csum = (csum & 0xffff) + (csum >> 16);
+ csum = (csum & 0xffff) + (csum >> 16);
+ return (uint16_t)~csum;
+}
+
+static void pkt_ipv4_checksum(struct iphdr *iph)
+{
+ iph->check = 0;
+
+ /* An IP header without options is 20 bytes. Two of those
+ * are the checksum, which we always set to zero. Hence,
+ * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
+ * which fits in 32 bit.
+ */
+ _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
+ uint32_t acc = 0;
+ uint16_t *ipw = (uint16_t *)iph;
+
+ for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++)
+ acc += ipw[i];
+
+ iph->check = pkt_checksum_fold(acc);
+}
+
+static bool pkt_skip_ipv6_extension_headers(struct bpf_dynptr *dynptr, __u64 *offset,
+ const struct ipv6hdr *ipv6, uint8_t *upper_proto,
+ bool *is_fragment)
+{
+ /* We understand five extension headers.
+ * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
+ * headers should occur once, except Destination Options, which may
+ * occur twice. Hence we give up after 6 headers.
+ */
+ struct {
+ uint8_t next;
+ uint8_t len;
+ } exthdr = {
+ .next = ipv6->nexthdr,
+ };
+ *is_fragment = false;
+
+ for (int i = 0; i < 6; i++) {
+ switch (exthdr.next) {
+ case IPPROTO_FRAGMENT:
+ *is_fragment = true;
+ /* NB: We don't check that hdrlen == 0 as per spec. */
+ /* fallthrough; */
+
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_MH:
+ if (bpf_dynptr_read(&exthdr, sizeof(exthdr), dynptr, *offset, 0))
+ return false;
+
+ /* hdrlen is in 8-octet units, and excludes the first 8 octets. */
+ *offset += (exthdr.len + 1) * 8;
+
+ /* Decode next header */
+ break;
+
+ default:
+ /* The next header is not one of the known extension
+ * headers, treat it as the upper layer header.
+ *
+ * This handles IPPROTO_NONE.
+ *
+ * Encapsulating Security Payload (50) and Authentication
+ * Header (51) also end up here (and will trigger an
+ * unknown proto error later). They have a custom header
+ * format and seem too esoteric to care about.
+ */
+ *upper_proto = exthdr.next;
+ return true;
+ }
+ }
+
+ /* We never found an upper layer header. */
+ return false;
+}
+
+static int pkt_parse_ipv6(struct bpf_dynptr *dynptr, __u64 *offset, struct ipv6hdr *ipv6,
+ uint8_t *proto, bool *is_fragment)
+{
+ if (bpf_dynptr_read(ipv6, sizeof(*ipv6), dynptr, *offset, 0))
+ return -1;
+
+ *offset += sizeof(*ipv6);
+
+ if (!pkt_skip_ipv6_extension_headers(dynptr, offset, ipv6, proto, is_fragment))
+ return -1;
+
+ return 0;
+}
+
+/* Global metrics, per CPU
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, unsigned int);
+ __type(value, metrics_t);
+} metrics_map SEC(".maps");
+
+static metrics_t *get_global_metrics(void)
+{
+ uint64_t key = 0;
+ return bpf_map_lookup_elem(&metrics_map, &key);
+}
+
+static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
+{
+ const int payload_off =
+ sizeof(*encap) +
+ sizeof(struct in_addr) * encap->unigue.hop_count;
+ int32_t encap_overhead = payload_off - sizeof(struct ethhdr);
+
+ /* Changing the ethertype if the encapsulated packet is ipv6 */
+ if (encap->gue.proto_ctype == IPPROTO_IPV6)
+ encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
+
+ if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
+ BPF_F_ADJ_ROOM_FIXED_GSO |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
+ bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
+ return TC_ACT_SHOT;
+
+ return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
+}
+
+static ret_t forward_with_gre(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
+ encap_headers_t *encap, struct in_addr *next_hop,
+ metrics_t *metrics)
+{
+ const int payload_off =
+ sizeof(*encap) +
+ sizeof(struct in_addr) * encap->unigue.hop_count;
+ int32_t encap_overhead =
+ payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
+ int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
+ __u8 encap_buffer[sizeof(encap_gre_t)] = {};
+ uint16_t proto = ETH_P_IP;
+ uint32_t mtu_len = 0;
+ encap_gre_t *encap_gre;
+
+ metrics->forwarded_packets_total_gre++;
+
+ /* Loop protection: the inner packet's TTL is decremented as a safeguard
+ * against any forwarding loop. As the only interesting field is the TTL
+ * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
+ * as they handle the split packets if needed (no need for the data to be
+ * in the linear section).
+ */
+ if (encap->gue.proto_ctype == IPPROTO_IPV6) {
+ proto = ETH_P_IPV6;
+ uint8_t ttl;
+ int rc;
+
+ rc = bpf_skb_load_bytes(
+ skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
+ &ttl, 1);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (ttl == 0) {
+ metrics->errors_total_redirect_loop++;
+ return TC_ACT_SHOT;
+ }
+
+ ttl--;
+ rc = bpf_skb_store_bytes(
+ skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
+ &ttl, 1, 0);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+ } else {
+ uint8_t ttl;
+ int rc;
+
+ rc = bpf_skb_load_bytes(
+ skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
+ 1);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (ttl == 0) {
+ metrics->errors_total_redirect_loop++;
+ return TC_ACT_SHOT;
+ }
+
+ /* IPv4 also has a checksum to patch. While the TTL is only one byte,
+ * this function only works for 2 and 4 bytes arguments (the result is
+ * the same).
+ */
+ rc = bpf_l3_csum_replace(
+ skb, payload_off + offsetof(struct iphdr, check), ttl,
+ ttl - 1, 2);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ ttl--;
+ rc = bpf_skb_store_bytes(
+ skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
+ 0);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+ }
+
+ if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
+ metrics->errors_total_encap_mtu_violate++;
+ return TC_ACT_SHOT;
+ }
+
+ if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
+ BPF_F_ADJ_ROOM_FIXED_GSO |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
+ bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
+ metrics->errors_total_encap_adjust_failed++;
+ return TC_ACT_SHOT;
+ }
+
+ if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
+ metrics->errors_total_encap_buffer_too_small++;
+ return TC_ACT_SHOT;
+ }
+
+ encap_gre = bpf_dynptr_slice_rdwr(dynptr, 0, encap_buffer, sizeof(encap_buffer));
+ if (!encap_gre) {
+ metrics->errors_total_encap_buffer_too_small++;
+ return TC_ACT_SHOT;
+ }
+
+ encap_gre->ip.protocol = IPPROTO_GRE;
+ encap_gre->ip.daddr = next_hop->s_addr;
+ encap_gre->ip.saddr = ENCAPSULATION_IP;
+ encap_gre->ip.tot_len =
+ bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
+ encap_gre->gre.flags = 0;
+ encap_gre->gre.protocol = bpf_htons(proto);
+ pkt_ipv4_checksum((void *)&encap_gre->ip);
+
+ if (encap_gre == encap_buffer)
+ bpf_dynptr_write(dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);
+
+ return bpf_redirect(skb->ifindex, 0);
+}
+
+static ret_t forward_to_next_hop(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
+ encap_headers_t *encap, struct in_addr *next_hop,
+ metrics_t *metrics)
+{
+ /* swap L2 addresses */
+ /* This assumes that packets are received from a router.
+ * So just swapping the MAC addresses here will make the packet go back to
+ * the router, which will send it to the appropriate machine.
+ */
+ unsigned char temp[ETH_ALEN];
+ memcpy(temp, encap->eth.h_dest, sizeof(temp));
+ memcpy(encap->eth.h_dest, encap->eth.h_source,
+ sizeof(encap->eth.h_dest));
+ memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));
+
+ if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
+ encap->unigue.last_hop_gre) {
+ return forward_with_gre(skb, dynptr, encap, next_hop, metrics);
+ }
+
+ metrics->forwarded_packets_total_gue++;
+ uint32_t old_saddr = encap->ip.saddr;
+ encap->ip.saddr = encap->ip.daddr;
+ encap->ip.daddr = next_hop->s_addr;
+ if (encap->unigue.next_hop < encap->unigue.hop_count) {
+ encap->unigue.next_hop++;
+ }
+
+ /* Remove ip->saddr, add next_hop->s_addr */
+ const uint64_t off = offsetof(typeof(*encap), ip.check);
+ int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
+ if (ret < 0) {
+ return TC_ACT_SHOT;
+ }
+
+ return bpf_redirect(skb->ifindex, 0);
+}
+
+static ret_t skip_next_hops(__u64 *offset, int n)
+{
+ __u32 res;
+ switch (n) {
+ case 1:
+ *offset += sizeof(struct in_addr);
+ case 0:
+ return CONTINUE_PROCESSING;
+
+ default:
+ return TC_ACT_SHOT;
+ }
+}
+
+/* Get the next hop from the GLB header.
+ *
+ * Sets next_hop->s_addr to 0 if there are no more hops left.
+ * pkt is positioned just after the variable length GLB header
+ * iff the call is successful.
+ */
+static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_headers_t *encap,
+ struct in_addr *next_hop)
+{
+ if (encap->unigue.next_hop > encap->unigue.hop_count)
+ return TC_ACT_SHOT;
+
+ /* Skip "used" next hops. */
+ MAYBE_RETURN(skip_next_hops(offset, encap->unigue.next_hop));
+
+ if (encap->unigue.next_hop == encap->unigue.hop_count) {
+ /* No more next hops, we are at the end of the GLB header. */
+ next_hop->s_addr = 0;
+ return CONTINUE_PROCESSING;
+ }
+
+ if (bpf_dynptr_read(next_hop, sizeof(*next_hop), dynptr, *offset, 0))
+ return TC_ACT_SHOT;
+
+ *offset += sizeof(*next_hop);
+
+ /* Skip the remainig next hops (may be zero). */
+ return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1);
+}
+
+/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
+ * This is a kludge that let's us work around verifier limitations:
+ *
+ * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
+ *
+ * clang will substitue a costant for sizeof, which allows the verifier
+ * to track it's value. Based on this, it can figure out the constant
+ * return value, and calling code works while still being "generic" to
+ * IPv4 and IPv6.
+ */
+static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
+ uint64_t iphlen, uint16_t sport, uint16_t dport)
+{
+ switch (iphlen) {
+ case sizeof(struct iphdr): {
+ struct iphdr *ipv4 = (struct iphdr *)iph;
+ tuple->ipv4.daddr = ipv4->daddr;
+ tuple->ipv4.saddr = ipv4->saddr;
+ tuple->ipv4.sport = sport;
+ tuple->ipv4.dport = dport;
+ return sizeof(tuple->ipv4);
+ }
+
+ case sizeof(struct ipv6hdr): {
+ struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
+ memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
+ sizeof(tuple->ipv6.daddr));
+ memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
+ sizeof(tuple->ipv6.saddr));
+ tuple->ipv6.sport = sport;
+ tuple->ipv6.dport = dport;
+ return sizeof(tuple->ipv6);
+ }
+
+ default:
+ return 0;
+ }
+}
+
+static verdict_t classify_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple,
+ uint64_t tuplen, void *iph, struct tcphdr *tcp)
+{
+ struct bpf_sock *sk =
+ bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
+
+ if (sk == NULL)
+ return UNKNOWN;
+
+ if (sk->state != BPF_TCP_LISTEN) {
+ bpf_sk_release(sk);
+ return ESTABLISHED;
+ }
+
+ if (iph != NULL && tcp != NULL) {
+ /* Kludge: we've run out of arguments, but need the length of the ip header. */
+ uint64_t iphlen = sizeof(struct iphdr);
+
+ if (tuplen == sizeof(tuple->ipv6))
+ iphlen = sizeof(struct ipv6hdr);
+
+ if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
+ sizeof(*tcp)) == 0) {
+ bpf_sk_release(sk);
+ return SYN_COOKIE;
+ }
+ }
+
+ bpf_sk_release(sk);
+ return UNKNOWN;
+}
+
+static verdict_t classify_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, uint64_t tuplen)
+{
+ struct bpf_sock *sk =
+ bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
+
+ if (sk == NULL)
+ return UNKNOWN;
+
+ if (sk->state == BPF_TCP_ESTABLISHED) {
+ bpf_sk_release(sk);
+ return ESTABLISHED;
+ }
+
+ bpf_sk_release(sk);
+ return UNKNOWN;
+}
+
+static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, struct bpf_sock_tuple *tuple,
+ uint64_t tuplen, metrics_t *metrics)
+{
+ switch (proto) {
+ case IPPROTO_TCP:
+ return classify_tcp(skb, tuple, tuplen, NULL, NULL);
+
+ case IPPROTO_UDP:
+ return classify_udp(skb, tuple, tuplen);
+
+ default:
+ metrics->errors_total_malformed_icmp++;
+ return INVALID;
+ }
+}
+
+static verdict_t process_icmpv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, __u64 *offset,
+ metrics_t *metrics)
+{
+ struct icmphdr icmp;
+ struct iphdr ipv4;
+
+ if (bpf_dynptr_read(&icmp, sizeof(icmp), dynptr, *offset, 0)) {
+ metrics->errors_total_malformed_icmp++;
+ return INVALID;
+ }
+
+ *offset += sizeof(icmp);
+
+ /* We should never receive encapsulated echo replies. */
+ if (icmp.type == ICMP_ECHOREPLY) {
+ metrics->errors_total_icmp_echo_replies++;
+ return INVALID;
+ }
+
+ if (icmp.type == ICMP_ECHO)
+ return ECHO_REQUEST;
+
+ if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
+ metrics->errors_total_unwanted_icmp++;
+ return INVALID;
+ }
+
+ if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ /* The source address in the outer IP header is from the entity that
+ * originated the ICMP message. Use the original IP header to restore
+ * the correct flow tuple.
+ */
+ struct bpf_sock_tuple tuple;
+ tuple.ipv4.saddr = ipv4.daddr;
+ tuple.ipv4.daddr = ipv4.saddr;
+
+ if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv4.sport)) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ return classify_icmp(skb, ipv4.protocol, &tuple,
+ sizeof(tuple.ipv4), metrics);
+}
+
+static verdict_t process_icmpv6(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
+ metrics_t *metrics)
+{
+ struct bpf_sock_tuple tuple;
+ struct ipv6hdr ipv6;
+ struct icmp6hdr icmp6;
+ bool is_fragment;
+ uint8_t l4_proto;
+
+ if (bpf_dynptr_read(&icmp6, sizeof(icmp6), dynptr, *offset, 0)) {
+ metrics->errors_total_malformed_icmp++;
+ return INVALID;
+ }
+
+ /* We should never receive encapsulated echo replies. */
+ if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
+ metrics->errors_total_icmp_echo_replies++;
+ return INVALID;
+ }
+
+ if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
+ return ECHO_REQUEST;
+ }
+
+ if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
+ metrics->errors_total_unwanted_icmp++;
+ return INVALID;
+ }
+
+ if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ if (is_fragment) {
+ metrics->errors_total_fragmented_ip++;
+ return INVALID;
+ }
+
+ /* Swap source and dest addresses. */
+ memcpy(&tuple.ipv6.saddr, &ipv6.daddr, sizeof(tuple.ipv6.saddr));
+ memcpy(&tuple.ipv6.daddr, &ipv6.saddr, sizeof(tuple.ipv6.daddr));
+
+ if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv6.sport)) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ return classify_icmp(skb, l4_proto, &tuple, sizeof(tuple.ipv6),
+ metrics);
+}
+
+static verdict_t process_tcp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
+ struct iphdr_info *info, metrics_t *metrics)
+{
+ struct bpf_sock_tuple tuple;
+ struct tcphdr tcp;
+ uint64_t tuplen;
+
+ metrics->l4_protocol_packets_total_tcp++;
+
+ if (bpf_dynptr_read(&tcp, sizeof(tcp), dynptr, *offset, 0)) {
+ metrics->errors_total_malformed_tcp++;
+ return INVALID;
+ }
+
+ *offset += sizeof(tcp);
+
+ if (tcp.syn)
+ return SYN;
+
+ tuplen = fill_tuple(&tuple, info->hdr, info->len, tcp.source, tcp.dest);
+ return classify_tcp(skb, &tuple, tuplen, info->hdr, &tcp);
+}
+
+static verdict_t process_udp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
+ struct iphdr_info *info, metrics_t *metrics)
+{
+ struct bpf_sock_tuple tuple;
+ struct udphdr udph;
+ uint64_t tuplen;
+
+ metrics->l4_protocol_packets_total_udp++;
+
+ if (bpf_dynptr_read(&udph, sizeof(udph), dynptr, *offset, 0)) {
+ metrics->errors_total_malformed_udp++;
+ return INVALID;
+ }
+ *offset += sizeof(udph);
+
+ tuplen = fill_tuple(&tuple, info->hdr, info->len, udph.source, udph.dest);
+ return classify_udp(skb, &tuple, tuplen);
+}
+
+static verdict_t process_ipv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
+ __u64 *offset, metrics_t *metrics)
+{
+ struct iphdr ipv4;
+ struct iphdr_info info = {
+ .hdr = &ipv4,
+ .len = sizeof(ipv4),
+ };
+
+ metrics->l3_protocol_packets_total_ipv4++;
+
+ if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
+ metrics->errors_total_malformed_ip++;
+ return INVALID;
+ }
+
+ if (ipv4.version != 4) {
+ metrics->errors_total_malformed_ip++;
+ return INVALID;
+ }
+
+ if (ipv4_is_fragment(&ipv4)) {
+ metrics->errors_total_fragmented_ip++;
+ return INVALID;
+ }
+
+ switch (ipv4.protocol) {
+ case IPPROTO_ICMP:
+ return process_icmpv4(skb, dynptr, offset, metrics);
+
+ case IPPROTO_TCP:
+ return process_tcp(dynptr, offset, skb, &info, metrics);
+
+ case IPPROTO_UDP:
+ return process_udp(dynptr, offset, skb, &info, metrics);
+
+ default:
+ metrics->errors_total_unknown_l4_proto++;
+ return INVALID;
+ }
+}
+
+static verdict_t process_ipv6(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
+ __u64 *offset, metrics_t *metrics)
+{
+ struct ipv6hdr ipv6;
+ struct iphdr_info info = {
+ .hdr = &ipv6,
+ .len = sizeof(ipv6),
+ };
+ uint8_t l4_proto;
+ bool is_fragment;
+
+ metrics->l3_protocol_packets_total_ipv6++;
+
+ if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
+ metrics->errors_total_malformed_ip++;
+ return INVALID;
+ }
+
+ if (ipv6.version != 6) {
+ metrics->errors_total_malformed_ip++;
+ return INVALID;
+ }
+
+ if (is_fragment) {
+ metrics->errors_total_fragmented_ip++;
+ return INVALID;
+ }
+
+ switch (l4_proto) {
+ case IPPROTO_ICMPV6:
+ return process_icmpv6(dynptr, offset, skb, metrics);
+
+ case IPPROTO_TCP:
+ return process_tcp(dynptr, offset, skb, &info, metrics);
+
+ case IPPROTO_UDP:
+ return process_udp(dynptr, offset, skb, &info, metrics);
+
+ default:
+ metrics->errors_total_unknown_l4_proto++;
+ return INVALID;
+ }
+}
+
+SEC("tc")
+int cls_redirect(struct __sk_buff *skb)
+{
+ __u8 encap_buffer[sizeof(encap_headers_t)] = {};
+ struct bpf_dynptr dynptr;
+ struct in_addr next_hop;
+ /* Tracks offset of the dynptr. This will be unnecessary once
+ * bpf_dynptr_advance() is available.
+ */
+ __u64 off = 0;
+ ret_t ret;
+
+ bpf_dynptr_from_skb(skb, 0, &dynptr);
+
+ metrics_t *metrics = get_global_metrics();
+ if (metrics == NULL)
+ return TC_ACT_SHOT;
+
+ metrics->processed_packets_total++;
+
+ /* Pass bogus packets as long as we're not sure they're
+ * destined for us.
+ */
+ if (skb->protocol != bpf_htons(ETH_P_IP))
+ return TC_ACT_OK;
+
+ encap_headers_t *encap;
+
+ /* Make sure that all encapsulation headers are available in
+ * the linear portion of the skb. This makes it easy to manipulate them.
+ */
+ if (bpf_skb_pull_data(skb, sizeof(*encap)))
+ return TC_ACT_OK;
+
+ encap = bpf_dynptr_slice_rdwr(&dynptr, 0, encap_buffer, sizeof(encap_buffer));
+ if (!encap)
+ return TC_ACT_OK;
+
+ off += sizeof(*encap);
+
+ if (encap->ip.ihl != 5)
+ /* We never have any options. */
+ return TC_ACT_OK;
+
+ if (encap->ip.daddr != ENCAPSULATION_IP ||
+ encap->ip.protocol != IPPROTO_UDP)
+ return TC_ACT_OK;
+
+ /* TODO Check UDP length? */
+ if (encap->udp.dest != ENCAPSULATION_PORT)
+ return TC_ACT_OK;
+
+ /* We now know that the packet is destined to us, we can
+ * drop bogus ones.
+ */
+ if (ipv4_is_fragment((void *)&encap->ip)) {
+ metrics->errors_total_fragmented_ip++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->gue.variant != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->gue.control != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->gue.flags != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->gue.hlen !=
+ sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->unigue.version != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (encap->unigue.reserved != 0)
+ return TC_ACT_SHOT;
+
+ MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop));
+
+ if (next_hop.s_addr == 0) {
+ metrics->accepted_packets_total_last_hop++;
+ return accept_locally(skb, encap);
+ }
+
+ verdict_t verdict;
+ switch (encap->gue.proto_ctype) {
+ case IPPROTO_IPIP:
+ verdict = process_ipv4(skb, &dynptr, &off, metrics);
+ break;
+
+ case IPPROTO_IPV6:
+ verdict = process_ipv6(skb, &dynptr, &off, metrics);
+ break;
+
+ default:
+ metrics->errors_total_unknown_l3_proto++;
+ return TC_ACT_SHOT;
+ }
+
+ switch (verdict) {
+ case INVALID:
+ /* metrics have already been bumped */
+ return TC_ACT_SHOT;
+
+ case UNKNOWN:
+ return forward_to_next_hop(skb, &dynptr, encap, &next_hop, metrics);
+
+ case ECHO_REQUEST:
+ metrics->accepted_packets_total_icmp_echo_request++;
+ break;
+
+ case SYN:
+ if (encap->unigue.forward_syn) {
+ return forward_to_next_hop(skb, &dynptr, encap, &next_hop,
+ metrics);
+ }
+
+ metrics->accepted_packets_total_syn++;
+ break;
+
+ case SYN_COOKIE:
+ metrics->accepted_packets_total_syn_cookies++;
+ break;
+
+ case ESTABLISHED:
+ metrics->accepted_packets_total_established++;
+ break;
+ }
+
+ ret = accept_locally(skb, encap);
+
+ if (encap == encap_buffer)
+ bpf_dynptr_write(&dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);
+
+ return ret;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
new file mode 100644
index 000000000000..f997f5080748
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline_dynptr.c
@@ -0,0 +1,487 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017 Facebook
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <bpf/bpf_helpers.h>
+#include "test_iptunnel_common.h"
+#include <bpf/bpf_endian.h>
+
+#include "bpf_kfuncs.h"
+
+static __always_inline __u32 rol32(__u32 word, unsigned int shift)
+{
+ return (word << shift) | (word >> ((-shift) & 31));
+}
+
+/* copy paste of jhash from kernel sources to make sure llvm
+ * can compile it into valid sequence of bpf instructions
+ */
+#define __jhash_mix(a, b, c) \
+{ \
+ a -= c; a ^= rol32(c, 4); c += b; \
+ b -= a; b ^= rol32(a, 6); a += c; \
+ c -= b; c ^= rol32(b, 8); b += a; \
+ a -= c; a ^= rol32(c, 16); c += b; \
+ b -= a; b ^= rol32(a, 19); a += c; \
+ c -= b; c ^= rol32(b, 4); b += a; \
+}
+
+#define __jhash_final(a, b, c) \
+{ \
+ c ^= b; c -= rol32(b, 14); \
+ a ^= c; a -= rol32(c, 11); \
+ b ^= a; b -= rol32(a, 25); \
+ c ^= b; c -= rol32(b, 16); \
+ a ^= c; a -= rol32(c, 4); \
+ b ^= a; b -= rol32(a, 14); \
+ c ^= b; c -= rol32(b, 24); \
+}
+
+#define JHASH_INITVAL 0xdeadbeef
+
+typedef unsigned int u32;
+
+static __noinline u32 jhash(const void *key, u32 length, u32 initval)
+{
+ u32 a, b, c;
+ const unsigned char *k = key;
+
+ a = b = c = JHASH_INITVAL + length + initval;
+
+ while (length > 12) {
+ a += *(u32 *)(k);
+ b += *(u32 *)(k + 4);
+ c += *(u32 *)(k + 8);
+ __jhash_mix(a, b, c);
+ length -= 12;
+ k += 12;
+ }
+ switch (length) {
+ case 12: c += (u32)k[11]<<24;
+ case 11: c += (u32)k[10]<<16;
+ case 10: c += (u32)k[9]<<8;
+ case 9: c += k[8];
+ case 8: b += (u32)k[7]<<24;
+ case 7: b += (u32)k[6]<<16;
+ case 6: b += (u32)k[5]<<8;
+ case 5: b += k[4];
+ case 4: a += (u32)k[3]<<24;
+ case 3: a += (u32)k[2]<<16;
+ case 2: a += (u32)k[1]<<8;
+ case 1: a += k[0];
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break;
+ }
+
+ return c;
+}
+
+static __noinline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
+{
+ a += initval;
+ b += initval;
+ c += initval;
+ __jhash_final(a, b, c);
+ return c;
+}
+
+static __noinline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{
+ return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
+}
+
+#define PCKT_FRAGMENTED 65343
+#define IPV4_HDR_LEN_NO_OPT 20
+#define IPV4_PLUS_ICMP_HDR 28
+#define IPV6_PLUS_ICMP_HDR 48
+#define RING_SIZE 2
+#define MAX_VIPS 12
+#define MAX_REALS 5
+#define CTL_MAP_SIZE 16
+#define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE)
+#define F_IPV6 (1 << 0)
+#define F_HASH_NO_SRC_PORT (1 << 0)
+#define F_ICMP (1 << 0)
+#define F_SYN_SET (1 << 1)
+
+struct packet_description {
+ union {
+ __be32 src;
+ __be32 srcv6[4];
+ };
+ union {
+ __be32 dst;
+ __be32 dstv6[4];
+ };
+ union {
+ __u32 ports;
+ __u16 port16[2];
+ };
+ __u8 proto;
+ __u8 flags;
+};
+
+struct ctl_value {
+ union {
+ __u64 value;
+ __u32 ifindex;
+ __u8 mac[6];
+ };
+};
+
+struct vip_meta {
+ __u32 flags;
+ __u32 vip_num;
+};
+
+struct real_definition {
+ union {
+ __be32 dst;
+ __be32 dstv6[4];
+ };
+ __u8 flags;
+};
+
+struct vip_stats {
+ __u64 bytes;
+ __u64 pkts;
+};
+
+struct eth_hdr {
+ unsigned char eth_dest[ETH_ALEN];
+ unsigned char eth_source[ETH_ALEN];
+ unsigned short eth_proto;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, MAX_VIPS);
+ __type(key, struct vip);
+ __type(value, struct vip_meta);
+} vip_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, CH_RINGS_SIZE);
+ __type(key, __u32);
+ __type(value, __u32);
+} ch_rings SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, MAX_REALS);
+ __type(key, __u32);
+ __type(value, struct real_definition);
+} reals SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, MAX_VIPS);
+ __type(key, __u32);
+ __type(value, struct vip_stats);
+} stats SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, CTL_MAP_SIZE);
+ __type(key, __u32);
+ __type(value, struct ctl_value);
+} ctl_array SEC(".maps");
+
+static __noinline __u32 get_packet_hash(struct packet_description *pckt, bool ipv6)
+{
+ if (ipv6)
+ return jhash_2words(jhash(pckt->srcv6, 16, MAX_VIPS),
+ pckt->ports, CH_RINGS_SIZE);
+ else
+ return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE);
+}
+
+static __noinline bool get_packet_dst(struct real_definition **real,
+ struct packet_description *pckt,
+ struct vip_meta *vip_info,
+ bool is_ipv6)
+{
+ __u32 hash = get_packet_hash(pckt, is_ipv6);
+ __u32 key = RING_SIZE * vip_info->vip_num + hash % RING_SIZE;
+ __u32 *real_pos;
+
+ if (hash != 0x358459b7 /* jhash of ipv4 packet */ &&
+ hash != 0x2f4bc6bb /* jhash of ipv6 packet */)
+ return false;
+
+ real_pos = bpf_map_lookup_elem(&ch_rings, &key);
+ if (!real_pos)
+ return false;
+ key = *real_pos;
+ *real = bpf_map_lookup_elem(&reals, &key);
+ if (!(*real))
+ return false;
+ return true;
+}
+
+static __noinline int parse_icmpv6(struct bpf_dynptr *skb_ptr, __u64 off,
+ struct packet_description *pckt)
+{
+ __u8 buffer[sizeof(struct ipv6hdr)] = {};
+ struct icmp6hdr *icmp_hdr;
+ struct ipv6hdr *ip6h;
+
+ icmp_hdr = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
+ if (!icmp_hdr)
+ return TC_ACT_SHOT;
+
+ if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG)
+ return TC_ACT_OK;
+ off += sizeof(struct icmp6hdr);
+ ip6h = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
+ if (!ip6h)
+ return TC_ACT_SHOT;
+ pckt->proto = ip6h->nexthdr;
+ pckt->flags |= F_ICMP;
+ memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16);
+ memcpy(pckt->dstv6, ip6h->saddr.s6_addr32, 16);
+ return TC_ACT_UNSPEC;
+}
+
+static __noinline int parse_icmp(struct bpf_dynptr *skb_ptr, __u64 off,
+ struct packet_description *pckt)
+{
+ __u8 buffer_icmp[sizeof(struct iphdr)] = {};
+ __u8 buffer_ip[sizeof(struct iphdr)] = {};
+ struct icmphdr *icmp_hdr;
+ struct iphdr *iph;
+
+ icmp_hdr = bpf_dynptr_slice(skb_ptr, off, buffer_icmp, sizeof(buffer_icmp));
+ if (!icmp_hdr)
+ return TC_ACT_SHOT;
+ if (icmp_hdr->type != ICMP_DEST_UNREACH ||
+ icmp_hdr->code != ICMP_FRAG_NEEDED)
+ return TC_ACT_OK;
+ off += sizeof(struct icmphdr);
+ iph = bpf_dynptr_slice(skb_ptr, off, buffer_ip, sizeof(buffer_ip));
+ if (!iph || iph->ihl != 5)
+ return TC_ACT_SHOT;
+ pckt->proto = iph->protocol;
+ pckt->flags |= F_ICMP;
+ pckt->src = iph->daddr;
+ pckt->dst = iph->saddr;
+ return TC_ACT_UNSPEC;
+}
+
+static __noinline bool parse_udp(struct bpf_dynptr *skb_ptr, __u64 off,
+ struct packet_description *pckt)
+{
+ __u8 buffer[sizeof(struct udphdr)] = {};
+ struct udphdr *udp;
+
+ udp = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
+ if (!udp)
+ return false;
+
+ if (!(pckt->flags & F_ICMP)) {
+ pckt->port16[0] = udp->source;
+ pckt->port16[1] = udp->dest;
+ } else {
+ pckt->port16[0] = udp->dest;
+ pckt->port16[1] = udp->source;
+ }
+ return true;
+}
+
+static __noinline bool parse_tcp(struct bpf_dynptr *skb_ptr, __u64 off,
+ struct packet_description *pckt)
+{
+ __u8 buffer[sizeof(struct tcphdr)] = {};
+ struct tcphdr *tcp;
+
+ tcp = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
+ if (!tcp)
+ return false;
+
+ if (tcp->syn)
+ pckt->flags |= F_SYN_SET;
+
+ if (!(pckt->flags & F_ICMP)) {
+ pckt->port16[0] = tcp->source;
+ pckt->port16[1] = tcp->dest;
+ } else {
+ pckt->port16[0] = tcp->dest;
+ pckt->port16[1] = tcp->source;
+ }
+ return true;
+}
+
+static __noinline int process_packet(struct bpf_dynptr *skb_ptr,
+ struct eth_hdr *eth, __u64 off,
+ bool is_ipv6, struct __sk_buff *skb)
+{
+ struct packet_description pckt = {};
+ struct bpf_tunnel_key tkey = {};
+ struct vip_stats *data_stats;
+ struct real_definition *dst;
+ struct vip_meta *vip_info;
+ struct ctl_value *cval;
+ __u32 v4_intf_pos = 1;
+ __u32 v6_intf_pos = 2;
+ struct ipv6hdr *ip6h;
+ struct vip vip = {};
+ struct iphdr *iph;
+ int tun_flag = 0;
+ __u16 pkt_bytes;
+ __u64 iph_len;
+ __u32 ifindex;
+ __u8 protocol;
+ __u32 vip_num;
+ int action;
+
+ tkey.tunnel_ttl = 64;
+ if (is_ipv6) {
+ __u8 buffer[sizeof(struct ipv6hdr)] = {};
+
+ ip6h = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
+ if (!ip6h)
+ return TC_ACT_SHOT;
+
+ iph_len = sizeof(struct ipv6hdr);
+ protocol = ip6h->nexthdr;
+ pckt.proto = protocol;
+ pkt_bytes = bpf_ntohs(ip6h->payload_len);
+ off += iph_len;
+ if (protocol == IPPROTO_FRAGMENT) {
+ return TC_ACT_SHOT;
+ } else if (protocol == IPPROTO_ICMPV6) {
+ action = parse_icmpv6(skb_ptr, off, &pckt);
+ if (action >= 0)
+ return action;
+ off += IPV6_PLUS_ICMP_HDR;
+ } else {
+ memcpy(pckt.srcv6, ip6h->saddr.s6_addr32, 16);
+ memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16);
+ }
+ } else {
+ __u8 buffer[sizeof(struct iphdr)] = {};
+
+ iph = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
+ if (!iph || iph->ihl != 5)
+ return TC_ACT_SHOT;
+
+ protocol = iph->protocol;
+ pckt.proto = protocol;
+ pkt_bytes = bpf_ntohs(iph->tot_len);
+ off += IPV4_HDR_LEN_NO_OPT;
+
+ if (iph->frag_off & PCKT_FRAGMENTED)
+ return TC_ACT_SHOT;
+ if (protocol == IPPROTO_ICMP) {
+ action = parse_icmp(skb_ptr, off, &pckt);
+ if (action >= 0)
+ return action;
+ off += IPV4_PLUS_ICMP_HDR;
+ } else {
+ pckt.src = iph->saddr;
+ pckt.dst = iph->daddr;
+ }
+ }
+ protocol = pckt.proto;
+
+ if (protocol == IPPROTO_TCP) {
+ if (!parse_tcp(skb_ptr, off, &pckt))
+ return TC_ACT_SHOT;
+ } else if (protocol == IPPROTO_UDP) {
+ if (!parse_udp(skb_ptr, off, &pckt))
+ return TC_ACT_SHOT;
+ } else {
+ return TC_ACT_SHOT;
+ }
+
+ if (is_ipv6)
+ memcpy(vip.daddr.v6, pckt.dstv6, 16);
+ else
+ vip.daddr.v4 = pckt.dst;
+
+ vip.dport = pckt.port16[1];
+ vip.protocol = pckt.proto;
+ vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+ if (!vip_info) {
+ vip.dport = 0;
+ vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+ if (!vip_info)
+ return TC_ACT_SHOT;
+ pckt.port16[1] = 0;
+ }
+
+ if (vip_info->flags & F_HASH_NO_SRC_PORT)
+ pckt.port16[0] = 0;
+
+ if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6))
+ return TC_ACT_SHOT;
+
+ if (dst->flags & F_IPV6) {
+ cval = bpf_map_lookup_elem(&ctl_array, &v6_intf_pos);
+ if (!cval)
+ return TC_ACT_SHOT;
+ ifindex = cval->ifindex;
+ memcpy(tkey.remote_ipv6, dst->dstv6, 16);
+ tun_flag = BPF_F_TUNINFO_IPV6;
+ } else {
+ cval = bpf_map_lookup_elem(&ctl_array, &v4_intf_pos);
+ if (!cval)
+ return TC_ACT_SHOT;
+ ifindex = cval->ifindex;
+ tkey.remote_ipv4 = dst->dst;
+ }
+ vip_num = vip_info->vip_num;
+ data_stats = bpf_map_lookup_elem(&stats, &vip_num);
+ if (!data_stats)
+ return TC_ACT_SHOT;
+ data_stats->pkts++;
+ data_stats->bytes += pkt_bytes;
+ bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), tun_flag);
+ *(u32 *)eth->eth_dest = tkey.remote_ipv4;
+ return bpf_redirect(ifindex, 0);
+}
+
+SEC("tc")
+int balancer_ingress(struct __sk_buff *ctx)
+{
+ __u8 buffer[sizeof(struct eth_hdr)] = {};
+ struct bpf_dynptr ptr;
+ struct eth_hdr *eth;
+ __u32 eth_proto;
+ __u32 nh_off;
+ int err;
+
+ nh_off = sizeof(struct eth_hdr);
+
+ bpf_dynptr_from_skb(ctx, 0, &ptr);
+ eth = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer));
+ if (!eth)
+ return TC_ACT_SHOT;
+ eth_proto = eth->eth_proto;
+ if (eth_proto == bpf_htons(ETH_P_IP))
+ err = process_packet(&ptr, eth, nh_off, false, ctx);
+ else if (eth_proto == bpf_htons(ETH_P_IPV6))
+ err = process_packet(&ptr, eth, nh_off, true, ctx);
+ else
+ return TC_ACT_SHOT;
+
+ if (eth == buffer)
+ bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
+
+ return err;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_parse_tcp_hdr_opt.c b/tools/testing/selftests/bpf/progs/test_parse_tcp_hdr_opt.c
new file mode 100644
index 000000000000..79bab9b50e9e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_parse_tcp_hdr_opt.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* This parsing logic is taken from the open source library katran, a layer 4
+ * load balancer.
+ *
+ * This code logic using dynptrs can be found in test_parse_tcp_hdr_opt_dynptr.c
+ *
+ * https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
+ */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <linux/tcp.h>
+#include <stdbool.h>
+#include <linux/ipv6.h>
+#include <linux/if_ether.h>
+#include "test_tcp_hdr_options.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* Kind number used for experiments */
+const __u32 tcp_hdr_opt_kind_tpr = 0xFD;
+/* Length of the tcp header option */
+const __u32 tcp_hdr_opt_len_tpr = 6;
+/* maximum number of header options to check to lookup server_id */
+const __u32 tcp_hdr_opt_max_opt_checks = 15;
+
+__u32 server_id;
+
+struct hdr_opt_state {
+ __u32 server_id;
+ __u8 byte_offset;
+ __u8 hdr_bytes_remaining;
+};
+
+static int parse_hdr_opt(const struct xdp_md *xdp, struct hdr_opt_state *state)
+{
+ const void *data = (void *)(long)xdp->data;
+ const void *data_end = (void *)(long)xdp->data_end;
+ __u8 *tcp_opt, kind, hdr_len;
+
+ tcp_opt = (__u8 *)(data + state->byte_offset);
+ if (tcp_opt + 1 > data_end)
+ return -1;
+
+ kind = tcp_opt[0];
+
+ if (kind == TCPOPT_EOL)
+ return -1;
+
+ if (kind == TCPOPT_NOP) {
+ state->hdr_bytes_remaining--;
+ state->byte_offset++;
+ return 0;
+ }
+
+ if (state->hdr_bytes_remaining < 2 ||
+ tcp_opt + sizeof(__u8) + sizeof(__u8) > data_end)
+ return -1;
+
+ hdr_len = tcp_opt[1];
+ if (hdr_len > state->hdr_bytes_remaining)
+ return -1;
+
+ if (kind == tcp_hdr_opt_kind_tpr) {
+ if (hdr_len != tcp_hdr_opt_len_tpr)
+ return -1;
+
+ if (tcp_opt + tcp_hdr_opt_len_tpr > data_end)
+ return -1;
+
+ state->server_id = *(__u32 *)&tcp_opt[2];
+ return 1;
+ }
+
+ state->hdr_bytes_remaining -= hdr_len;
+ state->byte_offset += hdr_len;
+ return 0;
+}
+
+SEC("xdp")
+int xdp_ingress_v6(struct xdp_md *xdp)
+{
+ const void *data = (void *)(long)xdp->data;
+ const void *data_end = (void *)(long)xdp->data_end;
+ struct hdr_opt_state opt_state = {};
+ __u8 tcp_hdr_opt_len = 0;
+ struct tcphdr *tcp_hdr;
+ __u64 tcp_offset = 0;
+ __u32 off;
+ int err;
+
+ tcp_offset = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
+ tcp_hdr = (struct tcphdr *)(data + tcp_offset);
+ if (tcp_hdr + 1 > data_end)
+ return XDP_DROP;
+
+ tcp_hdr_opt_len = (tcp_hdr->doff * 4) - sizeof(struct tcphdr);
+ if (tcp_hdr_opt_len < tcp_hdr_opt_len_tpr)
+ return XDP_DROP;
+
+ opt_state.hdr_bytes_remaining = tcp_hdr_opt_len;
+ opt_state.byte_offset = sizeof(struct tcphdr) + tcp_offset;
+
+ /* max number of bytes of options in tcp header is 40 bytes */
+ for (int i = 0; i < tcp_hdr_opt_max_opt_checks; i++) {
+ err = parse_hdr_opt(xdp, &opt_state);
+
+ if (err || !opt_state.hdr_bytes_remaining)
+ break;
+ }
+
+ if (!opt_state.server_id)
+ return XDP_DROP;
+
+ server_id = opt_state.server_id;
+
+ return XDP_PASS;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_parse_tcp_hdr_opt_dynptr.c b/tools/testing/selftests/bpf/progs/test_parse_tcp_hdr_opt_dynptr.c
new file mode 100644
index 000000000000..d3b319722e30
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_parse_tcp_hdr_opt_dynptr.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* This logic is lifted from a real-world use case of packet parsing, used in
+ * the open source library katran, a layer 4 load balancer.
+ *
+ * This test demonstrates how to parse packet contents using dynptrs. The
+ * original code (parsing without dynptrs) can be found in test_parse_tcp_hdr_opt.c
+ */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <linux/tcp.h>
+#include <stdbool.h>
+#include <linux/ipv6.h>
+#include <linux/if_ether.h>
+#include "test_tcp_hdr_options.h"
+#include "bpf_kfuncs.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* Kind number used for experiments */
+const __u32 tcp_hdr_opt_kind_tpr = 0xFD;
+/* Length of the tcp header option */
+const __u32 tcp_hdr_opt_len_tpr = 6;
+/* maximum number of header options to check to lookup server_id */
+const __u32 tcp_hdr_opt_max_opt_checks = 15;
+
+__u32 server_id;
+
+static int parse_hdr_opt(struct bpf_dynptr *ptr, __u32 *off, __u8 *hdr_bytes_remaining,
+ __u32 *server_id)
+{
+ __u8 *tcp_opt, kind, hdr_len;
+ __u8 buffer[sizeof(kind) + sizeof(hdr_len) + sizeof(*server_id)];
+ __u8 *data;
+
+ __builtin_memset(buffer, 0, sizeof(buffer));
+
+ data = bpf_dynptr_slice(ptr, *off, buffer, sizeof(buffer));
+ if (!data)
+ return -1;
+
+ kind = data[0];
+
+ if (kind == TCPOPT_EOL)
+ return -1;
+
+ if (kind == TCPOPT_NOP) {
+ *off += 1;
+ *hdr_bytes_remaining -= 1;
+ return 0;
+ }
+
+ if (*hdr_bytes_remaining < 2)
+ return -1;
+
+ hdr_len = data[1];
+ if (hdr_len > *hdr_bytes_remaining)
+ return -1;
+
+ if (kind == tcp_hdr_opt_kind_tpr) {
+ if (hdr_len != tcp_hdr_opt_len_tpr)
+ return -1;
+
+ __builtin_memcpy(server_id, (__u32 *)(data + 2), sizeof(*server_id));
+ return 1;
+ }
+
+ *off += hdr_len;
+ *hdr_bytes_remaining -= hdr_len;
+ return 0;
+}
+
+SEC("xdp")
+int xdp_ingress_v6(struct xdp_md *xdp)
+{
+ __u8 buffer[sizeof(struct tcphdr)] = {};
+ __u8 hdr_bytes_remaining;
+ struct tcphdr *tcp_hdr;
+ __u8 tcp_hdr_opt_len;
+ int err = 0;
+ __u32 off;
+
+ struct bpf_dynptr ptr;
+
+ bpf_dynptr_from_xdp(xdp, 0, &ptr);
+
+ off = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
+
+ tcp_hdr = bpf_dynptr_slice(&ptr, off, buffer, sizeof(buffer));
+ if (!tcp_hdr)
+ return XDP_DROP;
+
+ tcp_hdr_opt_len = (tcp_hdr->doff * 4) - sizeof(struct tcphdr);
+ if (tcp_hdr_opt_len < tcp_hdr_opt_len_tpr)
+ return XDP_DROP;
+
+ hdr_bytes_remaining = tcp_hdr_opt_len;
+
+ off += sizeof(struct tcphdr);
+
+ /* max number of bytes of options in tcp header is 40 bytes */
+ for (int i = 0; i < tcp_hdr_opt_max_opt_checks; i++) {
+ err = parse_hdr_opt(&ptr, &off, &hdr_bytes_remaining, &server_id);
+
+ if (err || !hdr_bytes_remaining)
+ break;
+ }
+
+ if (!server_id)
+ return XDP_DROP;
+
+ return XDP_PASS;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_dynptr.c b/tools/testing/selftests/bpf/progs/test_xdp_dynptr.c
new file mode 100644
index 000000000000..7521a805b506
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_dynptr.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta */
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/pkt_cls.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "test_iptunnel_common.h"
+#include "bpf_kfuncs.h"
+
+const size_t tcphdr_sz = sizeof(struct tcphdr);
+const size_t udphdr_sz = sizeof(struct udphdr);
+const size_t ethhdr_sz = sizeof(struct ethhdr);
+const size_t iphdr_sz = sizeof(struct iphdr);
+const size_t ipv6hdr_sz = sizeof(struct ipv6hdr);
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 256);
+ __type(key, __u32);
+ __type(value, __u64);
+} rxcnt SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, MAX_IPTNL_ENTRIES);
+ __type(key, struct vip);
+ __type(value, struct iptnl_info);
+} vip2tnl SEC(".maps");
+
+static __always_inline void count_tx(__u32 protocol)
+{
+ __u64 *rxcnt_count;
+
+ rxcnt_count = bpf_map_lookup_elem(&rxcnt, &protocol);
+ if (rxcnt_count)
+ *rxcnt_count += 1;
+}
+
+static __always_inline int get_dport(void *trans_data, __u8 protocol)
+{
+ struct tcphdr *th;
+ struct udphdr *uh;
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)trans_data;
+ return th->dest;
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)trans_data;
+ return uh->dest;
+ default:
+ return 0;
+ }
+}
+
+static __always_inline void set_ethhdr(struct ethhdr *new_eth,
+ const struct ethhdr *old_eth,
+ const struct iptnl_info *tnl,
+ __be16 h_proto)
+{
+ memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source));
+ memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest));
+ new_eth->h_proto = h_proto;
+}
+
+static __always_inline int handle_ipv4(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
+{
+ __u8 eth_buffer[ethhdr_sz + iphdr_sz + ethhdr_sz];
+ __u8 iph_buffer_tcp[iphdr_sz + tcphdr_sz];
+ __u8 iph_buffer_udp[iphdr_sz + udphdr_sz];
+ struct bpf_dynptr new_xdp_ptr;
+ struct iptnl_info *tnl;
+ struct ethhdr *new_eth;
+ struct ethhdr *old_eth;
+ __u32 transport_hdr_sz;
+ struct iphdr *iph;
+ __u16 *next_iph;
+ __u16 payload_len;
+ struct vip vip = {};
+ int dport;
+ __u32 csum = 0;
+ int i;
+
+ __builtin_memset(eth_buffer, 0, sizeof(eth_buffer));
+ __builtin_memset(iph_buffer_tcp, 0, sizeof(iph_buffer_tcp));
+ __builtin_memset(iph_buffer_udp, 0, sizeof(iph_buffer_udp));
+
+ if (ethhdr_sz + iphdr_sz + tcphdr_sz > xdp->data_end - xdp->data)
+ iph = bpf_dynptr_slice(xdp_ptr, ethhdr_sz, iph_buffer_udp, sizeof(iph_buffer_udp));
+ else
+ iph = bpf_dynptr_slice(xdp_ptr, ethhdr_sz, iph_buffer_tcp, sizeof(iph_buffer_tcp));
+
+ if (!iph)
+ return XDP_DROP;
+
+ dport = get_dport(iph + 1, iph->protocol);
+ if (dport == -1)
+ return XDP_DROP;
+
+ vip.protocol = iph->protocol;
+ vip.family = AF_INET;
+ vip.daddr.v4 = iph->daddr;
+ vip.dport = dport;
+ payload_len = bpf_ntohs(iph->tot_len);
+
+ tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+ /* It only does v4-in-v4 */
+ if (!tnl || tnl->family != AF_INET)
+ return XDP_PASS;
+
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)iphdr_sz))
+ return XDP_DROP;
+
+ bpf_dynptr_from_xdp(xdp, 0, &new_xdp_ptr);
+ new_eth = bpf_dynptr_slice_rdwr(&new_xdp_ptr, 0, eth_buffer, sizeof(eth_buffer));
+ if (!new_eth)
+ return XDP_DROP;
+
+ iph = (struct iphdr *)(new_eth + 1);
+ old_eth = (struct ethhdr *)(iph + 1);
+
+ set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IP));
+
+ if (new_eth == eth_buffer)
+ bpf_dynptr_write(&new_xdp_ptr, 0, eth_buffer, sizeof(eth_buffer), 0);
+
+ iph->version = 4;
+ iph->ihl = iphdr_sz >> 2;
+ iph->frag_off = 0;
+ iph->protocol = IPPROTO_IPIP;
+ iph->check = 0;
+ iph->tos = 0;
+ iph->tot_len = bpf_htons(payload_len + iphdr_sz);
+ iph->daddr = tnl->daddr.v4;
+ iph->saddr = tnl->saddr.v4;
+ iph->ttl = 8;
+
+ next_iph = (__u16 *)iph;
+ for (i = 0; i < iphdr_sz >> 1; i++)
+ csum += *next_iph++;
+
+ iph->check = ~((csum & 0xffff) + (csum >> 16));
+
+ count_tx(vip.protocol);
+
+ return XDP_TX;
+}
+
+static __always_inline int handle_ipv6(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
+{
+ __u8 eth_buffer[ethhdr_sz + ipv6hdr_sz + ethhdr_sz];
+ __u8 ip6h_buffer_tcp[ipv6hdr_sz + tcphdr_sz];
+ __u8 ip6h_buffer_udp[ipv6hdr_sz + udphdr_sz];
+ struct bpf_dynptr new_xdp_ptr;
+ struct iptnl_info *tnl;
+ struct ethhdr *new_eth;
+ struct ethhdr *old_eth;
+ __u32 transport_hdr_sz;
+ struct ipv6hdr *ip6h;
+ __u16 payload_len;
+ struct vip vip = {};
+ int dport;
+
+ __builtin_memset(eth_buffer, 0, sizeof(eth_buffer));
+ __builtin_memset(ip6h_buffer_tcp, 0, sizeof(ip6h_buffer_tcp));
+ __builtin_memset(ip6h_buffer_udp, 0, sizeof(ip6h_buffer_udp));
+
+ if (ethhdr_sz + iphdr_sz + tcphdr_sz > xdp->data_end - xdp->data)
+ ip6h = bpf_dynptr_slice(xdp_ptr, ethhdr_sz, ip6h_buffer_udp, sizeof(ip6h_buffer_udp));
+ else
+ ip6h = bpf_dynptr_slice(xdp_ptr, ethhdr_sz, ip6h_buffer_tcp, sizeof(ip6h_buffer_tcp));
+
+ if (!ip6h)
+ return XDP_DROP;
+
+ dport = get_dport(ip6h + 1, ip6h->nexthdr);
+ if (dport == -1)
+ return XDP_DROP;
+
+ vip.protocol = ip6h->nexthdr;
+ vip.family = AF_INET6;
+ memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr));
+ vip.dport = dport;
+ payload_len = ip6h->payload_len;
+
+ tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+ /* It only does v6-in-v6 */
+ if (!tnl || tnl->family != AF_INET6)
+ return XDP_PASS;
+
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)ipv6hdr_sz))
+ return XDP_DROP;
+
+ bpf_dynptr_from_xdp(xdp, 0, &new_xdp_ptr);
+ new_eth = bpf_dynptr_slice_rdwr(&new_xdp_ptr, 0, eth_buffer, sizeof(eth_buffer));
+ if (!new_eth)
+ return XDP_DROP;
+
+ ip6h = (struct ipv6hdr *)(new_eth + 1);
+ old_eth = (struct ethhdr *)(ip6h + 1);
+
+ set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IPV6));
+
+ if (new_eth == eth_buffer)
+ bpf_dynptr_write(&new_xdp_ptr, 0, eth_buffer, sizeof(eth_buffer), 0);
+
+ ip6h->version = 6;
+ ip6h->priority = 0;
+ memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
+ ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + ipv6hdr_sz);
+ ip6h->nexthdr = IPPROTO_IPV6;
+ ip6h->hop_limit = 8;
+ memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
+ memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6));
+
+ count_tx(vip.protocol);
+
+ return XDP_TX;
+}
+
+SEC("xdp")
+int _xdp_tx_iptunnel(struct xdp_md *xdp)
+{
+ __u8 buffer[ethhdr_sz];
+ struct bpf_dynptr ptr;
+ struct ethhdr *eth;
+ __u16 h_proto;
+
+ __builtin_memset(buffer, 0, sizeof(buffer));
+
+ bpf_dynptr_from_xdp(xdp, 0, &ptr);
+ eth = bpf_dynptr_slice(&ptr, 0, buffer, sizeof(buffer));
+ if (!eth)
+ return XDP_DROP;
+
+ h_proto = eth->h_proto;
+
+ if (h_proto == bpf_htons(ETH_P_IP))
+ return handle_ipv4(xdp, &ptr);
+ else if (h_proto == bpf_htons(ETH_P_IPV6))
+
+ return handle_ipv6(xdp, &ptr);
+ else
+ return XDP_DROP;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_tcp_hdr_options.h b/tools/testing/selftests/bpf/test_tcp_hdr_options.h
index 6118e3ab61fc..56c9f8a3ad3d 100644
--- a/tools/testing/selftests/bpf/test_tcp_hdr_options.h
+++ b/tools/testing/selftests/bpf/test_tcp_hdr_options.h
@@ -50,6 +50,7 @@ struct linum_err {
#define TCPOPT_EOL 0
#define TCPOPT_NOP 1
+#define TCPOPT_MSS 2
#define TCPOPT_WINDOW 3
#define TCPOPT_EXP 254