aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/bpf-cgroup.h18
-rw-r--r--include/linux/bpf_types.h1
-rw-r--r--include/linux/filter.h9
-rw-r--r--include/net/tcp.h66
-rw-r--r--include/uapi/linux/bpf.h68
-rw-r--r--kernel/bpf/cgroup.c37
-rw-r--r--kernel/bpf/syscall.c5
-rw-r--r--net/core/filter.c284
-rw-r--r--net/ipv4/tcp.c2
-rw-r--r--net/ipv4/tcp_cong.c32
-rw-r--r--net/ipv4/tcp_fastopen.c1
-rw-r--r--net/ipv4/tcp_input.c9
-rw-r--r--net/ipv4/tcp_minisocks.c9
-rw-r--r--net/ipv4/tcp_output.c18
-rw-r--r--samples/bpf/Makefile9
-rw-r--r--samples/bpf/bpf_helpers.h3
-rw-r--r--samples/bpf/bpf_load.c13
-rw-r--r--samples/bpf/load_sock_ops.c97
-rw-r--r--samples/bpf/tcp_bufs_kern.c86
-rw-r--r--samples/bpf/tcp_clamp_kern.c102
-rw-r--r--samples/bpf/tcp_cong_kern.c83
-rw-r--r--samples/bpf/tcp_iw_kern.c88
-rw-r--r--samples/bpf/tcp_rwnd_kern.c69
-rw-r--r--samples/bpf/tcp_synrto_kern.c69
-rw-r--r--tools/include/uapi/linux/bpf.h66
25 files changed, 1218 insertions, 26 deletions
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index c970a25d2a49..360c082e885c 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -7,6 +7,7 @@
struct sock;
struct cgroup;
struct sk_buff;
+struct bpf_sock_ops_kern;
#ifdef CONFIG_CGROUP_BPF
@@ -42,6 +43,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum bpf_attach_type type);
+int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
+ struct bpf_sock_ops_kern *sock_ops,
+ enum bpf_attach_type type);
+
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
({ \
@@ -75,6 +80,18 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
__ret; \
})
+/* Run the cgroup sock_ops program, if one is attached, for the socket
+ * referenced by @sock_ops. Evaluates to 0 when cgroup BPF is disabled, when
+ * (sock_ops)->sk is NULL, or when the socket returned by sk_to_full_sk() is
+ * not a full socket; otherwise evaluates to the return value of
+ * __cgroup_bpf_run_filter_sock_ops().
+ *
+ * The local __sk takes its type from the macro argument itself, so the macro
+ * does not depend on a variable named 'sk' existing at the expansion site.
+ */
+#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled && (sock_ops)->sk) {			       \
+		typeof((sock_ops)->sk) __sk =				       \
+			sk_to_full_sk((sock_ops)->sk);			       \
+		if (sk_fullsock(__sk))					       \
+			__ret = __cgroup_bpf_run_filter_sock_ops(__sk,	       \
+								 sock_ops,     \
+							 BPF_CGROUP_SOCK_OPS); \
+	}								       \
+	__ret;								       \
+})
#else
struct cgroup_bpf {};
@@ -85,6 +102,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
#endif /* CONFIG_CGROUP_BPF */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 03bf223f18be..3d137c33d664 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -10,6 +10,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops)
BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops)
BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops)
BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops)
#endif
#ifdef CONFIG_BPF_EVENTS
BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1fa26dc562ce..738f8b14f025 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -898,4 +898,13 @@ static inline int bpf_tell_extensions(void)
return SKF_AD_MAX;
}
+/* Kernel-side context passed to sock_ops BPF programs when they are run. */
+struct bpf_sock_ops_kern {
+ struct sock *sk; /* socket the operation refers to */
+ u32 op; /* requested operation (BPF_SOCK_OPS_* value) */
+ union {
+ u32 reply; /* value read back by the caller after the run */
+ u32 replylong[4];
+ };
+};
+
#endif /* __LINUX_FILTER_H__ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d0751b79d99c..70483296157f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -46,6 +46,10 @@
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/bpf-cgroup.h>
+
extern struct inet_hashinfo tcp_hashinfo;
extern struct percpu_counter tcp_orphan_count;
@@ -1000,7 +1004,9 @@ void tcp_get_default_congestion_control(char *name);
void tcp_get_available_congestion_control(char *buf, size_t len);
void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed);
-int tcp_set_congestion_control(struct sock *sk, const char *name);
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load);
+void tcp_reinit_congestion_control(struct sock *sk,
+ const struct tcp_congestion_ops *ca);
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
@@ -2021,4 +2027,62 @@ int tcp_set_ulp(struct sock *sk, const char *name);
void tcp_get_available_ulp(char *buf, size_t len);
void tcp_cleanup_ulp(struct sock *sk);
+/* Run the cgroup BPF sock_ops program for @sk with operation @op and
+ * return its integer reply.
+ *
+ * With CONFIG_BPF: the context is zero-initialised, so the reply is 0 unless
+ * a program writes it. Returns the reply when BPF_CGROUP_RUN_PROG_SOCK_OPS()
+ * evaluates to 0, and -1 when it evaluates to anything else (the BPF op
+ * failed, for example because the loaded program does not support the
+ * chosen operation).
+ * Without CONFIG_BPF: always returns -EPERM.
+ *
+ * NOTE(review): when no program is attached, the visible code paths make
+ * BPF_CGROUP_RUN_PROG_SOCK_OPS() evaluate to 0, so the result is 0 rather
+ * than a negative value -- callers must treat 0 as "no answer" too.
+ */
+#ifdef CONFIG_BPF
+static inline int tcp_call_bpf(struct sock *sk, int op)
+{
+ struct bpf_sock_ops_kern sock_ops;
+ int ret;
+
+ /* Only full sockets are checked for being owned by the caller */
+ if (sk_fullsock(sk))
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, sizeof(sock_ops));
+ sock_ops.sk = sk;
+ sock_ops.op = op;
+
+ ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+ if (ret == 0)
+ ret = sock_ops.reply;
+ else
+ ret = -1;
+ return ret;
+}
+#else
+static inline int tcp_call_bpf(struct sock *sk, int op)
+{
+ return -EPERM;
+}
+#endif
+
+/* Initial retransmission timeout for @sk: the value supplied by the
+ * sock_ops BPF program when it is positive, TCP_TIMEOUT_INIT otherwise.
+ */
+static inline u32 tcp_timeout_init(struct sock *sk)
+{
+	int bpf_rto = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
+
+	/* A zero or negative reply means "no override" */
+	if (bpf_rto > 0)
+		return bpf_rto;
+	return TCP_TIMEOUT_INIT;
+}
+
+/* Initial receive window requested by the sock_ops BPF program for @sk.
+ * A negative reply is mapped to 0.
+ */
+static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
+{
+	int bpf_rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
+
+	return bpf_rwnd < 0 ? 0 : bpf_rwnd;
+}
+
+/* True only when the sock_ops BPF program answers exactly 1 to
+ * BPF_SOCK_OPS_NEEDS_ECN for @sk.
+ */
+static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
+{
+	int reply = tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN);
+
+	return reply == 1;
+}
#endif /* _TCP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f94b48b168dc..a6a91e5e96fc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -120,12 +120,14 @@ enum bpf_prog_type {
BPF_PROG_TYPE_LWT_IN,
BPF_PROG_TYPE_LWT_OUT,
BPF_PROG_TYPE_LWT_XMIT,
+ BPF_PROG_TYPE_SOCK_OPS,
};
enum bpf_attach_type {
BPF_CGROUP_INET_INGRESS,
BPF_CGROUP_INET_EGRESS,
BPF_CGROUP_INET_SOCK_CREATE,
+ BPF_CGROUP_SOCK_OPS,
__MAX_BPF_ATTACH_TYPE
};
@@ -518,6 +520,17 @@ union bpf_attr {
* Set full skb->hash.
* @skb: pointer to skb
* @hash: hash to set
+ *
+ * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
+ * Calls setsockopt. Not all opts are available, only those with
+ * integer optvals plus TCP_CONGESTION.
+ * Supported levels: SOL_SOCKET and IPPROTO_TCP
+ * @bpf_socket: pointer to bpf_socket
+ * @level: SOL_SOCKET or IPPROTO_TCP
+ * @optname: option name
+ * @optval: pointer to option value
+ * @optlen: length of optval in bytes
+ * Return: 0 or negative error
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -568,7 +581,8 @@ union bpf_attr {
FN(probe_read_str), \
FN(get_socket_cookie), \
FN(get_socket_uid), \
- FN(set_hash),
+ FN(set_hash), \
+ FN(setsockopt),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -720,4 +734,56 @@ struct bpf_map_info {
__u32 map_flags;
} __attribute__((aligned(8)));
+/* User bpf_sock_ops struct to access socket values and specify request ops
+ * and their replies.
+ * Some of these fields are in network (big-endian) byte order and may need
+ * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h).
+ * New fields can only be added at the end of this structure.
+ */
+struct bpf_sock_ops {
+ __u32 op;
+ union {
+ __u32 reply;
+ __u32 replylong[4];
+ };
+ __u32 family;
+ __u32 remote_ip4; /* Stored in network byte order */
+ __u32 local_ip4; /* Stored in network byte order */
+ __u32 remote_ip6[4]; /* Stored in network byte order */
+ __u32 local_ip6[4]; /* Stored in network byte order */
+ __u32 remote_port; /* Stored in network byte order */
+ __u32 local_port; /* stored in host byte order */
+};
+
+/* List of known BPF sock_ops operators.
+ * New entries can only be added at the end
+ */
+enum {
+ BPF_SOCK_OPS_VOID,
+ BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or
+ * -1 if default value should be used
+ */
+ BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized
+ * window (in packets) or -1 if default
+ * value should be used
+ */
+ BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an
+ * active connection is initialized
+ */
+ BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an
+ * active connection is
+ * established
+ */
+ BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a
+ * passive connection is
+ * established
+ */
+ BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control
+ * needs ECN
+ */
+};
+
+#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
+#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ea6033cba947..546113430049 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
+
+/**
+ * __cgroup_bpf_run_filter_sock_ops() - Run a sock_ops program for a socket
+ * @sk: socket whose cgroup selects the program to run
+ * @sock_ops: bpf_sock_ops_kern struct handed to the program. Its sk member
+ *            carries the connection information (IP addresses, etc.) and
+ *            may not contain cgroup info if it is a req sock.
+ * @type: The type of program to be executed
+ *
+ * The socket passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock_ops
+ * filtering. No further check is performed to assert that.
+ *
+ * Return: %-EPERM if an attached program was found and it returned a value
+ * other than 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
+				     struct bpf_sock_ops_kern *sock_ops,
+				     enum bpf_attach_type type)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_prog *prog;
+	int err = 0;
+
+	rcu_read_lock();
+
+	/* The effective program is looked up and run under RCU */
+	prog = rcu_dereference(cgrp->bpf.effective[type]);
+	if (prog && BPF_PROG_RUN(prog, sock_ops) != 1)
+		err = -EPERM;
+
+	rcu_read_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4409ccca8831..d4d47de75bba 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1079,6 +1079,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_INET_SOCK_CREATE:
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
break;
+ case BPF_CGROUP_SOCK_OPS:
+ ptype = BPF_PROG_TYPE_SOCK_OPS;
+ break;
default:
return -EINVAL;
}
@@ -1119,6 +1122,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_SOCK_OPS:
cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
@@ -1133,6 +1137,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return ret;
}
+
#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
diff --git a/net/core/filter.c b/net/core/filter.c
index b39c869d22e3..523b91d25025 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -54,6 +54,7 @@
#include <net/dst.h>
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
+#include <net/tcp.h>
/**
* sk_filter_trim_cap - run a packet through a socket filter
@@ -2672,6 +2673,110 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+/* bpf_setsockopt() helper: set one of a limited set of socket options on the
+ * full socket attached to the sock_ops context.
+ *
+ * SOL_SOCKET: SO_RCVBUF, SO_SNDBUF, SO_MAX_PACING_RATE, SO_PRIORITY,
+ * SO_RCVLOWAT and SO_MARK, each taking an int-sized optval.
+ * SOL_TCP (only for sockets whose setsockopt is tcp_setsockopt, and only
+ * with CONFIG_INET): TCP_CONGESTION (name string), TCP_BPF_IW and
+ * TCP_BPF_SNDCWND_CLAMP (int-sized optval).
+ *
+ * Returns 0 on success; -EINVAL for a non-full socket, an unsupported
+ * level/optname, a wrong optlen or an out-of-range value; or the error
+ * returned by tcp_set_congestion_control().
+ */
+BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+	   int, level, int, optname, char *, optval, int, optlen)
+{
+	struct sock *sk = bpf_sock->sk;
+	int ret = 0;
+	int val;
+
+	if (!sk_fullsock(sk))
+		return -EINVAL;
+
+	if (level == SOL_SOCKET) {
+		if (optlen != sizeof(int))
+			return -EINVAL;
+		val = *((int *)optval);
+
+		/* Only some socketops are supported */
+		switch (optname) {
+		case SO_RCVBUF:
+			/* NOTE(review): val * 2 is not range-checked here */
+			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+			sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+			break;
+		case SO_SNDBUF:
+			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+			sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+			break;
+		case SO_MAX_PACING_RATE:
+			sk->sk_max_pacing_rate = val;
+			sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+						 sk->sk_max_pacing_rate);
+			break;
+		case SO_PRIORITY:
+			sk->sk_priority = val;
+			break;
+		case SO_RCVLOWAT:
+			if (val < 0)
+				val = INT_MAX;
+			sk->sk_rcvlowat = val ? : 1;
+			break;
+		case SO_MARK:
+			sk->sk_mark = val;
+			break;
+		default:
+			ret = -EINVAL;
+		}
+	} else if (level == SOL_TCP &&
+		   sk->sk_prot->setsockopt == tcp_setsockopt) {
+#ifdef CONFIG_INET
+		if (optname == TCP_CONGESTION) {
+			char name[TCP_CA_NAME_MAX];
+			int len;
+
+			if (optlen <= 0)
+				return -EINVAL;
+
+			/* Terminate right after the copied bytes so that no
+			 * uninitialised part of name[] is ever read when
+			 * optval carries no NUL within optlen.
+			 */
+			len = min_t(int, optlen, TCP_CA_NAME_MAX - 1);
+			strncpy(name, optval, len);
+			name[len] = 0;
+			ret = tcp_set_congestion_control(sk, name, false);
+			if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
+				/* replacing an existing ca */
+				tcp_reinit_congestion_control(sk,
+						inet_csk(sk)->icsk_ca_ops);
+		} else {
+			struct tcp_sock *tp = tcp_sk(sk);
+
+			if (optlen != sizeof(int))
+				return -EINVAL;
+
+			val = *((int *)optval);
+			/* Only some options are supported */
+			switch (optname) {
+			case TCP_BPF_IW:
+				/* Only before any data segment was sent */
+				if (val <= 0 || tp->data_segs_out > 0)
+					ret = -EINVAL;
+				else
+					tp->snd_cwnd = val;
+				break;
+			case TCP_BPF_SNDCWND_CLAMP:
+				if (val <= 0) {
+					ret = -EINVAL;
+				} else {
+					tp->snd_cwnd_clamp = val;
+					tp->snd_ssthresh = val;
+				}
+				/* Must not fall through into default, which
+				 * would report -EINVAL for a valid request.
+				 */
+				break;
+			default:
+				ret = -EINVAL;
+			}
+		}
+#else
+		ret = -EINVAL;
+#endif
+	} else {
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/* Verifier prototype for bpf_setsockopt(): the program context, two
+ * unconstrained scalars (level, optname), then a memory pointer paired with
+ * its constant size (optval, optlen). GPL-only; returns an integer.
+ */
+static const struct bpf_func_proto bpf_setsockopt_proto = {
+ .func = bpf_setsockopt,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
static const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
{
@@ -2823,6 +2928,17 @@ lwt_inout_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
+ sock_ops_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_setsockopt:
+ return &bpf_setsockopt_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
lwt_xmit_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -3110,6 +3226,36 @@ void bpf_warn_invalid_xdp_action(u32 act)
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
+/* A context access is acceptable when it starts inside struct bpf_sock_ops,
+ * is aligned to its own size and is exactly one __u32 wide.
+ * The verifier guarantees that size > 0, so the modulo is safe.
+ */
+static bool __is_valid_sock_ops_access(int off, int size)
+{
+	return off >= 0 && off < sizeof(struct bpf_sock_ops) &&
+	       off % size == 0 &&
+	       size == sizeof(__u32);
+}
+
+/* Access check for sock_ops programs: writes are only accepted for offsets
+ * from 'op' up to and including 'replylong[3]'; every access, read or
+ * write, must additionally pass __is_valid_sock_ops_access().
+ */
+static bool sock_ops_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     struct bpf_insn_access_aux *info)
+{
+	const int wr_first = offsetof(struct bpf_sock_ops, op);
+	const int wr_last = offsetof(struct bpf_sock_ops, replylong[3]);
+
+	if (type == BPF_WRITE && (off < wr_first || off > wr_last))
+		return false;
+
+	return __is_valid_sock_ops_access(off, size);
+}
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -3379,6 +3525,138 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+/* Rewrite one program access to a struct bpf_sock_ops field, selected by
+ * si->off, into instructions that work on the kernel-side context
+ * (struct bpf_sock_ops_kern) or on the socket it points to.
+ *
+ * op/reply/replylong map directly onto the kernel struct with a single
+ * 32-bit load or store. All other fields are emitted as two loads: first
+ * the sk pointer from the context, then the field from struct sock_common.
+ * Offsets not handled below emit nothing.
+ *
+ * Returns the number of instructions written to @insn_buf.
+ */
+static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock_ops, op) ...
+ offsetof(struct bpf_sock_ops, replylong[3]):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) !=
+ FIELD_SIZEOF(struct bpf_sock_ops_kern, op));
+ BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) !=
+ FIELD_SIZEOF(struct bpf_sock_ops_kern, reply));
+ BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) !=
+ FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong));
+ /* Translate the user-struct offset into the kernel-struct one */
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, op);
+ off += offsetof(struct bpf_sock_ops_kern, op);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ off);
+ break;
+
+ case offsetof(struct bpf_sock_ops, family):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_family));
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_daddr));
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_ip4):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_rcv_saddr));
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
+ offsetof(struct bpf_sock_ops, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) != 4);
+
+ /* Byte offset of the requested word within the address */
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]) +
+ off);
+#else
+ /* Without IPv6 support the field reads as 0 */
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
+ offsetof(struct bpf_sock_ops, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+ /* Byte offset of the requested word within the address */
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]) +
+ off);
+#else
+ /* Without IPv6 support the field reads as 0 */
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, remote_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_dport));
+ /* NOTE(review): the 16-bit port is shifted into the upper half of
+ * the 32-bit result unless __BIG_ENDIAN_BITFIELD is defined --
+ * confirm this matches the byte order documented for remote_port.
+ */
+#ifndef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, local_port):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_num));
+ break;
+ }
+ return insn - insn_buf;
+}
+
const struct bpf_verifier_ops sk_filter_prog_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
@@ -3428,6 +3706,12 @@ const struct bpf_verifier_ops cg_sock_prog_ops = {
.convert_ctx_access = sock_filter_convert_ctx_access,
};
+/* Verifier callbacks for sock_ops programs: helper lookup, context access
+ * validation and context access rewriting.
+ */
+const struct bpf_verifier_ops sock_ops_prog_ops = {
+ .get_func_proto = sock_ops_func_proto,
+ .is_valid_access = sock_ops_is_valid_access,
+ .convert_ctx_access = sock_ops_convert_ctx_access,
+};
+
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fae45e402742..71ce33decd97 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2481,7 +2481,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
name[val] = 0;
lock_sock(sk);
- err = tcp_set_congestion_control(sk, name);
+ err = tcp_set_congestion_control(sk, name, true);
release_sock(sk);
return err;
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 324c9bcc5456..fde983f6376b 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk)
INET_ECN_dontxmit(sk);
}
-static void tcp_reinit_congestion_control(struct sock *sk,
- const struct tcp_congestion_ops *ca)
+void tcp_reinit_congestion_control(struct sock *sk,
+ const struct tcp_congestion_ops *ca)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -333,8 +333,12 @@ out:
return ret;
}
-/* Change congestion control for socket */
-int tcp_set_congestion_control(struct sock *sk, const char *name)
+/* Change congestion control for socket. If load is false, then it is the
+ * responsibility of the caller to call tcp_init_congestion_control or
+ * tcp_reinit_congestion_control (if the current congestion control was
+ * already initialized).
+ */
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_congestion_ops *ca;
@@ -344,21 +348,29 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
return -EPERM;
rcu_read_lock();
- ca = __tcp_ca_find_autoload(name);
+ if (!load)
+ ca = tcp_ca_find(name);
+ else
+ ca = __tcp_ca_find_autoload(name);
/* No change asking for existing value */
if (ca == icsk->icsk_ca_ops) {
icsk->icsk_ca_setsockopt = 1;
goto out;
}
- if (!ca)
+ if (!ca) {
err = -ENOENT;
- else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
- ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
+ } else if (!load) {
+ icsk->icsk_ca_ops = ca;
+ if (!try_module_get(ca->owner))
+ err = -EBUSY;
+ } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
+ ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
err = -EPERM;
- else if (!try_module_get(ca->owner))
+ } else if (!try_module_get(ca->owner)) {
err = -EBUSY;
- else
+ } else {
tcp_reinit_congestion_control(sk, ca);
+ }
out:
rcu_read_unlock();
return err;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 8b1539efaf38..ce9c7fef200f 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -221,6 +221,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tcp_init_congestion_control(child);
tcp_mtup_init(child);
tcp_init_metrics(child);
+ tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
tcp_init_buffer_space(child);
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2ab7e2fa9bb9..2920e0cb09f8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5571,7 +5571,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_metrics(sk);
-
+ tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
tcp_init_congestion_control(sk);
/* Prevent spurious tcp_cwnd_restart() on first data
@@ -5977,6 +5977,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
} else {
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
+ tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
tcp_init_congestion_control(sk);
tcp_mtup_init(sk);
@@ -6190,7 +6191,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
- (ecn_ok_dst & DST_FEATURE_ECN_CA))
+ (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
+ tcp_bpf_ca_needs_ecn((struct sock *)req))
inet_rsk(req)->ecn_ok = 1;
}
@@ -6406,7 +6408,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
} else {
tcp_rsk(req)->tfo_listener = false;
if (!want_cookie)
- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ inet_csk_reqsk_queue_hash_add(sk, req,
+ tcp_timeout_init((struct sock *)req));
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
TCP_SYNACK_COOKIE);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index d30ee31e94eb..0ff83c1637d8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -351,6 +351,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
int full_space = tcp_full_space(sk_listener);
u32 window_clamp;
__u8 rcv_wscale;
+ u32 rcv_wnd;
int mss;
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
@@ -363,6 +364,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
(req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
req->rsk_window_clamp = full_space;
+ rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req);
+ if (rcv_wnd == 0)
+ rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+ else if (full_space < rcv_wnd * mss)
+ full_space = rcv_wnd * mss;
+
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -370,7 +377,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
&req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
- dst_metric(dst, RTAX_INITRWND));
+ rcv_wnd);
ireq->rcv_wscale = rcv_wscale;
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1d79137f3795..4d36f0b093e6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -316,7 +316,8 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
if (!(tp->ecn_flags & TCP_ECN_OK))
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
- else if (tcp_ca_needs_ecn(sk))
+ else if (tcp_ca_needs_ecn(sk) ||
+ tcp_bpf_ca_needs_ecn(sk))
INET_ECN_xmit(sk);
}
@@ -324,8 +325,9 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
- tcp_ca_needs_ecn(sk);
+ tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
if (!use_ecn) {
const struct dst_entry *dst = __sk_dst_get(sk);
@@ -339,7 +341,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
if (use_ecn) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
- if (tcp_ca_needs_ecn(sk))
+ if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
INET_ECN_xmit(sk);
}
}
@@ -3266,6 +3268,7 @@ static void tcp_connect_init(struct sock *sk)
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
+ u32 rcv_wnd;
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -3299,13 +3302,17 @@ static void tcp_connect_init(struct sock *sk)
(tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
tp->window_clamp = tcp_full_space(sk);
+ rcv_wnd = tcp_rwnd_init_bpf(sk);
+ if (rcv_wnd == 0)
+ rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+
tcp_select_initial_window(tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,
sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
&rcv_wscale,
- dst_metric(dst, RTAX_INITRWND));
+ rcv_wnd);
tp->rx_opt.rcv_wscale = rcv_wscale;
tp->rcv_ssthresh = tp->rcv_wnd;
@@ -3326,7 +3333,7 @@ static void tcp_connect_init(struct sock *sk)
tp->rcv_wup = tp->rcv_nxt;
tp->copied_seq = tp->rcv_nxt;
- inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+ inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp);
}
@@ -3439,6 +3446,7 @@ int tcp_connect(struct sock *sk)
struct sk_buff *buff;
int err;
+ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);
tcp_connect_init(sk);
if (unlikely(tp->repair)) {
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index e7ec9b8539a5..9c650589e80f 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -36,6 +36,7 @@ hostprogs-y += lwt_len_hist
hostprogs-y += xdp_tx_iptunnel
hostprogs-y += test_map_in_map
hostprogs-y += per_socket_stats_example
+hostprogs-y += load_sock_ops
# Libbpf dependencies
LIBBPF := ../../tools/lib/bpf/bpf.o
@@ -52,6 +53,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
@@ -111,6 +113,12 @@ always += lwt_len_hist_kern.o
always += xdp_tx_iptunnel_kern.o
always += test_map_in_map_kern.o
always += cookie_uid_helper_example.o
+always += tcp_synrto_kern.o
+always += tcp_rwnd_kern.o
+always += tcp_bufs_kern.o
+always += tcp_cong_kern.o
+always += tcp_iw_kern.o
+always += tcp_clamp_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -130,6 +138,7 @@ HOSTLOADLIBES_tracex4 += -lelf -lrt
HOSTLOADLIBES_tracex5 += -lelf
HOSTLOADLIBES_tracex6 += -lelf
HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
+HOSTLOADLIBES_load_sock_ops += -lelf
HOSTLOADLIBES_test_probe_write_user += -lelf
HOSTLOADLIBES_trace_output += -lelf -lrt
HOSTLOADLIBES_lathist += -lelf
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index f4840b8bb8f9..d50ac342dc92 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -60,6 +60,9 @@ static unsigned long long (*bpf_get_prandom_u32)(void) =
(void *) BPF_FUNC_get_prandom_u32;
static int (*bpf_xdp_adjust_head)(void *ctx, int offset) =
(void *) BPF_FUNC_xdp_adjust_head;
+static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval,
+ int optlen) =
+ (void *) BPF_FUNC_setsockopt;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index a91c57dd8571..a4be7cfa6519 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -64,6 +64,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
+ bool is_sockops = strncmp(event, "sockops", 7) == 0;
size_t insns_cnt = size / sizeof(struct bpf_insn);
enum bpf_prog_type prog_type;
char buf[256];
@@ -89,6 +90,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_type = BPF_PROG_TYPE_CGROUP_SKB;
} else if (is_cgroup_sk) {
prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
+ } else if (is_sockops) {
+ prog_type = BPF_PROG_TYPE_SOCK_OPS;
} else {
printf("Unknown event '%s'\n", event);
return -1;
@@ -106,8 +109,11 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
return 0;
- if (is_socket) {
- event += 6;
+ if (is_socket || is_sockops) {
+ if (is_socket)
+ event += 6;
+ else
+ event += 7;
if (*event != '/')
return 0;
event++;
@@ -560,7 +566,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
memcmp(shname, "xdp", 3) == 0 ||
memcmp(shname, "perf_event", 10) == 0 ||
memcmp(shname, "socket", 6) == 0 ||
- memcmp(shname, "cgroup/", 7) == 0)
+ memcmp(shname, "cgroup/", 7) == 0 ||
+ memcmp(shname, "sockops", 7) == 0)
load_and_attach(shname, data->d_buf, data->d_size);
}
diff --git a/samples/bpf/load_sock_ops.c b/samples/bpf/load_sock_ops.c
new file mode 100644
index 000000000000..e5da6cf71a3e
--- /dev/null
+++ b/samples/bpf/load_sock_ops.c
@@ -0,0 +1,97 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/unistd.h>
+
+/* Print the command synopsis for this loader to stdout, then exit(1). */
+static void usage(char *pname)
+{
+	printf("USAGE:\n %s [-l] <cg-path> <prog filename>\n", pname);
+	fputs("\tLoad and attach a sock_ops program to the specified "
+	      "cgroup\n"
+	      "\tIf \"-l\" is used, the program will continue to run\n"
+	      "\tprinting the BPF log buffer\n"
+	      "\tIf the specified filename does not end in \".o\", it\n"
+	      "\tappends \"_kern.o\" to the name\n"
+	      "\n",
+	      stdout);
+	printf(" %s -r <cg-path>\n", pname);
+	fputs("\tDetaches the currently attached sock_ops program\n"
+	      "\tfrom the specified cgroup\n"
+	      "\n",
+	      stdout);
+	exit(1);
+}
+
+/* Load a sock_ops BPF object and attach it to a cgroup, or (with "-r")
+ * detach the sock_ops program currently attached to a cgroup.
+ *
+ * Exit status: 0 on success, 1 on usage error (via usage()), 2 if detach
+ * failed, 3 if the program name is too long, 4 if loading failed, 5 if
+ * attach failed, 6 if the cgroup path could not be opened.
+ */
+int main(int argc, char **argv)
+{
+	int logFlag = 0;
+	int error = 0;
+	size_t prog_len;
+	char *cg_path;
+	char fn[500];
+	char *prog;
+	int cg_fd;
+
+	if (argc < 3)
+		usage(argv[0]);
+
+	if (!strcmp(argv[1], "-r")) {
+		cg_path = argv[2];
+		/* Access mode belongs in the flags word, not in the
+		 * (unused here) mode argument.
+		 */
+		cg_fd = open(cg_path, O_DIRECTORY | O_RDONLY);
+		if (cg_fd < 0) {
+			printf("ERROR: open %s: %s\n", cg_path,
+			       strerror(errno));
+			return 6;
+		}
+		error = bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
+		if (error) {
+			printf("ERROR: bpf_prog_detach: %d (%s)\n",
+			       error, strerror(errno));
+			return 2;
+		}
+		return 0;
+	} else if (!strcmp(argv[1], "-h")) {
+		usage(argv[0]);
+	} else if (!strcmp(argv[1], "-l")) {
+		logFlag = 1;
+		if (argc < 4)
+			usage(argv[0]);
+	}
+
+	prog = argv[argc - 1];
+	cg_path = argv[argc - 2];
+	prog_len = strlen(prog);
+	if (prog_len > 480) {
+		fprintf(stderr, "ERROR: program name too long (> 480 chars)\n");
+		return 3;
+	}
+	cg_fd = open(cg_path, O_DIRECTORY | O_RDONLY);
+	if (cg_fd < 0) {
+		printf("ERROR: open %s: %s\n", cg_path, strerror(errno));
+		return 6;
+	}
+
+	/* Only look at the last two characters when they exist; a name
+	 * shorter than ".o" cannot carry the suffix.
+	 */
+	if (prog_len >= 2 && !strcmp(prog + prog_len - 2, ".o"))
+		snprintf(fn, sizeof(fn), "%s", prog);
+	else
+		snprintf(fn, sizeof(fn), "%s_kern.o", prog);
+	if (logFlag)
+		printf("loading bpf file:%s\n", fn);
+	if (load_bpf_file(fn)) {
+		printf("ERROR: load_bpf_file failed for: %s\n", fn);
+		printf("%s", bpf_log_buf);
+		return 4;
+	}
+	if (logFlag)
+		printf("TCP BPF Loaded %s\n", fn);
+
+	error = bpf_prog_attach(prog_fd[0], cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+	if (error) {
+		printf("ERROR: bpf_prog_attach: %d (%s)\n",
+		       error, strerror(errno));
+		return 5;
+	} else if (logFlag) {
+		read_trace_pipe();
+	}
+
+	return error;
+}
diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c
new file mode 100644
index 000000000000..ee83bbabd17c
--- /dev/null
+++ b/samples/bpf/tcp_bufs_kern.c
@@ -0,0 +1,86 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial receive window to 40 packets and send
+ * and receive buffers to 1.5MB. This would usually be done after
+ * doing appropriate checks that indicate the hosts are far enough
+ * away (i.e. large RTT).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+/* printf-style debug helper: copies the literal format string into a local
+ * char array and passes it, with its size and any extra arguments, to
+ * bpf_trace_printk().
+ */
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+/* sock_ops sample: for connections where either port is 55601, reply with an
+ * initial receive window of 40 packets and set SO_SNDBUF/SO_RCVBUF to 1.5MB
+ * on active and passive connections.
+ *
+ * The per-op answer is stored in skops->reply; -1 is stored for ops this
+ * program does not handle.
+ */
+SEC("sockops")
+int bpf_bufs(struct bpf_sock_ops *skops)
+{
+	int bufsize = 1500000;
+	int rwnd_init = 40;
+	int rv = 0;
+	int op;
+
+	/* For testing purposes, only execute rest of BPF program
+	 * if either port number is 55601
+	 */
+	if (bpf_ntohl(skops->remote_port) != 55601 &&
+	    skops->local_port != 55601)
+		return -1;
+
+	op = (int) skops->op;
+
+#ifdef DEBUG
+	/* Trace the requested op on entry, as the sibling samples do */
+	bpf_printk("BPF command: %d\n", op);
+#endif
+
+	/* Usually there would be a check to ensure the hosts are far
+	 * from each other so it makes sense to increase buffer sizes
+	 */
+	switch (op) {
+	case BPF_SOCK_OPS_RWND_INIT:
+		rv = rwnd_init;
+		break;
+	case BPF_SOCK_OPS_TCP_CONNECT_CB:
+		/* Set sndbuf and rcvbuf of active connections; rv packs
+		 * both results as (first * 100 + second)
+		 */
+		rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+				    sizeof(bufsize));
+		rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+					     &bufsize, sizeof(bufsize));
+		break;
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+		/* Nothing to do */
+		break;
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		/* Set sndbuf and rcvbuf of passive connections */
+		rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+				    sizeof(bufsize));
+		rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+					     &bufsize, sizeof(bufsize));
+		break;
+	default:
+		rv = -1;
+	}
+#ifdef DEBUG
+	bpf_printk("Returning %d\n", rv);
+#endif
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c
new file mode 100644
index 000000000000..d68eadd9ca2d
--- /dev/null
+++ b/samples/bpf/tcp_clamp_kern.c
@@ -0,0 +1,102 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample BPF program to set send and receive buffers to 150KB, sndcwnd clamp
+ * to 100 packets and SYN and SYN_ACK RTOs to 10ms when both hosts are within
+ * the same datacenter. For this example, we assume they are within the same
+ * datacenter when the first 5.5 bytes of their IPv6 addresses are the same.
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+/* printf-style debug helper: copies the literal format string into a local
+ * char array and passes it, with its size and any extra arguments, to
+ * bpf_trace_printk().
+ */
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+/* sock_ops sample: when both endpoints are IPv6 and share the same leading
+ * 5.5 address bytes, reply 10 to TIMEOUT_INIT, set SO_SNDBUF/SO_RCVBUF to
+ * 150000 and clamp the send cwnd to 100 via TCP_BPF_SNDCWND_CLAMP.
+ * The per-op answer is stored in skops->reply (-1 when nothing applies).
+ */
+SEC("sockops")
+int bpf_clamp(struct bpf_sock_ops *skops)
+{
+ int bufsize = 150000;
+ int to_init = 10;
+ int clamp = 100;
+ int rv = 0;
+ int op;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if either port number is 55601
+ */
+ if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601)
+ return -1;
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Check that both hosts are within same datacenter. For this example
+ * it is the case when the first 5.5 bytes of their IPv6 addresses are
+ * the same.
+ */
+ if (skops->family == AF_INET6 &&
+ skops->local_ip6[0] == skops->remote_ip6[0] &&
+ (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+ (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
+ switch (op) {
+ case BPF_SOCK_OPS_TIMEOUT_INIT:
+ rv = to_init;
+ break;
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ /* Set sndbuf and rcvbuf of active connections; rv packs
+ * the results as (previous * 100 + next)
+ */
+ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF,
+ &bufsize, sizeof(bufsize));
+ rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
+ SO_RCVBUF, &bufsize,
+ sizeof(bufsize));
+ break;
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ /* Clamp the send cwnd of active connections */
+ rv = bpf_setsockopt(skops, SOL_TCP,
+ TCP_BPF_SNDCWND_CLAMP,
+ &clamp, sizeof(clamp));
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ /* Set send cwnd clamp, sndbuf and rcvbuf of passive
+ * connections
+ */
+ rv = bpf_setsockopt(skops, SOL_TCP,
+ TCP_BPF_SNDCWND_CLAMP,
+ &clamp, sizeof(clamp));
+ rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
+ SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+ rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
+ SO_RCVBUF, &bufsize,
+ sizeof(bufsize));
+ break;
+ default:
+ rv = -1;
+ }
+ } else {
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c
new file mode 100644
index 000000000000..dac15bce1fa9
--- /dev/null
+++ b/samples/bpf/tcp_cong_kern.c
@@ -0,0 +1,83 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set congestion control to dctcp when both hosts are
+ * in the same datacenter (as determined by IPv6 prefix).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+/* printf-style debug helper: copies the literal format string into a local
+ * char array and passes it, with its size and any extra arguments, to
+ * bpf_trace_printk().
+ */
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+/* sock_ops sample: when both endpoints are IPv6 and share the same leading
+ * 5.5 address bytes, reply 1 to NEEDS_ECN and set TCP_CONGESTION to "dctcp"
+ * once the connection is established (active or passive).
+ * The per-op answer is stored in skops->reply (-1 when nothing applies).
+ */
+SEC("sockops")
+int bpf_cong(struct bpf_sock_ops *skops)
+{
+ char cong[] = "dctcp";
+ int rv = 0;
+ int op;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if either port number is 55601
+ */
+ if (bpf_ntohl(skops->remote_port) != 55601 &&
+ skops->local_port != 55601)
+ return -1;
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Check if both hosts are in the same datacenter. For this
+ * example they are if the 1st 5.5 bytes in the IPv6 address
+ * are the same.
+ */
+ if (skops->family == AF_INET6 &&
+ skops->local_ip6[0] == skops->remote_ip6[0] &&
+ (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+ (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
+ switch (op) {
+ case BPF_SOCK_OPS_NEEDS_ECN:
+ rv = 1;
+ break;
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ /* sizeof(cong) includes the terminating NUL */
+ rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
+ cong, sizeof(cong));
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
+ cong, sizeof(cong));
+ break;
+ default:
+ rv = -1;
+ }
+ } else {
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c
new file mode 100644
index 000000000000..23c5122ef819
--- /dev/null
+++ b/samples/bpf/tcp_iw_kern.c
@@ -0,0 +1,88 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial congestion window and initial receive
+ * window to 40 packets and send and receive buffers to 1.5MB. This
+ * would usually be done after doing appropriate checks that indicate
+ * the hosts are far enough away (i.e. large RTT).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+/* printf-style debug helper: copies the literal format string into a local
+ * char array and passes it, with its size and any extra arguments, to
+ * bpf_trace_printk().
+ */
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+/* sock_ops sample: for connections where either port is 55601, reply 40 to
+ * RWND_INIT, set SO_SNDBUF/SO_RCVBUF to 1500000 and set TCP_BPF_IW to 40 on
+ * actively established connections.
+ * The per-op answer is stored in skops->reply (-1 for unhandled ops).
+ */
+SEC("sockops")
+int bpf_iw(struct bpf_sock_ops *skops)
+{
+ int bufsize = 1500000;
+ int rwnd_init = 40;
+ int iw = 40;
+ int rv = 0;
+ int op;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if either port number is 55601
+ */
+ if (bpf_ntohl(skops->remote_port) != 55601 &&
+ skops->local_port != 55601)
+ return -1;
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Usually there would be a check to ensure the hosts are far
+ * from each other so it makes sense to increase buffer sizes
+ */
+ switch (op) {
+ case BPF_SOCK_OPS_RWND_INIT:
+ rv = rwnd_init;
+ break;
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ /* Set sndbuf and rcvbuf of active connections; rv packs
+ * both results as (first * 100 + second)
+ */
+ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+ rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
+ break;
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ /* Set initial congestion window of active connections */
+ rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw,
+ sizeof(iw));
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ /* Set sndbuf and rcvbuf of passive connections */
+ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+ rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
+ break;
+ default:
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c
new file mode 100644
index 000000000000..3f2a228f81ce
--- /dev/null
+++ b/samples/bpf/tcp_rwnd_kern.c
@@ -0,0 +1,69 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial receive window to 40 packets when using IPv6
+ * and the first 5.5 bytes of the IPv6 addresses are not the same (in this
+ * example that means both hosts are not in the same datacenter).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+/* printf-style debug helper: copies the literal format string into a local
+ * char array and passes it, with its size and any extra arguments, to
+ * bpf_trace_printk().
+ */
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+/* sock_ops sample: for connections where either port is 55601, reply with an
+ * initial receive window of 40 packets to RWND_INIT when the connection is
+ * IPv6 and the two addresses differ within their first 5.5 bytes.
+ * The answer is stored in skops->reply; -1 is stored otherwise.
+ */
+SEC("sockops")
+int bpf_rwnd(struct bpf_sock_ops *skops)
+{
+	int rv = -1;
+	int op;
+
+	/* For testing purposes, only execute rest of BPF program
+	 * if either port number is 55601
+	 */
+	if (bpf_ntohl(skops->remote_port) != 55601 &&
+	    skops->local_port != 55601)
+		return -1;
+
+	op = (int) skops->op;
+
+#ifdef DEBUG
+	bpf_printk("BPF command: %d\n", op);
+#endif
+
+	/* Check for RWND_INIT operation and IPv6 addresses */
+	if (op == BPF_SOCK_OPS_RWND_INIT &&
+	    skops->family == AF_INET6) {
+
+		/* If the first 5.5 bytes of the IPv6 address are not the same
+		 * then both hosts are not in the same datacenter
+		 * so use a larger initial advertized window (40 packets).
+		 * 5.5 bytes = all of word 0 plus the top 12 bits of word 1,
+		 * hence the 0xfff00000 mask (same prefix as the other
+		 * samples).
+		 */
+		if (skops->local_ip6[0] != skops->remote_ip6[0] ||
+		    (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) !=
+		    (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000))
+			rv = 40;
+	}
+#ifdef DEBUG
+	bpf_printk("Returning %d\n", rv);
+#endif
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c
new file mode 100644
index 000000000000..3c3fc83d81cb
--- /dev/null
+++ b/samples/bpf/tcp_synrto_kern.c
@@ -0,0 +1,69 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set SYN and SYN-ACK RTOs to 10ms when using IPv6 addresses
+ * and the first 5.5 bytes of the IPv6 addresses are the same (in this example
+ * that means both hosts are in the same datacenter).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+/* printf-style debug helper: copies the literal format string into a local
+ * char array and passes it, with its size and any extra arguments, to
+ * bpf_trace_printk().
+ */
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+/* sock_ops sample: for connections where either port is 55601, reply 10 to
+ * TIMEOUT_INIT when the connection is IPv6 and both addresses share the same
+ * first 5.5 bytes. The answer is stored in skops->reply; -1 is stored
+ * otherwise.
+ */
+SEC("sockops")
+int bpf_synrto(struct bpf_sock_ops *skops)
+{
+ int rv = -1;
+ int op;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if either port number is 55601
+ */
+ if (bpf_ntohl(skops->remote_port) != 55601 &&
+ skops->local_port != 55601)
+ return -1;
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Check for TIMEOUT_INIT operation and IPv6 addresses */
+ if (op == BPF_SOCK_OPS_TIMEOUT_INIT &&
+ skops->family == AF_INET6) {
+
+ /* If the first 5.5 bytes of the IPv6 address are the same
+ * then both hosts are in the same datacenter
+ * so use an RTO of 10ms
+ */
+ if (skops->local_ip6[0] == skops->remote_ip6[0] &&
+ (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+ (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000))
+ rv = 10;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f94b48b168dc..284b3661f1df 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -120,12 +120,14 @@ enum bpf_prog_type {
BPF_PROG_TYPE_LWT_IN,
BPF_PROG_TYPE_LWT_OUT,
BPF_PROG_TYPE_LWT_XMIT,
+ BPF_PROG_TYPE_SOCK_OPS,
};
enum bpf_attach_type {
BPF_CGROUP_INET_INGRESS,
BPF_CGROUP_INET_EGRESS,
BPF_CGROUP_INET_SOCK_CREATE,
+ BPF_CGROUP_SOCK_OPS,
__MAX_BPF_ATTACH_TYPE
};
@@ -518,6 +520,17 @@ union bpf_attr {
* Set full skb->hash.
* @skb: pointer to skb
* @hash: hash to set
+ *
+ * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
+ * Calls setsockopt. Not all opts are available, only those with
+ * integer optvals plus TCP_CONGESTION.
+ * Supported levels: SOL_SOCKET and IPPROTO_TCP
+ * @bpf_socket: pointer to bpf_socket
+ * @level: SOL_SOCKET or IPPROTO_TCP
+ * @optname: option name
+ * @optval: pointer to option value
+ * @optlen: length of optval in bytes
+ * Return: 0 or negative error
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -568,7 +581,8 @@ union bpf_attr {
FN(probe_read_str), \
FN(get_socket_cookie), \
FN(get_socket_uid), \
- FN(set_hash),
+ FN(set_hash), \
+ FN(setsockopt),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -720,4 +734,54 @@ struct bpf_map_info {
__u32 map_flags;
} __attribute__((aligned(8)));
+/* User bpf_sock_ops struct to access socket values and specify request ops
+ * and their replies.
+ * New fields can only be added at the end of this structure
+ *
+ * NOTE(review): byte-order remarks below are inferred from how the sample
+ * programs use the fields (bpf_ntohl() on remote_port and on the IPv6
+ * words, direct comparison of local_port) - confirm against the kernel
+ * side that fills this structure.
+ */
+struct bpf_sock_ops {
+ __u32 op; /* requested operation, a BPF_SOCK_OPS_* value */
+ union {
+ __u32 reply; /* program's answer for the op */
+ __u32 replylong[4];
+ };
+ __u32 family; /* address family, e.g. AF_INET6 */
+ __u32 remote_ip4;
+ __u32 local_ip4;
+ __u32 remote_ip6[4]; /* presumably network byte order per word */
+ __u32 local_ip6[4]; /* presumably network byte order per word */
+ __u32 remote_port; /* presumably network byte order */
+ __u32 local_port; /* presumably host byte order */
+};
+
+/* List of known BPF sock_ops operators.
+ * New entries can only be added at the end
+ *
+ * The operator is passed to the program in bpf_sock_ops.op; where an
+ * operator expects a value back, the program stores it in
+ * bpf_sock_ops.reply.
+ */
+enum {
+ BPF_SOCK_OPS_VOID,
+ BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or
+ * -1 if default value should be used
+ */
+ BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized
+ * window (in packets) or -1 if default
+ * value should be used
+ */
+ BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an
+ * active connection is initialized
+ */
+ BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an
+ * active connection is
+ * established
+ */
+ BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a
+ * passive connection is
+ * established
+ */
+ BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control
+ * needs ECN
+ */
+};
+
+#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
+#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */
+
#endif /* _UAPI__LINUX_BPF_H__ */