From e9cdced78dc20c1592c1fb98ed064943007a46c5 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:14 -0800 Subject: net: Make sock protocol value checks more specific SK_PROTOCOL_MAX is only used in two places, for DECNet and AX.25. The limits have more to do with the those protocol definitions than they do with the data type of sk_protocol, so remove SK_PROTOCOL_MAX and use U8_MAX directly. Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/sock.h | 1 - net/ax25/af_ax25.c | 2 +- net/decnet/af_decnet.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 8dff68b4c316..091e55428415 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -458,7 +458,6 @@ struct sock { sk_userlocks : 4, sk_protocol : 8, sk_type : 16; -#define SK_PROTOCOL_MAX U8_MAX u16 sk_gso_max_segs; u8 sk_pacing_shift; unsigned long sk_lingertime; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 324306d6fde0..ff57ea89c27e 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -808,7 +808,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, struct sock *sk; ax25_cb *ax25; - if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + if (protocol < 0 || protocol > U8_MAX) return -EINVAL; if (!net_eq(net, &init_net)) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index e19a92a62e14..0a46ea3bddd5 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -670,7 +670,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; - if (protocol < 0 || protocol > SK_PROTOCOL_MAX) + if (protocol < 0 || protocol > U8_MAX) return -EINVAL; if (!net_eq(net, &init_net)) -- cgit From bf9765145b856fa2e238a5b8a54453795ba30ad6 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:15 -0800 Subject: sock: Make sk_protocol a 16-bit value Match the 16-bit width of skbuff->protocol. Fills an 8-bit hole so sizeof(struct sock) does not change. Also take care of BPF field access for sk_type/sk_protocol. Both of them are now outside the bitfield, so we can use load instructions without further shifting/masking. v5 -> v6: - update eBPF accessors, too (Intel's kbuild test robot) v2 -> v3: - keep 'sk_type' 2 bytes aligned (Eric) v1 -> v2: - preserve sk_pacing_shift as bit field (Eric) Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: bpf@vger.kernel.org Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/sock.h | 25 ++++--------------- include/trace/events/sock.h | 2 +- net/core/filter.c | 60 +++++++++++++++++---------------------------- 3 files changed, 28 insertions(+), 59 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 091e55428415..8766f9bc3e70 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -436,30 +436,15 @@ struct sock { * Because of non atomicity rules, all * changes are protected by socket lock. */ - unsigned int __sk_flags_offset[0]; -#ifdef __BIG_ENDIAN_BITFIELD -#define SK_FL_PROTO_SHIFT 16 -#define SK_FL_PROTO_MASK 0x00ff0000 - -#define SK_FL_TYPE_SHIFT 0 -#define SK_FL_TYPE_MASK 0x0000ffff -#else -#define SK_FL_PROTO_SHIFT 8 -#define SK_FL_PROTO_MASK 0x0000ff00 - -#define SK_FL_TYPE_SHIFT 16 -#define SK_FL_TYPE_MASK 0xffff0000 -#endif - - unsigned int sk_padding : 1, + u8 sk_padding : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1, - sk_userlocks : 4, - sk_protocol : 8, - sk_type : 16; - u16 sk_gso_max_segs; + sk_userlocks : 4; u8 sk_pacing_shift; + u16 sk_type; + u16 sk_protocol; + u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 51fe9f6719eb..3ff12b90048d 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -147,7 +147,7 @@ TRACE_EVENT(inet_sock_set_state, __field(__u16, sport) __field(__u16, dport) __field(__u16, family) - __field(__u8, protocol) + __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) diff --git a/net/core/filter.c b/net/core/filter.c index 42fd17c48c5f..ef01c5599501 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7607,21 +7607,21 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock, type): - BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); - *target_size = 2; + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_type), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_type, + sizeof_field(struct sock, sk_type), + target_size)); break; case offsetof(struct bpf_sock, protocol): - BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); - *target_size = 1; + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_protocol), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_protocol, + sizeof_field(struct sock, sk_protocol), + target_size)); break; case offsetof(struct bpf_sock, src_ip4): @@ -7903,20 +7903,13 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock_addr, type): - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sock, sk, - __sk_flags_offset, BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_type); break; case offsetof(struct bpf_sock_addr, protocol): - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( - struct bpf_sock_addr_kern, struct sock, sk, - __sk_flags_offset, BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, - SK_FL_PROTO_SHIFT); + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_protocol); break; case offsetof(struct bpf_sock_addr, msg_src_ip4): @@ -8835,11 +8828,11 @@ sk_reuseport_is_valid_access(int off, int size, skb, \ SKB_FIELD) -#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ - SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ - struct sock, \ - sk, \ - SK_FIELD, BPF_SIZE, EXTRA_OFF) +#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD) static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, @@ -8863,16 +8856,7 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct sk_reuseport_md, ip_protocol): - BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); - SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, - BPF_W, 0); - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, - SK_FL_PROTO_SHIFT); - /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian - * aware. No further narrowing or masking is needed. - */ - *target_size = 1; + SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); break; case offsetof(struct sk_reuseport_md, data_end): -- cgit From faf391c3826cd29feae02078ca2022d2f912f7cc Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:16 -0800 Subject: tcp: Define IPPROTO_MPTCP To open a MPTCP socket with socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP), IPPROTO_MPTCP needs a value that differs from IPPROTO_TCP. The existing IPPROTO numbers mostly map directly to IANA-specified protocol numbers. MPTCP does not have a protocol number allocated because MPTCP packets use the TCP protocol number. Use private number not used OTA. Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/trace/events/sock.h | 3 ++- include/uapi/linux/in.h | 2 ++ tools/include/uapi/linux/in.h | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 3ff12b90048d..a966d4b5ab37 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -19,7 +19,8 @@ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_DCCP) \ - EMe(IPPROTO_SCTP) + EM(IPPROTO_SCTP) \ + EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index e7ad9d350a28..1521073b6348 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -76,6 +76,8 @@ enum { #define IPPROTO_MPLS IPPROTO_MPLS IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW + IPPROTO_MPTCP = 262, /* Multipath TCP connection */ +#define IPPROTO_MPTCP IPPROTO_MPTCP IPPROTO_MAX }; #endif diff --git a/tools/include/uapi/linux/in.h b/tools/include/uapi/linux/in.h index e7ad9d350a28..1521073b6348 100644 --- a/tools/include/uapi/linux/in.h +++ b/tools/include/uapi/linux/in.h @@ -76,6 +76,8 @@ enum { #define IPPROTO_MPLS IPPROTO_MPLS IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW + IPPROTO_MPTCP = 262, /* Multipath TCP connection */ +#define IPPROTO_MPTCP IPPROTO_MPTCP IPPROTO_MAX }; #endif -- cgit From c74a39c861aeaf6b789b7abdbb3256f54c9fb365 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:17 -0800 Subject: tcp: Add MPTCP option number TCP option 30 is allocated for MPTCP by the IANA. Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index 7df37e2fddca..85f1d7ff6e8b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -182,6 +182,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ +#define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP -- cgit From 1323059301c8f36d933876233516245d882346a6 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:18 -0800 Subject: tcp, ulp: Add clone operation to tcp_ulp_ops If ULP is used on a listening socket, icsk_ulp_ops and icsk_ulp_data are copied when the listener is cloned. Sometimes the clone is immediately deleted, which will invoke the release op on the clone and likely corrupt the listening socket's icsk_ulp_data. The clone operation is invoked immediately after the clone is copied and gives the ULP type an opportunity to set up the clone socket and its icsk_ulp_data. The MPTCP ULP clone will silently fallback to plain TCP on allocation failure, so 'clone()' does not need to return an error code. v6 -> v7: - move and rename ulp clone helper to make it inline-friendly v5 -> v6: - clarified MPTCP clone usage in commit message Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +++ net/ipv4/inet_connection_sock.c | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index 85f1d7ff6e8b..ac52633e7061 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2154,6 +2154,9 @@ struct tcp_ulp_ops { /* diagnostic */ int (*get_info)(const struct sock *sk, struct sk_buff *skb); size_t (*get_info_size)(const struct sock *sk); + /* clone ulp */ + void (*clone)(const struct request_sock *req, struct sock *newsk, + const gfp_t priority); char name[TCP_ULP_NAME_MAX]; struct module *owner; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 18c0d5bffe12..6a691fd04398 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -770,6 +770,18 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); +static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, + const gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(newsk); + + if (!icsk->icsk_ulp_ops) + return; + + if (icsk->icsk_ulp_ops->clone) + icsk->icsk_ulp_ops->clone(req, newsk, priority); +} + /** * inet_csk_clone_lock - clone an inet socket, and lock its clone * @sk: the socket to clone @@ -810,6 +822,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + inet_clone_ulp(req, newsk, priority); + security_inet_csk_clone(newsk, req); } return newsk; -- cgit From 3ee17bc78e0f3fdeff9890993e8f3a9f5145163b Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:19 -0800 Subject: mptcp: Add MPTCP to skb extensions Add enum value for MPTCP and update config dependencies v5 -> v6: - fixed '__unused' field size Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- MAINTAINERS | 10 ++++++++++ include/linux/skbuff.h | 3 +++ include/net/mptcp.h | 28 ++++++++++++++++++++++++++++ net/core/skbuff.c | 7 +++++++ 4 files changed, 48 insertions(+) create mode 100644 include/net/mptcp.h diff --git a/MAINTAINERS b/MAINTAINERS index 218089f38e11..9dcb9cab5705 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11573,6 +11573,16 @@ F: net/ipv6/calipso.c F: net/netfilter/xt_CONNSECMARK.c F: net/netfilter/xt_SECMARK.c +NETWORKING [MPTCP] +M: Mat Martineau +M: Matthieu Baerts +L: netdev@vger.kernel.org +L: mptcp@lists.01.org +W: https://github.com/multipath-tcp/mptcp_net-next/wiki +B: https://github.com/multipath-tcp/mptcp_net-next/issues +S: Maintained +F: include/net/mptcp.h + NETWORKING [TCP] M: Eric Dumazet L: netdev@vger.kernel.org diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 64e5b1be9ff5..f5c27600b410 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4096,6 +4096,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) TC_SKB_EXT, +#endif +#if IS_ENABLED(CONFIG_MPTCP) + SKB_EXT_MPTCP, #endif SKB_EXT_NUM, /* must be last */ }; diff --git a/include/net/mptcp.h b/include/net/mptcp.h new file mode 100644 index 000000000000..326043c29c0a --- /dev/null +++ b/include/net/mptcp.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Multipath TCP + * + * Copyright (c) 2017 - 2019, Intel Corporation. + */ + +#ifndef __NET_MPTCP_H +#define __NET_MPTCP_H + +#include + +/* MPTCP sk_buff extension data */ +struct mptcp_ext { + u64 data_ack; + u64 data_seq; + u32 subflow_seq; + u16 data_len; + u8 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + __unused:3; + /* one byte hole */ +}; + +#endif /* __NET_MPTCP_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 44b0894d8ae1..a4106da23c34 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -4109,6 +4110,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), #endif +#if IS_ENABLED(CONFIG_MPTCP) + [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -4122,6 +4126,9 @@ static __always_inline unsigned int skb_ext_total_length(void) #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) skb_ext_type_len[TC_SKB_EXT] + +#endif +#if IS_ENABLED(CONFIG_MPTCP) + skb_ext_type_len[SKB_EXT_MPTCP] + #endif 0; } -- cgit From 85712484110df308215077be6ee21c4e57d7dec2 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:20 -0800 Subject: tcp: coalesce/collapse must respect MPTCP extensions Coalesce and collapse of packets carrying MPTCP extensions is allowed when the newer packet has no extension or the extensions carried by both packets are equal. This allows merging of TSO packet trains and even cross-TSO packets, and does not require any additional action when moving data into existing SKBs. v3 -> v4: - allow collapsing, under mptcp_skb_can_collapse() constraint v5 -> v6: - clarify MPTCP skb extensions must always be cleared at allocation time Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/net/tcp.h | 8 ++++++++ net/ipv4/tcp_input.c | 11 +++++++--- net/ipv4/tcp_output.c | 2 +- 4 files changed, 74 insertions(+), 4 deletions(-) diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 326043c29c0a..0573ae75c3db 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -8,6 +8,7 @@ #ifndef __NET_MPTCP_H #define __NET_MPTCP_H +#include #include /* MPTCP sk_buff extension data */ @@ -25,4 +26,60 @@ struct mptcp_ext { /* one byte hole */ }; +#ifdef CONFIG_MPTCP + +/* move the skb extension owership, with the assumption that 'to' is + * newly allocated + */ +static inline void mptcp_skb_ext_move(struct sk_buff *to, + struct sk_buff *from) +{ + if (!skb_ext_exist(from, SKB_EXT_MPTCP)) + return; + + if (WARN_ON_ONCE(to->active_extensions)) + skb_ext_put(to); + + to->active_extensions = from->active_extensions; + to->extensions = from->extensions; + from->active_extensions = 0; +} + +static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, + const struct mptcp_ext *from_ext) +{ + /* MPTCP always clears the ext when adding it to the skb, so + * holes do not bother us here + */ + return !from_ext || + (to_ext && from_ext && + !memcmp(from_ext, to_ext, sizeof(struct mptcp_ext))); +} + +/* check if skbs can be collapsed. + * MPTCP collapse is allowed if neither @to or @from carry an mptcp data + * mapping, or if the extension of @to is the same as @from. + * Collapsing is not possible if @to lacks an extension, but @from carries one. + */ +static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return mptcp_ext_matches(skb_ext_find(to, SKB_EXT_MPTCP), + skb_ext_find(from, SKB_EXT_MPTCP)); +} + +#else + +static inline void mptcp_skb_ext_move(struct sk_buff *to, + const struct sk_buff *from) +{ +} + +static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return true; +} + +#endif /* CONFIG_MPTCP */ #endif /* __NET_MPTCP_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index ac52633e7061..13bc83fab454 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -978,6 +979,13 @@ static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) return likely(!TCP_SKB_CB(skb)->eor); } +static inline bool tcp_skb_can_collapse(const struct sk_buff *to, + const struct sk_buff *from) +{ + return likely(tcp_skb_can_collapse_to(to) && + mptcp_skb_can_collapse(to, from)); +} + /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 062b696a5fa1..2914fdf1d543 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1422,7 +1422,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; - if (!tcp_skb_can_collapse_to(prev)) + if (!tcp_skb_can_collapse(prev, skb)) goto fallback; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && @@ -4423,6 +4423,9 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; + if (!mptcp_skb_can_collapse(to, from)) + return false; + #ifdef CONFIG_TLS_DEVICE if (from->decrypted != to->decrypted) return false; @@ -4932,7 +4935,7 @@ restart: /* The first skb to collapse is: * - not SYN/FIN and * - bloated or contains data before "start" or - * overlaps to the next one. + * overlaps to the next one and mptcp allow collapsing. */ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(sk, skb->truesize) > skb->len || @@ -4941,7 +4944,7 @@ restart: break; } - if (n && n != tail && + if (n && n != tail && mptcp_skb_can_collapse(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; @@ -4974,6 +4977,7 @@ restart: else __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); + mptcp_skb_ext_move(nskb, skb); /* Copy data, releasing collapsed skbs. */ while (copy > 0) { @@ -4993,6 +4997,7 @@ restart: skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || + !mptcp_skb_can_collapse(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; #ifdef CONFIG_TLS_DEVICE diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 58c92a7d671c..3ce7fe1c4076 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2865,7 +2865,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (!tcp_can_collapse(sk, skb)) break; - if (!tcp_skb_can_collapse_to(to)) + if (!tcp_skb_can_collapse(to, skb)) break; space -= skb->len; -- cgit From 35b2c32116091ef87a15c604cac363da8322a288 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:21 -0800 Subject: tcp: Export TCP functions and ops struct MPTCP will make use of tcp_send_mss() and tcp_push() when sending data to specific TCP subflows. tcp_request_sock_ipvX_ops and ipvX_specific will be referenced during TCP subflow creation. Co-developed-by: Peter Krystad Signed-off-by: Peter Krystad Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/tcp.h | 8 ++++++++ net/ipv4/tcp.c | 6 +++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 6 +++--- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 13bc83fab454..5e4133d09b9d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -330,6 +330,9 @@ int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags); ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags); +int tcp_send_mss(struct sock *sk, int *size_goal, int flags); +void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, + int size_goal); void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); @@ -2011,6 +2014,11 @@ struct tcp_request_sock_ops { enum tcp_synack_type synack_type); }; +extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; +#if IS_ENABLED(CONFIG_IPV6) +extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; +#endif + #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f09fbc85b108..6711a97de3ce 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -690,8 +690,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } -static void tcp_push(struct sock *sk, int flags, int mss_now, - int nonagle, int size_goal) +void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -925,7 +925,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, return max(size_goal, mss_now); } -static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4adac9c75343..fedb537839ec 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1426,7 +1426,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v4_md5_lookup, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 95e4e1e95db2..5b5260103b65 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -75,7 +75,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; -static const struct inet_connection_sock_af_ops ipv6_specific; +const struct inet_connection_sock_af_ops ipv6_specific; #ifdef CONFIG_TCP_MD5SIG static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; @@ -819,7 +819,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG @@ -1794,7 +1794,7 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_destructor = tcp_twsk_destructor, }; -static const struct inet_connection_sock_af_ops ipv6_specific = { +const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, -- cgit From 9cfcca2389d7e07647ee69950f46ab5e6dfe03ac Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 9 Jan 2020 07:59:22 -0800 Subject: tcp: Check for filled TCP option space before SACK Update the SACK check to work with zero option space available, a case that's possible with MPTCP but not MD5+TS. Maintained only one conditional branch for insufficient SACK space. v1 -> v2: - Moves the check inside the SACK branch by taking recent SACK fix: 9424e2e7ad93 (tcp: md5: fix potential overestimation of TCP option space) in to account, but modifies it to work in MPTCP scenarios beyond the MD5+TS corner case. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3ce7fe1c4076..05109d0c675b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -754,13 +754,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + + TCPOLEN_SACK_PERBLOCK)) + return size; + opts->num_sack_blocks = min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); - if (likely(opts->num_sack_blocks)) - size += TCPOLEN_SACK_BASE_ALIGNED + - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } return size; -- cgit From e66b2f31a068dd67172008459678821a79e4ea24 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jan 2020 07:59:23 -0800 Subject: tcp: clean ext on tx recycle Otherwise we will find stray/unexpected/old extensions value on next iteration. On tcp_write_xmit() we can end-up splitting an already queued skb in two parts, via tso_fragment(). The newly created skb can be allocated via the tx cache and an upper layer will not be aware of it, so that upper layer cannot set the ext properly. Resetting the ext on recycle ensures that stale data is not propagated in to packet headers or elsewhere. An alternative would be add an additional hook in tso_fragment() or in sk_stream_alloc_skb() to init the ext for upper layers that need it. Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/sock.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/sock.h b/include/net/sock.h index 8766f9bc3e70..432ff73d20f3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1464,6 +1464,7 @@ static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) sk_mem_uncharge(sk, skb->truesize); if (static_branch_unlikely(&tcp_tx_skb_cache_key) && !sk->sk_tx_skb_cache && !skb_cloned(skb)) { + skb_ext_reset(skb); skb_zcopy_clear(skb, true); sk->sk_tx_skb_cache = skb; return; -- cgit From 8b69a803814bb8b14155ea60df83f6d57527e69e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jan 2020 07:59:24 -0800 Subject: skb: add helpers to allocate ext independently from sk_buff Currently we can allocate the extension only after the skb, this change allows the user to do the opposite, will simplify allocation failure handling from MPTCP. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ net/core/skbuff.c | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f5c27600b410..016b3c4ab99a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4120,6 +4120,9 @@ struct skb_ext { char data[0] __aligned(8); }; +struct skb_ext *__skb_ext_alloc(void); +void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, + struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_put(struct skb_ext *ext); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a4106da23c34..48a7029529c9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5987,7 +5987,14 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); } -static struct skb_ext *skb_ext_alloc(void) +/** + * __skb_ext_alloc - allocate a new skb extensions storage + * + * Returns the newly allocated pointer. The pointer can later attached to a + * skb via __skb_ext_set(). + * Note: caller must handle the skb_ext as an opaque data. + */ +struct skb_ext *__skb_ext_alloc(void) { struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); @@ -6027,6 +6034,30 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, return new; } +/** + * __skb_ext_set - attach the specified extension storage to this skb + * @skb: buffer + * @id: extension id + * @ext: extension storage previously allocated via __skb_ext_alloc() + * + * Existing extensions, if any, are cleared. + * + * Returns the pointer to the extension. + */ +void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, + struct skb_ext *ext) +{ + unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); + + skb_ext_put(skb); + newlen = newoff + skb_ext_type_len[id]; + ext->chunks = newlen; + ext->offset[id] = newoff; + skb->extensions = ext; + skb->active_extensions = 1 << id; + return skb_ext_get_ptr(ext, id); +} + /** * skb_ext_add - allocate space for given extension, COW if needed * @skb: buffer @@ -6060,7 +6091,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = skb_ext_alloc(); + new = __skb_ext_alloc(); if (!new) return NULL; } -- cgit