diff options
author | Jakub Kicinski <kuba@kernel.org> | 2023-02-01 20:54:29 -0800 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2023-02-01 20:54:30 -0800 |
commit | 983f507c3043e90b2c6429cd67903c4ca8208b5c (patch) | |
tree | 6a0ceb380c64d02b978f47debe65aef168d76d9b | |
parent | d8673afbf51036ed1d72d9828d0d679035bb0d54 (diff) | |
parent | b1a78b9b98862cda167b643690e43662ea060625 (diff) |
Merge branch 'net-support-ipv4-big-tcp'
Xin Long says:
====================
net: support ipv4 big tcp
This is similar to the BIG TCP patchset added by Eric for IPv6:
https://lwn.net/Articles/895398/
Different from IPv6, IPv4 tot_len is 16-bit long only, and IPv4 header
doesn't have exthdrs(options) for the BIG TCP packets' length. To make
it simple, as David and Paolo suggested, we set IPv4 tot_len to 0 to
indicate this might be a BIG TCP packet and use skb->len as the real
IPv4 total length.
This will work safely, as all BIG TCP packets are GSO/GRO packets and
processed on the same host as they were created; There is no padding
in GSO/GRO packets, and skb->len - network_offset is exactly the IPv4
packet total length; Also, before implementing the feature, all those
places that may get iph tot_len from BIG TCP packets are taken care
with some new APIs:
Patch 1 adds some APIs for iph tot_len setting and getting, which are
used in all these places where IPv4 BIG TCP packets may reach in Patch
2-7, Patch 8 adds a GSO_TCP tp_status for af_packet users, and Patch 9
add new netlink attributes to make IPv4 BIG TCP independent from IPv6
BIG TCP on configuration, and Patch 10 implements this feature.
Note that the similar change as in Patch 2-6 are also needed for IPv6
BIG TCP packets, and will be addressed in another patchset.
The similar performance test is done for IPv4 BIG TCP with 25Gbit NIC
and 1.5K MTU:
No BIG TCP:
for i in {1..10}; do netperf -t TCP_RR -H 192.168.100.1 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done
168 322 337 3776.49
143 236 277 4654.67
128 258 288 4772.83
171 229 278 4645.77
175 228 243 4678.93
149 239 279 4599.86
164 234 268 4606.94
155 276 289 4235.82
180 255 268 4418.95
168 241 249 4417.82
Enable BIG TCP:
ip link set dev ens1f0np0 gro_ipv4_max_size 128000 gso_ipv4_max_size 128000
for i in {1..10}; do netperf -t TCP_RR -H 192.168.100.1 -- -r80000,80000 -O MIN_LATENCY,P90_LATENCY,P99_LATENCY,THROUGHPUT|tail -1; done
161 241 252 4821.73
174 205 217 5098.28
167 208 220 5001.43
164 228 249 4883.98
150 233 249 4914.90
180 233 244 4819.66
154 208 219 5004.92
157 209 247 4999.78
160 218 246 4842.31
174 206 217 5080.99
Thanks for the feedback from Eric and David Ahern.
====================
Link: https://lore.kernel.org/r/cover.1674921359.git.lucien.xin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r-- | drivers/net/ipvlan/ipvlan_core.c | 2 | ||||
-rw-r--r-- | include/linux/ip.h | 21 | ||||
-rw-r--r-- | include/linux/netdevice.h | 6 | ||||
-rw-r--r-- | include/net/netfilter/nf_tables_ipv4.h | 4 | ||||
-rw-r--r-- | include/net/route.h | 3 | ||||
-rw-r--r-- | include/uapi/linux/if_link.h | 3 | ||||
-rw-r--r-- | include/uapi/linux/if_packet.h | 1 | ||||
-rw-r--r-- | net/bridge/br_netfilter_hooks.c | 2 | ||||
-rw-r--r-- | net/bridge/netfilter/nf_conntrack_bridge.c | 4 | ||||
-rw-r--r-- | net/core/dev.c | 4 | ||||
-rw-r--r-- | net/core/dev.h | 18 | ||||
-rw-r--r-- | net/core/gro.c | 12 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 33 | ||||
-rw-r--r-- | net/core/sock.c | 26 | ||||
-rw-r--r-- | net/ipv4/af_inet.c | 7 | ||||
-rw-r--r-- | net/ipv4/cipso_ipv4.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_input.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 2 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_xmit.c | 2 | ||||
-rw-r--r-- | net/netfilter/nf_log_syslog.c | 2 | ||||
-rw-r--r-- | net/netfilter/xt_length.c | 2 | ||||
-rw-r--r-- | net/openvswitch/conntrack.c | 2 | ||||
-rw-r--r-- | net/packet/af_packet.c | 4 | ||||
-rw-r--r-- | net/sched/act_ct.c | 2 | ||||
-rw-r--r-- | net/sched/sch_cake.c | 2 |
25 files changed, 130 insertions, 38 deletions
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index bb1c298c1e78..460b3d4f2245 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -157,7 +157,7 @@ void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type) return NULL; ip4h = ip_hdr(skb); - pktlen = ntohs(ip4h->tot_len); + pktlen = skb_ip_totlen(skb); if (ip4h->ihl < 5 || ip4h->version != 4) return NULL; if (skb->len < pktlen || pktlen < (ip4h->ihl * 4)) diff --git a/include/linux/ip.h b/include/linux/ip.h index 3d9c6750af62..d11c25f5030a 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -35,4 +35,25 @@ static inline unsigned int ip_transport_len(const struct sk_buff *skb) { return ntohs(ip_hdr(skb)->tot_len) - skb_network_header_len(skb); } + +static inline unsigned int iph_totlen(const struct sk_buff *skb, const struct iphdr *iph) +{ + u32 len = ntohs(iph->tot_len); + + return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ? + len : skb->len - skb_network_offset(skb); +} + +static inline unsigned int skb_ip_totlen(const struct sk_buff *skb) +{ + return iph_totlen(skb, ip_hdr(skb)); +} + +/* IPv4 datagram length is stored into 16bit field (tot_len) */ +#define IP_MAX_MTU 0xFFFFU + +static inline void iph_set_totlen(struct iphdr *iph, unsigned int len) +{ + iph->tot_len = len <= IP_MAX_MTU ? htons(len) : 0; +} #endif /* _LINUX_IP_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2466afa25078..d5ef4c1fedd2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1964,6 +1964,8 @@ enum netdev_ml_priv_type { * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO * @tso_max_segs: Device (as in HW) limit on the max TSO segment count + * @gso_ipv4_max_size: Maximum size of generic segmentation offload, + * for IPv4. * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device @@ -2004,6 +2006,8 @@ enum netdev_ml_priv_type { * keep a list of interfaces to be deleted. * @gro_max_size: Maximum size of aggregated packet in generic * receive offload (GRO) + * @gro_ipv4_max_size: Maximum size of aggregated packet in generic + * receive offload (GRO), for IPv4. * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. @@ -2207,6 +2211,7 @@ struct net_device { */ #define GRO_MAX_SIZE (8 * 65535u) unsigned int gro_max_size; + unsigned int gro_ipv4_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; @@ -2330,6 +2335,7 @@ struct net_device { u16 gso_max_segs; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; + unsigned int gso_ipv4_max_size; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index 112708f7a6b4..947973623dc7 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -29,7 +29,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) if (iph->ihl < 5 || iph->version != 4) return -1; - len = ntohs(iph->tot_len); + len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; if (pkt->skb->len < len) return -1; @@ -64,7 +64,7 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; - len = ntohs(iph->tot_len); + len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; if (pkt->skb->len < len) { __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS); diff --git a/include/net/route.h b/include/net/route.h index 6e92dd5bcd61..fe00b0a2e475 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -35,9 +35,6 @@ #include <linux/cache.h> #include <linux/security.h> -/* IPv4 datagram length is stored into 16bit field (tot_len) */ -#define IP_MAX_MTU 0xFFFFU - #define RTO_ONLINK 0x01 #define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE)) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 1021a7e47a86..02b87e4c65be 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -374,6 +374,9 @@ enum { IFLA_DEVLINK_PORT, + IFLA_GSO_IPV4_MAX_SIZE, + IFLA_GRO_IPV4_MAX_SIZE, + __IFLA_MAX }; diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index a8516b3594a4..78c981d6a9d4 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -115,6 +115,7 @@ struct tpacket_auxdata { #define TP_STATUS_BLK_TMO (1 << 5) #define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */ #define TP_STATUS_CSUM_VALID (1 << 7) +#define TP_STATUS_GSO_TCP (1 << 8) /* Tx ring - header status */ #define TP_STATUS_AVAILABLE 0 diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index f20f4373ff40..b67c9c98effa 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -214,7 +214,7 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; - len = ntohs(iph->tot_len); + len = skb_ip_totlen(skb); if (skb->len < len) { __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 5c5dd437f1c2..71056ee84773 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -212,7 +212,7 @@ static int nf_ct_br_ip_check(const struct sk_buff *skb) iph->version != 4) return -1; - len = ntohs(iph->tot_len); + len = skb_ip_totlen(skb); if (skb->len < nhoff + len || len < (iph->ihl * 4)) return -1; @@ -256,7 +256,7 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct iphdr))) return NF_ACCEPT; - len = ntohs(ip_hdr(skb)->tot_len); + len = skb_ip_totlen(skb); if (pskb_trim_rcsum(skb, len)) return NF_ACCEPT; diff --git a/net/core/dev.c b/net/core/dev.c index f72f5c4ee7e2..bb42150a38ec 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3001,6 +3001,8 @@ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); + if (size < READ_ONCE(dev->gso_ipv4_max_size)) + netif_set_gso_ipv4_max_size(dev, size); } EXPORT_SYMBOL(netif_set_tso_max_size); @@ -10614,6 +10616,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; + dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; + dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; diff --git a/net/core/dev.h b/net/core/dev.h index 814ed5b7b960..a065b7571441 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -100,6 +100,8 @@ static inline void netif_set_gso_max_size(struct net_device *dev, { /* dev->gso_max_size is read locklessly from sk_setup_caps() */ WRITE_ONCE(dev->gso_max_size, size); + if (size <= GSO_LEGACY_MAX_SIZE) + WRITE_ONCE(dev->gso_ipv4_max_size, size); } static inline void netif_set_gso_max_segs(struct net_device *dev, @@ -114,6 +116,22 @@ static inline void netif_set_gro_max_size(struct net_device *dev, { /* This pairs with the READ_ONCE() in skb_gro_receive() */ WRITE_ONCE(dev->gro_max_size, size); + if (size <= GRO_LEGACY_MAX_SIZE) + WRITE_ONCE(dev->gro_ipv4_max_size, size); +} + +static inline void netif_set_gso_ipv4_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_ipv4_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_ipv4_max_size, size); +} + +static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_ipv4_max_size, size); } #endif diff --git a/net/core/gro.c b/net/core/gro.c index 506f83d715f8..b15f85546bdd 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) struct sk_buff *lp; int segs; - /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ - gro_max_size = READ_ONCE(p->dev->gro_max_size); + /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ + gro_max_size = p->protocol == htons(ETH_P_IPV6) ? + READ_ONCE(p->dev->gro_max_size) : + READ_ONCE(p->dev->gro_ipv4_max_size); if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { - if (p->protocol != htons(ETH_P_IPV6) || - skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || - ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || + (p->protocol == htons(ETH_P_IPV6) && + skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || p->encapsulation) return -E2BIG; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 64289bc98887..b9f584955b77 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1074,6 +1074,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ @@ -1807,6 +1809,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || + nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) || + nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) || nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || #ifdef CONFIG_RPS @@ -1968,6 +1972,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, [IFLA_ALLMULTI] = { .type = NLA_REJECT }, + [IFLA_GSO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2883,6 +2889,29 @@ static int do_setlink(const struct sk_buff *skb, } } + if (tb[IFLA_GSO_IPV4_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]); + + if (max_size > dev->tso_max_size) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_ipv4_max_size ^ max_size) { + netif_set_gso_ipv4_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GRO_IPV4_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]); + + if (dev->gro_ipv4_max_size ^ gro_max_size) { + netif_set_gro_ipv4_max_size(dev, gro_max_size); + status |= DO_SETLINK_MODIFIED; + } + } + if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); @@ -3325,6 +3354,10 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); if (tb[IFLA_GRO_MAX_SIZE]) netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); + if (tb[IFLA_GSO_IPV4_MAX_SIZE]) + netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE])); + if (tb[IFLA_GRO_IPV4_MAX_SIZE]) + netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE])); return dev; } diff --git a/net/core/sock.c b/net/core/sock.c index 7ba4891460ad..f08b76acde9b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2373,17 +2373,22 @@ void sk_free_unlock_clone(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); -static void sk_trim_gso_size(struct sock *sk) +static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { - if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) - return; + bool is_ipv6 = false; + u32 max_size; + #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6 && - sk_is_tcp(sk) && - !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) - return; + is_ipv6 = (sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif - sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; + /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ + max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : + READ_ONCE(dst->dev->gso_ipv4_max_size); + if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) + max_size = GSO_LEGACY_MAX_SIZE; + + return max_size - (MAX_TCP_HEADER + 1); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst) @@ -2403,10 +2408,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ - sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); - sk_trim_gso_size(sk); - sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); + sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6c0ec2789943..2f992a323b95 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) if (unlikely(ip_fast_csum((u8 *)iph, 5))) goto out; + NAPI_GRO_CB(skb)->proto = proto; id = ntohl(*(__be32 *)&iph->id); flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; @@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int inet_gro_complete(struct sk_buff *skb, int nhoff) { - __be16 newlen = htons(skb->len - nhoff); struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; + __be16 totlen = iph->tot_len; int proto = iph->protocol; int err = -ENOSYS; @@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) skb_set_inner_network_header(skb, nhoff); } - csum_replace2(&iph->check, iph->tot_len, newlen); - iph->tot_len = newlen; + iph_set_totlen(iph, skb->len - nhoff); + csum_replace2(&iph->check, totlen, iph->tot_len); ops = rcu_dereference(inet_offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 6cd3b6c559f0..79ae7204e8ed 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -2222,7 +2222,7 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb, memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len); if (len_delta != 0) { iph->ihl = 5 + (opt_len >> 2); - iph->tot_len = htons(skb->len); + iph_set_totlen(iph, skb->len); } ip_send_check(iph); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e880ce77322a..fe9ead9ee863 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; - len = ntohs(iph->tot_len); + len = iph_totlen(skb, iph); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 922c87ef1ab5..4e4e308c3230 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -100,7 +100,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); - iph->tot_len = htons(skb->len); + iph_set_totlen(iph, skb->len); ip_send_check(iph); /* if egress device is enslaved to an L3 master device pass the diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 029171379884..80448885c3d7 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -994,7 +994,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, old_dsfield = ipv4_get_dsfield(old_iph); *ttl = old_iph->ttl; if (payload_len) - *payload_len = ntohs(old_iph->tot_len); + *payload_len = skb_ip_totlen(skb); } /* Implement full-functionality option for ECN encapsulation */ diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index cb894f0d63e9..c66689ad2b49 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -322,7 +322,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", - ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index 1873da3a945a..b3d623a52885 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -21,7 +21,7 @@ static bool length_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_length_info *info = par->matchinfo; - u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); + u32 pktlen = skb_ip_totlen(skb); return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c8b137649ca4..2172930b1f17 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1103,7 +1103,7 @@ static int ovs_skb_network_trim(struct sk_buff *skb) switch (skb->protocol) { case htons(ETH_P_IP): - len = ntohs(ip_hdr(skb)->tot_len); + len = skb_ip_totlen(skb); break; case htons(ETH_P_IPV6): len = sizeof(struct ipv6hdr) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b5ab98ca2511..8ffb19c643ab 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2296,6 +2296,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; + if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) + status |= TP_STATUS_GSO_TCP; if (snaplen > res) snaplen = res; @@ -3522,6 +3524,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; + if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) + aux.tp_status |= TP_STATUS_GSO_TCP; aux.tp_len = origlen; aux.tp_snaplen = skb->len; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 0ca2bb8ed026..d68bb5dbf0dc 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -707,7 +707,7 @@ static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family) switch (family) { case NFPROTO_IPV4: - len = ntohs(ip_hdr(skb)->tot_len); + len = skb_ip_totlen(skb); break; case NFPROTO_IPV6: len = sizeof(struct ipv6hdr) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 3ed0c3342189..7970217b565a 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1209,7 +1209,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, iph_check->daddr != iph->daddr) continue; - seglen = ntohs(iph_check->tot_len) - + seglen = iph_totlen(skb, iph_check) - (4 * iph_check->ihl); } else if (iph_check->version == 6) { ipv6h = (struct ipv6hdr *)iph; |