diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 13 | ||||
-rw-r--r-- | net/ipv4/fib_rules.c | 4 | ||||
-rw-r--r-- | net/ipv4/fou.c | 2 | ||||
-rw-r--r-- | net/ipv4/geneve.c | 3 | ||||
-rw-r--r-- | net/ipv4/gre_offload.c | 4 | ||||
-rw-r--r-- | net/ipv4/igmp.c | 11 | ||||
-rw-r--r-- | net/ipv4/inet_fragment.c | 4 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_sockglue.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_vti.c | 1 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 6 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 53 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 10 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_reject_ipv4.c | 91 | ||||
-rw-r--r-- | net/ipv4/netfilter/nft_masq_ipv4.c | 2 | ||||
-rw-r--r-- | net/ipv4/ping.c | 14 | ||||
-rw-r--r-- | net/ipv4/route.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 61 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 64 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 9 | ||||
-rw-r--r-- | net/ipv4/tcp_memcontrol.c | 87 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 2 | ||||
-rw-r--r-- | net/ipv4/udp_offload.c | 2 |
23 files changed, 247 insertions, 201 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 92db7a69f2b9..e67da4e6c324 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1246,7 +1246,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, encap = SKB_GSO_CB(skb)->encap_level > 0; if (encap) - features = skb->dev->hw_enc_features & netif_skb_features(skb); + features &= skb->dev->hw_enc_features; SKB_GSO_CB(skb)->encap_level += ihl; skb_reset_transport_header(skb); @@ -1386,6 +1386,17 @@ out: return pp; } +int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) +{ + if (sk->sk_family == AF_INET) + return ip_recv_error(sk, msg, len, addr_len); +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); +#endif + return -EINVAL; +} + static int inet_gro_complete(struct sk_buff *skb, int nhoff) { __be16 newlen = htons(skb->len - nhoff); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f2e15738534d..8f7bd56955b0 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -62,6 +62,10 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) else res->tclassid = 0; #endif + + if (err == -ESRCH) + err = -ENETUNREACH; + return err; } EXPORT_SYMBOL_GPL(__fib_lookup); diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 32e78924e246..606c520ffd5a 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -133,6 +133,8 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff) int err = -ENOSYS; const struct net_offload **offloads; + udp_tunnel_gro_complete(skb, nhoff); + rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 065cd94c640c..dedb21e99914 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -144,6 +144,8 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst, tos, ttl, df, src_port, dst_port, xnet); } @@ -364,6 +366,7 @@ late_initcall(geneve_init_module); static void __exit geneve_cleanup_module(void) { destroy_workqueue(geneve_wq); + unregister_pernet_subsys(&geneve_net_ops); } module_exit(geneve_cleanup_module); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index ccda09628de7..bb5947b0ce2d 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -47,7 +47,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); - ghl = skb_inner_network_header(skb) - skb_transport_header(skb); + ghl = skb_inner_mac_header(skb) - skb_transport_header(skb); if (unlikely(ghl < sizeof(*greh))) goto out; @@ -68,7 +68,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, skb->mac_len = skb_inner_network_offset(skb); /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + enc_features = skb->dev->hw_enc_features & features; segs = skb_mac_gso_segment(skb, enc_features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index fb70e3ecc3e4..bb15d0e03d4f 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -318,9 +318,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) return scount; } -#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb)) - -static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) +static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; struct rtable *rt; @@ -330,6 +328,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + unsigned int size = mtu; while (1) { skb = alloc_skb(size + hlen + tlen, @@ -341,7 +340,6 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) return NULL; } skb->priority = TC_PRIO_CONTROL; - igmp_skb_size(skb) = size; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, @@ -354,6 +352,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) skb_dst_set(skb, &rt->dst); skb->dev = dev; + skb->reserved_tailroom = skb_end_offset(skb) - + min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); skb_reset_network_header(skb); @@ -423,8 +423,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, return skb; } -#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \ - skb_tailroom(skb)) : 0) +#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 9eb89f3f0ee4..19419b60cb37 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -146,7 +146,6 @@ evict_again: atomic_inc(&fq->refcnt); spin_unlock(&hb->chain_lock); del_timer_sync(&fq->timer); - WARN_ON(atomic_read(&fq->refcnt) != 1); inet_frag_put(fq, f); goto evict_again; } @@ -285,7 +284,8 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) struct inet_frag_bucket *hb; hb = get_frag_bucket_locked(fq, f); - hlist_del(&fq->list); + if (!(fq->flags & INET_FRAG_EVICTED)) + hlist_del(&fq->list); spin_unlock(&hb->chain_lock); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 88e5ef2c7f51..bc6471d4abcd 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -231,7 +231,7 @@ static int ip_finish_output_gso(struct sk_buff *skb) */ features = netif_skb_features(skb); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); - if (IS_ERR(segs)) { + if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); return -ENOMEM; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c373a9ad4555..9daf2177dc00 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -195,7 +195,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; -#if defined(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IPV6) if (allow_ipv6 && cmsg->cmsg_level == SOL_IPV6 && cmsg->cmsg_type == IPV6_PKTINFO) { diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 3e861011e4a3..1a7e979e80ba 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -528,6 +528,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = { .validate = vti_tunnel_validate, .newlink = vti_newlink, .changelink = vti_changelink, + .dellink = ip_tunnel_dellink, .get_size = vti_get_size, .fill_info = vti_fill_info, }; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index a054fe083431..5c61328b7704 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -56,11 +56,11 @@ static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -static int ipv4_print_tuple(struct seq_file *s, +static void ipv4_print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple) { - return seq_printf(s, "src=%pI4 dst=%pI4 ", - &tuple->src.u3.ip, &tuple->dst.u3.ip); + seq_printf(s, "src=%pI4 dst=%pI4 ", + &tuple->src.u3.ip, &tuple->dst.u3.ip); } static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 4c48e434bb1f..a460a87e14f8 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -94,7 +94,7 @@ static void ct_seq_stop(struct seq_file *s, void *v) } #ifdef CONFIG_NF_CONNTRACK_SECMARK -static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { int ret; u32 len; @@ -102,17 +102,15 @@ static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) ret = security_secid_to_secctx(ct->secmark, &secctx, &len); if (ret) - return 0; + return; - ret = seq_printf(s, "secctx=%s ", secctx); + seq_printf(s, "secctx=%s ", secctx); security_release_secctx(secctx, len); - return ret; } #else -static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { - return 0; } #endif @@ -141,47 +139,52 @@ static int ct_seq_show(struct seq_file *s, void *v) NF_CT_ASSERT(l4proto); ret = -ENOSPC; - if (seq_printf(s, "%-8s %u %ld ", - l4proto->name, nf_ct_protonum(ct), - timer_pending(&ct->timeout) - ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0) - goto release; + seq_printf(s, "%-8s %u %ld ", + l4proto->name, nf_ct_protonum(ct), + timer_pending(&ct->timeout) + ? (long)(ct->timeout.expires - jiffies)/HZ : 0); + + if (l4proto->print_conntrack) + l4proto->print_conntrack(s, ct); - if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct)) + if (seq_has_overflowed(s)) goto release; - if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - l3proto, l4proto)) + print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + l3proto, l4proto); + + if (seq_has_overflowed(s)) goto release; if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) goto release; if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) - if (seq_printf(s, "[UNREPLIED] ")) - goto release; + seq_printf(s, "[UNREPLIED] "); - if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, - l3proto, l4proto)) + print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + l3proto, l4proto); + + if (seq_has_overflowed(s)) goto release; if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; if (test_bit(IPS_ASSURED_BIT, &ct->status)) - if (seq_printf(s, "[ASSURED] ")) - goto release; + seq_printf(s, "[ASSURED] "); #ifdef CONFIG_NF_CONNTRACK_MARK - if (seq_printf(s, "mark=%u ", ct->mark)) - goto release; + seq_printf(s, "mark=%u ", ct->mark); #endif - if (ct_show_secctx(s, ct)) - goto release; + ct_show_secctx(s, ct); - if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) + seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); + + if (seq_has_overflowed(s)) goto release; + ret = 0; release: nf_ct_put(ct); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index b91b2641adda..80d5554b9a88 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -72,13 +72,13 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, } /* Print out the per-protocol part of the tuple. */ -static int icmp_print_tuple(struct seq_file *s, +static void icmp_print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple) { - return seq_printf(s, "type=%u code=%u id=%u ", - tuple->dst.u.icmp.type, - tuple->dst.u.icmp.code, - ntohs(tuple->src.u.icmp.id)); + seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); } static unsigned int *icmp_get_timeouts(struct net *net) diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index b023b4eb1a96..1baaa83dfe5c 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -6,48 +6,45 @@ * published by the Free Software Foundation. */ +#include <linux/module.h> #include <net/ip.h> #include <net/tcp.h> #include <net/route.h> #include <net/dst.h> #include <linux/netfilter_ipv4.h> +#include <net/netfilter/ipv4/nf_reject.h> -/* Send RST reply */ -void nf_send_reset(struct sk_buff *oldskb, int hook) +const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook) { - struct sk_buff *nskb; - const struct iphdr *oiph; - struct iphdr *niph; const struct tcphdr *oth; - struct tcphdr _otcph, *tcph; /* IP header checks: fragment. */ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) - return; + return NULL; oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), - sizeof(_otcph), &_otcph); + sizeof(struct tcphdr), _oth); if (oth == NULL) - return; + return NULL; /* No RST for RST. */ if (oth->rst) - return; - - if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return; + return NULL; /* Check checksum */ if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) - return; - oiph = ip_hdr(oldskb); + return NULL; - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); - if (!nskb) - return; + return oth; +} +EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get); - skb_reserve(nskb, LL_MAX_HEADER); +struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __be16 protocol, int ttl) +{ + struct iphdr *niph, *oiph = ip_hdr(oldskb); skb_reset_network_header(nskb); niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); @@ -56,10 +53,23 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) niph->tos = 0; niph->id = 0; niph->frag_off = htons(IP_DF); - niph->protocol = IPPROTO_TCP; + niph->protocol = protocol; niph->check = 0; niph->saddr = oiph->daddr; niph->daddr = oiph->saddr; + niph->ttl = ttl; + + nskb->protocol = htons(ETH_P_IP); + + return niph; +} +EXPORT_SYMBOL_GPL(nf_reject_iphdr_put); + +void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth) +{ + struct iphdr *niph = ip_hdr(nskb); + struct tcphdr *tcph; skb_reset_transport_header(nskb); tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); @@ -68,9 +78,9 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) tcph->dest = oth->source; tcph->doff = sizeof(struct tcphdr) / 4; - if (oth->ack) + if (oth->ack) { tcph->seq = oth->ack_seq; - else { + } else { tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + oldskb->len - ip_hdrlen(oldskb) - (oth->doff << 2)); @@ -83,16 +93,43 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)tcph - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); +} +EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); + +/* Send RST reply */ +void nf_send_reset(struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + const struct iphdr *oiph; + struct iphdr *niph; + const struct tcphdr *oth; + struct tcphdr _oth; + + oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook); + if (!oth) + return; + + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + return; + + oiph = ip_hdr(oldskb); + + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + return; /* ip_route_me_harder expects skb->dst to be set */ skb_dst_set_noref(nskb, skb_dst(oldskb)); - nskb->protocol = htons(ETH_P_IP); + skb_reserve(nskb, LL_MAX_HEADER); + niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, + ip4_dst_hoplimit(skb_dst(nskb))); + nf_reject_ip_tcphdr_put(nskb, oldskb, oth); + if (ip_route_me_harder(nskb, RTN_UNSPEC)) goto free_nskb; - niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); - /* "Never happens" */ if (nskb->len > dst_mtu(skb_dst(nskb))) goto free_nskb; @@ -125,3 +162,5 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) kfree_skb(nskb); } EXPORT_SYMBOL_GPL(nf_send_reset); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index 1c636d6b5b50..665de06561cd 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -24,6 +24,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, struct nf_nat_range range; unsigned int verdict; + memset(&range, 0, sizeof(range)); range.flags = priv->flags; verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, @@ -39,6 +40,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = { .eval = nft_masq_ipv4_eval, .init = nft_masq_init, .dump = nft_masq_dump, + .validate = nft_masq_validate, }; static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 57f7c9804139..5d740cccf69e 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -217,6 +217,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) &ipv6_hdr(skb)->daddr)) continue; #endif + } else { + continue; } if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) @@ -853,16 +855,8 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (flags & MSG_OOB) goto out; - if (flags & MSG_ERRQUEUE) { - if (family == AF_INET) { - return ip_recv_error(sk, msg, len, addr_len); -#if IS_ENABLED(CONFIG_IPV6) - } else if (family == AF_INET6) { - return pingv6_ops.ipv6_recv_error(sk, msg, len, - addr_len); -#endif - } - } + if (flags & MSG_ERRQUEUE) + return inet_recv_error(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2d4ae469b471..6a2155b02602 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1798,6 +1798,7 @@ local_input: no_route: RT_CACHE_STAT_INC(in_no_route); res.type = RTN_UNREACHABLE; + res.fi = NULL; goto local_input; /* diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1bec4e76d88c..38c2bcb8dd5d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1598,7 +1598,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, u32 urg_hole = 0; if (unlikely(flags & MSG_ERRQUEUE)) - return ip_recv_error(sk, msg, len, addr_len); + return inet_recv_error(sk, msg, len, addr_len); if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && (sk->sk_state == TCP_ESTABLISHED)) @@ -2868,61 +2868,42 @@ EXPORT_SYMBOL(compat_tcp_getsockopt); #endif #ifdef CONFIG_TCP_MD5SIG -static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly; +static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool); static DEFINE_MUTEX(tcp_md5sig_mutex); - -static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu); - - if (p->md5_desc.tfm) - crypto_free_hash(p->md5_desc.tfm); - } - free_percpu(pool); -} +static bool tcp_md5sig_pool_populated = false; static void __tcp_alloc_md5sig_pool(void) { int cpu; - struct tcp_md5sig_pool __percpu *pool; - - pool = alloc_percpu(struct tcp_md5sig_pool); - if (!pool) - return; for_each_possible_cpu(cpu) { - struct crypto_hash *hash; - - hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR_OR_NULL(hash)) - goto out_free; + if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) { + struct crypto_hash *hash; - per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; + hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR_OR_NULL(hash)) + return; + per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash; + } } - /* before setting tcp_md5sig_pool, we must commit all writes - * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool() + /* before setting tcp_md5sig_pool_populated, we must commit all writes + * to memory. See smp_rmb() in tcp_get_md5sig_pool() */ smp_wmb(); - tcp_md5sig_pool = pool; - return; -out_free: - __tcp_free_md5sig_pool(pool); + tcp_md5sig_pool_populated = true; } bool tcp_alloc_md5sig_pool(void) { - if (unlikely(!tcp_md5sig_pool)) { + if (unlikely(!tcp_md5sig_pool_populated)) { mutex_lock(&tcp_md5sig_mutex); - if (!tcp_md5sig_pool) + if (!tcp_md5sig_pool_populated) __tcp_alloc_md5sig_pool(); mutex_unlock(&tcp_md5sig_mutex); } - return tcp_md5sig_pool != NULL; + return tcp_md5sig_pool_populated; } EXPORT_SYMBOL(tcp_alloc_md5sig_pool); @@ -2936,13 +2917,13 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool); */ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) { - struct tcp_md5sig_pool __percpu *p; - local_bh_disable(); - p = ACCESS_ONCE(tcp_md5sig_pool); - if (p) - return raw_cpu_ptr(p); + if (tcp_md5sig_pool_populated) { + /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */ + smp_rmb(); + return this_cpu_ptr(&tcp_md5sig_pool); + } local_bh_enable(); return NULL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a12b455928e5..d107ee246a1d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2315,6 +2315,35 @@ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) /* Undo procedures. */ +/* We can clear retrans_stamp when there are no retransmissions in the + * window. It would seem that it is trivially available for us in + * tp->retrans_out, however, that kind of assumptions doesn't consider + * what will happen if errors occur when sending retransmission for the + * second time. ...It could the that such segment has only + * TCPCB_EVER_RETRANS set at the present time. It seems that checking + * the head skb is enough except for some reneging corner cases that + * are not worth the effort. + * + * Main reason for all this complexity is the fact that connection dying + * time now depends on the validity of the retrans_stamp, in particular, + * that successive retransmissions of a segment must not advance + * retrans_stamp under any conditions. + */ +static bool tcp_any_retrans_done(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (tp->retrans_out) + return true; + + skb = tcp_write_queue_head(sk); + if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) + return true; + + return false; +} + #if FASTRETRANS_DEBUG > 1 static void DBGUNDO(struct sock *sk, const char *msg) { @@ -2410,6 +2439,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ tcp_moderate_cwnd(tp); + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; return true; } tcp_set_ca_state(sk, TCP_CA_Open); @@ -2430,35 +2461,6 @@ static bool tcp_try_undo_dsack(struct sock *sk) return false; } -/* We can clear retrans_stamp when there are no retransmissions in the - * window. It would seem that it is trivially available for us in - * tp->retrans_out, however, that kind of assumptions doesn't consider - * what will happen if errors occur when sending retransmission for the - * second time. ...It could the that such segment has only - * TCPCB_EVER_RETRANS set at the present time. It seems that checking - * the head skb is enough except for some reneging corner cases that - * are not worth the effort. - * - * Main reason for all this complexity is the fact that connection dying - * time now depends on the validity of the retrans_stamp, in particular, - * that successive retransmissions of a segment must not advance - * retrans_stamp under any conditions. - */ -static bool tcp_any_retrans_done(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if (tp->retrans_out) - return true; - - skb = tcp_write_queue_head(sk); - if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) - return true; - - return false; -} - /* Undo during loss recovery after partial ACK or using F-RTO. */ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { @@ -5229,7 +5231,7 @@ slow_path: if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) goto csum_error; - if (!th->ack && !th->rst) + if (!th->ack && !th->rst && !th->syn) goto discard; /* @@ -5648,7 +5650,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } - if (!th->ack && !th->rst) + if (!th->ack && !th->rst && !th->syn) goto discard; if (!tcp_validate_incoming(sk, skb, th, 0)) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 94d1a7757ff7..147be2024290 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -206,8 +206,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; - inet_set_txhash(sk); - inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; @@ -224,6 +222,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (err) goto failure; + inet_set_txhash(sk); + rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) { @@ -598,7 +598,10 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (th->rst) return; - if (skb_rtable(skb)->rt_type != RTN_LOCAL) + /* If sk not NULL, it means we did a successful lookup and incoming + * route had to be correct. prequeue might have dropped our dst. + */ + if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) return; /* Swap the send and the receive. */ diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 1d191357bf88..272327134a1b 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -9,13 +9,13 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { /* - * The root cgroup does not use res_counters, but rather, + * The root cgroup does not use page_counters, but rather, * rely on the data already collected by the network * subsystem */ - struct res_counter *res_parent = NULL; - struct cg_proto *cg_proto, *parent_cg; struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct page_counter *counter_parent = NULL; + struct cg_proto *cg_proto, *parent_cg; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) @@ -29,9 +29,9 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) parent_cg = tcp_prot.proto_cgroup(parent); if (parent_cg) - res_parent = &parent_cg->memory_allocated; + counter_parent = &parent_cg->memory_allocated; - res_counter_init(&cg_proto->memory_allocated, res_parent); + page_counter_init(&cg_proto->memory_allocated, counter_parent); percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); return 0; @@ -50,7 +50,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(tcp_destroy_cgroup); -static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) +static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) { struct cg_proto *cg_proto; int i; @@ -60,20 +60,17 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) if (!cg_proto) return -EINVAL; - if (val > RES_COUNTER_MAX) - val = RES_COUNTER_MAX; - - ret = res_counter_set_limit(&cg_proto->memory_allocated, val); + ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages); if (ret) return ret; for (i = 0; i < 3; i++) - cg_proto->sysctl_mem[i] = min_t(long, val >> PAGE_SHIFT, + cg_proto->sysctl_mem[i] = min_t(long, nr_pages, sysctl_tcp_mem[i]); - if (val == RES_COUNTER_MAX) + if (nr_pages == PAGE_COUNTER_MAX) clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - else if (val != RES_COUNTER_MAX) { + else { /* * The active bit needs to be written after the static_key * update. This is what guarantees that the socket activation @@ -102,11 +99,20 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) return 0; } +enum { + RES_USAGE, + RES_LIMIT, + RES_MAX_USAGE, + RES_FAILCNT, +}; + +static DEFINE_MUTEX(tcp_limit_mutex); + static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long long val; + unsigned long nr_pages; int ret = 0; buf = strstrip(buf); @@ -114,10 +120,12 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, switch (of_cft(of)->private) { case RES_LIMIT: /* see memcontrol.c */ - ret = res_counter_memparse_write_strategy(buf, &val); + ret = page_counter_memparse(buf, &nr_pages); if (ret) break; - ret = tcp_update_limit(memcg, val); + mutex_lock(&tcp_limit_mutex); + ret = tcp_update_limit(memcg, nr_pages); + mutex_unlock(&tcp_limit_mutex); break; default: ret = -EINVAL; @@ -126,43 +134,36 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, return ret ?: nbytes; } -static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) -{ - struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return default_val; - - return res_counter_read_u64(&cg_proto->memory_allocated, type); -} - -static u64 tcp_read_usage(struct mem_cgroup *memcg) -{ - struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) - return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT; - - return res_counter_read_u64(&cg_proto->memory_allocated, RES_USAGE); -} - static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg); u64 val; switch (cft->private) { case RES_LIMIT: - val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); + if (!cg_proto) + return PAGE_COUNTER_MAX; + val = cg_proto->memory_allocated.limit; + val *= PAGE_SIZE; break; case RES_USAGE: - val = tcp_read_usage(memcg); + if (!cg_proto) + val = atomic_long_read(&tcp_memory_allocated); + else + val = page_counter_read(&cg_proto->memory_allocated); + val *= PAGE_SIZE; break; case RES_FAILCNT: + if (!cg_proto) + return 0; + val = cg_proto->memory_allocated.failcnt; + break; case RES_MAX_USAGE: - val = tcp_read_stat(memcg, cft->private, 0); + if (!cg_proto) + return 0; + val = cg_proto->memory_allocated.watermark; + val *= PAGE_SIZE; break; default: BUG(); @@ -183,10 +184,10 @@ static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, switch (of_cft(of)->private) { case RES_MAX_USAGE: - res_counter_reset_max(&cg_proto->memory_allocated); + page_counter_reset_watermark(&cg_proto->memory_allocated); break; case RES_FAILCNT: - res_counter_reset_failcnt(&cg_proto->memory_allocated); + cg_proto->memory_allocated.failcnt = 0; break; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3af21296d967..a3d453b94747 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2126,7 +2126,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { - if (unlikely(skb_fclone_busy(skb))) { + if (unlikely(skb_fclone_busy(sk, skb))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 507310ef4b56..6480cea7aa53 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -58,7 +58,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb->encap_hdr_csum = 1; /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + enc_features = skb->dev->hw_enc_features & features; segs = gso_inner_segment(skb, enc_features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, |