From ab2d7251d666995740da17b2a51ca545ac5dd037 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Oct 2014 11:25:20 +0200 Subject: netfilter: missing module license in the nf_reject_ipvX modules [ 23.545204] nf_reject_ipv4: module license 'unspecified' taints kernel. Fixes: c8d7b98 ("netfilter: move nf_send_resetX() code to nf_reject_ipvX modules") Reported-by: Dave Young Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_reject_ipv4.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index b023b4eb1a96..92b303dbd5fc 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -6,6 +6,7 @@ * published by the Free Software Foundation. */ +#include #include #include #include @@ -125,3 +126,5 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) kfree_skb(nskb); } EXPORT_SYMBOL_GPL(nf_send_reset); + +MODULE_LICENSE("GPL"); -- cgit v1.2.3-73-gaa49b From 7210e4e38f945dfa173c4a4e59ad827c9ecad541 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Oct 2014 19:50:22 +0200 Subject: netfilter: nf_tables: restrict nat/masq expressions to nat chain type This adds the missing validation code to avoid the use of nat/masq from non-nat chains. The validation assumes two possible configuration scenarios: 1) Use of nat from base chain that is not of nat type. Reject this configuration from the nft_*_init() path of the expression. 2) Use of nat from non-base chain. In this case, we have to wait until the non-base chain is referenced by at least one base chain via jump/goto. This is resolved from the nft_*_validate() path which is called from nf_tables_check_loops(). The user gets an -EOPNOTSUPP in both cases. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +++ include/net/netfilter/nft_masq.h | 3 +++ net/ipv4/netfilter/nft_masq_ipv4.c | 1 + net/ipv6/netfilter/nft_masq_ipv6.c | 1 + net/netfilter/nf_tables_api.c | 14 ++++++++++++++ net/netfilter/nft_masq.c | 12 ++++++++++++ net/netfilter/nft_nat.c | 12 ++++++++++++ 7 files changed, 46 insertions(+) (limited to 'net/ipv4') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3d7292392fac..845c596bf594 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -530,6 +530,9 @@ enum nft_chain_type { NFT_CHAIN_T_MAX }; +int nft_chain_validate_dependency(const struct nft_chain *chain, + enum nft_chain_type type); + struct nft_stats { u64 bytes; u64 pkts; diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h index c72729f954f4..e2a518b60e19 100644 --- a/include/net/netfilter/nft_masq.h +++ b/include/net/netfilter/nft_masq.h @@ -13,4 +13,7 @@ int nft_masq_init(const struct nft_ctx *ctx, int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_masq_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data); + #endif /* _NFT_MASQ_H_ */ diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index 1c636d6b5b50..c1023c445920 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -39,6 +39,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = { .eval = nft_masq_ipv4_eval, .init = nft_masq_init, .dump = nft_masq_dump, + .validate = nft_masq_validate, }; static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 556262f40761..8a7ac685076d 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -39,6 +39,7 @@ static const struct nft_expr_ops nft_masq_ipv6_ops = 
{ .eval = nft_masq_ipv6_eval, .init = nft_masq_init, .dump = nft_masq_dump, + .validate = nft_masq_validate, }; static struct nft_expr_type nft_masq_ipv6_type __read_mostly = { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 556a0dfa4abc..65eb2a1160d5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3744,6 +3744,20 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .abort = nf_tables_abort, }; +int nft_chain_validate_dependency(const struct nft_chain *chain, + enum nft_chain_type type) +{ + const struct nft_base_chain *basechain; + + if (chain->flags & NFT_BASE_CHAIN) { + basechain = nft_base_chain(chain); + if (basechain->type->type != type) + return -EOPNOTSUPP; + } + return 0; +} +EXPORT_SYMBOL_GPL(nft_chain_validate_dependency); + /* * Loop detection - walk through the ruleset beginning at the destination chain * of a new jump until either the source chain is reached (loop) or all diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 6637bab00567..d1ffd5eb3a9b 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -26,6 +26,11 @@ int nft_masq_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_masq *priv = nft_expr_priv(expr); + int err; + + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + if (err < 0) + return err; if (tb[NFTA_MASQ_FLAGS] == NULL) return 0; @@ -55,5 +60,12 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_masq_dump); +int nft_masq_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); +} +EXPORT_SYMBOL_GPL(nft_masq_validate); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez "); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 799550b476fb..0f0af6e86fb8 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -95,6 +95,10 @@ static 
int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, u32 family; int err; + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + if (err < 0) + return err; + if (tb[NFTA_NAT_TYPE] == NULL) return -EINVAL; @@ -205,6 +209,13 @@ nla_put_failure: return -1; } +static int nft_nat_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); +} + static struct nft_expr_type nft_nat_type; static const struct nft_expr_ops nft_nat_ops = { .type = &nft_nat_type, @@ -212,6 +223,7 @@ static const struct nft_expr_ops nft_nat_ops = { .eval = nft_nat_eval, .init = nft_nat_init, .dump = nft_nat_dump, + .validate = nft_nat_validate, }; static struct nft_expr_type nft_nat_type __read_mostly = { -- cgit v1.2.3-73-gaa49b From 1e16aa3ddf863c6b9f37eddf52503230a62dedb3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 20 Oct 2014 13:49:16 +0200 Subject: net: gso: use feature flag argument in all protocol gso handlers skb_gso_segment() has a 'features' argument representing offload features available to the output path. A few handlers, e.g. GRE, instead re-fetch the features of skb->dev and use those instead of the provided ones when handling encapsulation/tunnels. Depending on dev->hw_enc_features of the output device skb_gso_segment() can then return NULL even when the caller has disabled all GSO feature bits, as segmentation of inner header thinks device will take care of segmentation. This e.g. affects the tbf scheduler, which will silently drop GRE-encap GSO skbs that did not fit the remaining token quota as the segmentation does not work when device supports corresponding hw offload capabilities. Cc: Pravin B Shelar Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/gre_offload.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/ip6_offload.c | 2 +- net/mpls/mpls_gso.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 92db7a69f2b9..8b7fe5b03906 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1246,7 +1246,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, encap = SKB_GSO_CB(skb)->encap_level > 0; if (encap) - features = skb->dev->hw_enc_features & netif_skb_features(skb); + features &= skb->dev->hw_enc_features; SKB_GSO_CB(skb)->encap_level += ihl; skb_reset_transport_header(skb); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index ccda09628de7..f6e345c0bc23 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -68,7 +68,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, skb->mac_len = skb_inner_network_offset(skb); /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + enc_features = skb->dev->hw_enc_features & features; segs = skb_mac_gso_segment(skb, enc_features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 507310ef4b56..6480cea7aa53 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -58,7 +58,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb->encap_hdr_csum = 1; /* segment inner packet. 
*/ - enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + enc_features = skb->dev->hw_enc_features & features; segs = gso_inner_segment(skb, enc_features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 91014d32488d..a071563a7e6e 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -90,7 +90,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, encap = SKB_GSO_CB(skb)->encap_level > 0; if (encap) - features = skb->dev->hw_enc_features & netif_skb_features(skb); + features &= skb->dev->hw_enc_features; SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h); ipv6h = ipv6_hdr(skb); diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index e28ed2ef5b06..f0f5309a2d72 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -48,7 +48,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, __skb_push(skb, skb->mac_len); /* Segment inner packet. */ - mpls_features = skb->dev->mpls_features & netif_skb_features(skb); + mpls_features = skb->dev->mpls_features & features; segs = skb_mac_gso_segment(skb, mpls_features); -- cgit v1.2.3-73-gaa49b From 330966e501ffe282d7184fde4518d5e0c24bc7f8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 20 Oct 2014 13:49:17 +0200 Subject: net: make skb_gso_segment error handling more robust skb_gso_segment has three possible return values: 1. a pointer to the first segmented skb 2. an errno value (IS_ERR()) 3. NULL. This can happen when GSO is used for header verification. However, several callers currently test IS_ERR instead of IS_ERR_OR_NULL and would oops when NULL is returned. Note that these call sites should never actually see such a NULL return value; all callers mask out the GSO bits in the feature argument. However, there have been issues with some protocol handlers erroneously not respecting the specified feature mask in some cases. 
It is preferable to get 'have to turn off hw offloading, else slow' reports rather than 'kernel crashes'. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- net/netfilter/nfnetlink_queue_core.c | 2 +- net/openvswitch/datapath.c | 2 ++ net/xfrm/xfrm_output.c | 2 ++ 4 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 88e5ef2c7f51..bc6471d4abcd 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -231,7 +231,7 @@ static int ip_finish_output_gso(struct sk_buff *skb) */ features = netif_skb_features(skb); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); - if (IS_ERR(segs)) { + if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); return -ENOMEM; } diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index a82077d9f59b..7c60ccd61a3e 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -665,7 +665,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) * returned by nf_queue. For instance, callers rely on -ECANCELED to * mean 'ignore this hook'. */ - if (IS_ERR(segs)) + if (IS_ERR_OR_NULL(segs)) goto out_err; queued = 0; err = 0; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 2e31d9e7f4dc..e6d7255183eb 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -324,6 +324,8 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); + if (segs == NULL) + return -EINVAL; /* Queue all of the segments. 
*/ skb = segs; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 499d6c18a8ce..7c532856b398 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -157,6 +157,8 @@ static int xfrm_output_gso(struct sk_buff *skb) kfree_skb(skb); if (IS_ERR(segs)) return PTR_ERR(segs); + if (segs == NULL) + return -EINVAL; do { struct sk_buff *nskb = segs->next; -- cgit v1.2.3-73-gaa49b From 9e7ceb060754f134231f68cb29d5db31419fe1ed Mon Sep 17 00:00:00 2001 From: Sathya Perla Date: Wed, 22 Oct 2014 21:42:01 +0530 Subject: net: fix saving TX flow hash in sock for outgoing connections The commit "net: Save TX flow hash in sock and set in skbuf on xmit" introduced the inet_set_txhash() and ip6_set_txhash() routines to calculate and record flow hash(sk_txhash) in the socket structure. sk_txhash is used to set skb->hash which is used to spread flows across multiple TXQs. But, the above routines are invoked before the source port of the connection is created. Because of this all outgoing connections that just differ in the source port get hashed into the same TXQ. This patch fixes this problem for IPv4/6 by invoking the above routines after the source port is available for the socket. Fixes: b73c3d0e4("net: Save TX flow hash in sock and set in skbuf on xmit") Signed-off-by: Sathya Perla Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 94d1a7757ff7..9c7d7621466b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -206,8 +206,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; - inet_set_txhash(sk); - inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; @@ -224,6 +222,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (err) goto failure; + inet_set_txhash(sk); + rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 831495529b82..ace29b60813c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -200,8 +200,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; - ip6_set_txhash(sk); - /* * TCP over IPv4 */ @@ -297,6 +295,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) goto late_failure; + ip6_set_txhash(sk); + if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, -- cgit v1.2.3-73-gaa49b From 349ce993ac706869d553a1816426d3a4bfda02b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Oct 2014 12:58:58 -0700 Subject: tcp: md5: do not use alloc_percpu() percpu tcp_md5sig_pool contains memory blobs that ultimately go through sg_set_buf(). -> sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); This requires that whole area is in a physically contiguous portion of memory. And that @buf is not backed by vmalloc(). 
Given that alloc_percpu() can use vmalloc() areas, this does not fit the requirements. Replace alloc_percpu() by a static DEFINE_PER_CPU() as tcp_md5sig_pool is small anyway, there is no gain to dynamically allocate it. Signed-off-by: Eric Dumazet Fixes: 765cf9976e93 ("tcp: md5: remove one indirection level in tcp_md5sig_pool") Reported-by: Crestez Dan Leonard Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 59 ++++++++++++++++++++-------------------------------------- 1 file changed, 20 insertions(+), 39 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1bec4e76d88c..39ec0c379545 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2868,61 +2868,42 @@ EXPORT_SYMBOL(compat_tcp_getsockopt); #endif #ifdef CONFIG_TCP_MD5SIG -static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly; +static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool); static DEFINE_MUTEX(tcp_md5sig_mutex); - -static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu); - - if (p->md5_desc.tfm) - crypto_free_hash(p->md5_desc.tfm); - } - free_percpu(pool); -} +static bool tcp_md5sig_pool_populated = false; static void __tcp_alloc_md5sig_pool(void) { int cpu; - struct tcp_md5sig_pool __percpu *pool; - - pool = alloc_percpu(struct tcp_md5sig_pool); - if (!pool) - return; for_each_possible_cpu(cpu) { - struct crypto_hash *hash; - - hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR_OR_NULL(hash)) - goto out_free; + if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) { + struct crypto_hash *hash; - per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; + hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR_OR_NULL(hash)) + return; + per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash; + } } - /* before setting tcp_md5sig_pool, we must commit all writes - * to memory. 
See ACCESS_ONCE() in tcp_get_md5sig_pool() + /* before setting tcp_md5sig_pool_populated, we must commit all writes + * to memory. See smp_rmb() in tcp_get_md5sig_pool() */ smp_wmb(); - tcp_md5sig_pool = pool; - return; -out_free: - __tcp_free_md5sig_pool(pool); + tcp_md5sig_pool_populated = true; } bool tcp_alloc_md5sig_pool(void) { - if (unlikely(!tcp_md5sig_pool)) { + if (unlikely(!tcp_md5sig_pool_populated)) { mutex_lock(&tcp_md5sig_mutex); - if (!tcp_md5sig_pool) + if (!tcp_md5sig_pool_populated) __tcp_alloc_md5sig_pool(); mutex_unlock(&tcp_md5sig_mutex); } - return tcp_md5sig_pool != NULL; + return tcp_md5sig_pool_populated; } EXPORT_SYMBOL(tcp_alloc_md5sig_pool); @@ -2936,13 +2917,13 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool); */ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) { - struct tcp_md5sig_pool __percpu *p; - local_bh_disable(); - p = ACCESS_ONCE(tcp_md5sig_pool); - if (p) - return raw_cpu_ptr(p); + if (tcp_md5sig_pool_populated) { + /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */ + smp_rmb(); + return this_cpu_ptr(&tcp_md5sig_pool); + } local_bh_enable(); return NULL; } -- cgit v1.2.3-73-gaa49b From 65ba1f1ec0eff1c25933468e1d238201c0c2cb29 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 28 Oct 2014 10:30:34 +0100 Subject: inet: frags: fix a race between inet_evict_bucket and inet_frag_kill When the evictor is running it adds some chosen frags to a local list to be evicted once the chain lock has been released but at the same time the *frag_queue can be running for some of the same queues and it may call inet_frag_kill which will wait on the chain lock and will then delete the queue from the wrong list since it was added in the eviction one. The fix is simple - check if the queue has the evict flag set under the chain lock before deleting it, this is safe because the evict flag is set only under that lock and having the flag set also means that the queue has been detached from the chain list, so no need to delete it again. 
An important note to make is that we're safe w.r.t refcnt because inet_frag_kill and inet_evict_bucket will sync on the del_timer operation where only one of the two can succeed (or if the timer is executing - none of them), the cases are: 1. inet_frag_kill succeeds in del_timer - then the timer ref is removed, but inet_evict_bucket will not add this queue to its expire list but will restart eviction in that chain 2. inet_evict_bucket succeeds in del_timer - then the timer ref is kept until the evictor "expires" the queue, but inet_frag_kill will remove the initial ref and will set INET_FRAG_COMPLETE which will make the frag_expire fn just to remove its ref. In the end all of the queue users will do an inet_frag_put and the one that reaches 0 will free it. The refcount balance should be okay. CC: Florian Westphal CC: Eric Dumazet CC: Patrick McLean Fixes: b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue") Suggested-by: Eric Dumazet Reported-by: Patrick McLean Tested-by: Patrick McLean Signed-off-by: Nikolay Aleksandrov Reviewed-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/ipv4/inet_fragment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 9eb89f3f0ee4..894ec30c5896 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -285,7 +285,8 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) struct inet_frag_bucket *hb; hb = get_frag_bucket_locked(fq, f); - hlist_del(&fq->list); + if (!(fq->flags & INET_FRAG_EVICTED)) + hlist_del(&fq->list); spin_unlock(&hb->chain_lock); } -- cgit v1.2.3-73-gaa49b From d70127e8a942364de8dd140fe73893efda363293 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 28 Oct 2014 10:44:01 +0100 Subject: inet: frags: remove the WARN_ON from inet_evict_bucket The WARN_ON in inet_evict_bucket can be triggered by a valid case: inet_frag_kill and inet_evict_bucket can be running in parallel on the same queue which means that there has been at least one more ref added by a previous inet_frag_find call, but inet_frag_kill can delete the timer before inet_evict_bucket which will cause the WARN_ON() there to trigger since we'll have refcnt!=1. Now, this case is valid because the queue is being "killed" for some reason (removed from the chain list and its timer deleted) so it will get destroyed in the end by one of the inet_frag_put() calls which reaches 0 i.e. refcnt is still valid. CC: Florian Westphal CC: Eric Dumazet CC: Patrick McLean Fixes: b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue") Reported-by: Patrick McLean Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/ipv4/inet_fragment.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 894ec30c5896..19419b60cb37 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -146,7 +146,6 @@ evict_again: atomic_inc(&fq->refcnt); spin_unlock(&hb->chain_lock); del_timer_sync(&fq->timer); - WARN_ON(atomic_read(&fq->refcnt) != 1); inet_frag_put(fq, f); goto evict_again; } -- cgit v1.2.3-73-gaa49b From fa19c2b050ab5254326f5fc07096dd3c6a8d5d58 Mon Sep 17 00:00:00 2001 From: Nicolas Cavallari Date: Thu, 30 Oct 2014 10:09:53 +0100 Subject: ipv4: Do not cache routing failures due to disabled forwarding. If we cache them, the kernel will reuse them, independently of whether forwarding is enabled or not. Which means that if forwarding is disabled on the input interface where the first routing request comes from, then that unreachable result will be cached and reused for other interfaces, even if forwarding is enabled on them. The opposite is also true. This can be verified with two interfaces A and B and an output interface C, where B has forwarding enabled, but not A and trying ip route get $dst iif A from $src && ip route get $dst iif B from $src Signed-off-by: Nicolas Cavallari Reviewed-by: Julian Anastasov Signed-off-by: David S. 
Miller --- net/ipv4/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2d4ae469b471..6a2155b02602 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1798,6 +1798,7 @@ local_input: no_route: RT_CACHE_STAT_INC(in_no_route); res.type = RTN_UNREACHABLE; + res.fi = NULL; goto local_input; /* -- cgit v1.2.3-73-gaa49b From 14051f0452a2c26a3f4791e6ad6a435e8f1945ff Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 30 Oct 2014 08:40:56 -0700 Subject: gre: Use inner mac length when computing tunnel length Currently, skb_inner_network_header is used but this does not account for Ethernet header for ETH_P_TEB. Use skb_inner_mac_header which handles TEB and also should work with IP encapsulation in which case inner mac and inner network headers are the same. Tested: Ran TCP_STREAM over GRE, worked as expected. Signed-off-by: Tom Herbert Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/gre_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index f6e345c0bc23..bb5947b0ce2d 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -47,7 +47,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); - ghl = skb_inner_network_header(skb) - skb_transport_header(skb); + ghl = skb_inner_mac_header(skb) - skb_transport_header(skb); if (unlikely(ghl < sizeof(*greh))) goto out; -- cgit v1.2.3-73-gaa49b From 39bb5e62867de82b269b07df900165029b928359 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Oct 2014 10:32:34 -0700 Subject: net: skb_fclone_busy() needs to detect orphaned skb Some drivers are unable to perform TX completions in a bound time. 
They instead call skb_orphan() Problem is skb_fclone_busy() has to detect this case, otherwise we block TCP retransmits and can freeze unlucky tcp sessions on mostly idle hosts. Signed-off-by: Eric Dumazet Fixes: 1f3279ae0c13 ("tcp: avoid retransmits of TCP packets hanging in host queues") Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 ++++++-- net/ipv4/tcp_output.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5884f95ff0e9..6c8b6f604e76 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -799,15 +799,19 @@ struct sk_buff_fclones { * @skb: buffer * * Returns true is skb is a fast clone, and its clone is not freed. + * Some drivers call skb_orphan() in their ndo_start_xmit(), + * so we also check that this didnt happen. */ -static inline bool skb_fclone_busy(const struct sk_buff *skb) +static inline bool skb_fclone_busy(const struct sock *sk, + const struct sk_buff *skb) { const struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); return skb->fclone == SKB_FCLONE_ORIG && - fclones->skb2.fclone == SKB_FCLONE_CLONE; + fclones->skb2.fclone == SKB_FCLONE_CLONE && + fclones->skb2.sk == sk; } static inline struct sk_buff *alloc_skb_fclone(unsigned int size, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3af21296d967..a3d453b94747 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2126,7 +2126,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { - if (unlikely(skb_fclone_busy(skb))) { + if (unlikely(skb_fclone_busy(sk, skb))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4c4e457e7888..88bf289abdc9 100644 --- a/net/xfrm/xfrm_policy.c +++ 
b/net/xfrm/xfrm_policy.c @@ -1962,7 +1962,7 @@ static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; - if (unlikely(skb_fclone_busy(skb))) { + if (unlikely(skb_fclone_busy(sk, skb))) { kfree_skb(skb); return 0; } -- cgit v1.2.3-73-gaa49b From 052b9498eea532deb5de75277a53f6e0623215dc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 25 Oct 2014 18:24:57 +0200 Subject: netfilter: nf_reject_ipv4: split nf_send_reset() in smaller functions That can be reused by the reject bridge expression to build the reject packet. The new functions are: * nf_reject_ip_tcphdr_get(): to sanitize and to obtain the TCP header. * nf_reject_iphdr_put(): to build the IPv4 header. * nf_reject_ip_tcphdr_put(): to build the TCP header. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_reject.h | 10 ++++ net/ipv4/netfilter/nf_reject_ipv4.c | 88 ++++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 26 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index e8427193c777..03e928a55229 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -1,6 +1,8 @@ #ifndef _IPV4_NF_REJECT_H #define _IPV4_NF_REJECT_H +#include +#include #include static inline void nf_send_unreach(struct sk_buff *skb_in, int code) @@ -10,4 +12,12 @@ static inline void nf_send_unreach(struct sk_buff *skb_in, int code) void nf_send_reset(struct sk_buff *oldskb, int hook); +const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook); +struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __be16 protocol, int ttl); +void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth); + #endif /* _IPV4_NF_REJECT_H */ diff --git 
a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 92b303dbd5fc..1baaa83dfe5c 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -12,43 +12,39 @@ #include #include #include +#include -/* Send RST reply */ -void nf_send_reset(struct sk_buff *oldskb, int hook) +const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook) { - struct sk_buff *nskb; - const struct iphdr *oiph; - struct iphdr *niph; const struct tcphdr *oth; - struct tcphdr _otcph, *tcph; /* IP header checks: fragment. */ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) - return; + return NULL; oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb), - sizeof(_otcph), &_otcph); + sizeof(struct tcphdr), _oth); if (oth == NULL) - return; + return NULL; /* No RST for RST. */ if (oth->rst) - return; - - if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return; + return NULL; /* Check checksum */ if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) - return; - oiph = ip_hdr(oldskb); + return NULL; - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + - LL_MAX_HEADER, GFP_ATOMIC); - if (!nskb) - return; + return oth; +} +EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get); - skb_reserve(nskb, LL_MAX_HEADER); +struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __be16 protocol, int ttl) +{ + struct iphdr *niph, *oiph = ip_hdr(oldskb); skb_reset_network_header(nskb); niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); @@ -57,10 +53,23 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) niph->tos = 0; niph->id = 0; niph->frag_off = htons(IP_DF); - niph->protocol = IPPROTO_TCP; + niph->protocol = protocol; niph->check = 0; niph->saddr = oiph->daddr; niph->daddr = oiph->saddr; + niph->ttl = ttl; + + nskb->protocol = htons(ETH_P_IP); + + return niph; +} +EXPORT_SYMBOL_GPL(nf_reject_iphdr_put); + +void 
nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth) +{ + struct iphdr *niph = ip_hdr(nskb); + struct tcphdr *tcph; skb_reset_transport_header(nskb); tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); @@ -69,9 +78,9 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) tcph->dest = oth->source; tcph->doff = sizeof(struct tcphdr) / 4; - if (oth->ack) + if (oth->ack) { tcph->seq = oth->ack_seq; - else { + } else { tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + oldskb->len - ip_hdrlen(oldskb) - (oth->doff << 2)); @@ -84,16 +93,43 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)tcph - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); +} +EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); + +/* Send RST reply */ +void nf_send_reset(struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + const struct iphdr *oiph; + struct iphdr *niph; + const struct tcphdr *oth; + struct tcphdr _oth; + + oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook); + if (!oth) + return; + + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + return; + + oiph = ip_hdr(oldskb); + + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); + if (!nskb) + return; /* ip_route_me_harder expects skb->dst to be set */ skb_dst_set_noref(nskb, skb_dst(oldskb)); - nskb->protocol = htons(ETH_P_IP); + skb_reserve(nskb, LL_MAX_HEADER); + niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, + ip4_dst_hoplimit(skb_dst(nskb))); + nf_reject_ip_tcphdr_put(nskb, oldskb, oth); + if (ip_route_me_harder(nskb, RTN_UNSPEC)) goto free_nskb; - niph->ttl = ip4_dst_hoplimit(skb_dst(nskb)); - /* "Never happens" */ if (nskb->len > dst_mtu(skb_dst(nskb))) goto free_nskb; -- cgit v1.2.3-73-gaa49b