From ab2d7251d666995740da17b2a51ca545ac5dd037 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 10 Oct 2014 11:25:20 +0200
Subject: netfilter: missing module license in the nf_reject_ipvX modules

[   23.545204] nf_reject_ipv4: module license 'unspecified' taints kernel.

Fixes: c8d7b98 ("netfilter: move nf_send_resetX() code to nf_reject_ipvX modules")
Reported-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_reject_ipv4.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index b023b4eb1a96..92b303dbd5fc 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -6,6 +6,7 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/module.h>
 #include <net/ip.h>
 #include <net/tcp.h>
 #include <net/route.h>
@@ -125,3 +126,5 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
 	kfree_skb(nskb);
 }
 EXPORT_SYMBOL_GPL(nf_send_reset);
+
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-73-gaa49b


From 7210e4e38f945dfa173c4a4e59ad827c9ecad541 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 13 Oct 2014 19:50:22 +0200
Subject: netfilter: nf_tables: restrict nat/masq expressions to nat chain type

This adds the missing validation code to avoid the use of nat/masq from
non-nat chains. The validation assumes two possible configuration
scenarios:

1) Use of nat from base chain that is not of nat type. Reject this
   configuration from the nft_*_init() path of the expression.

2) Use of nat from non-base chain. In this case, we have to wait until
   the non-base chain is referenced by at least one base chain via
   jump/goto. This is resolved from the nft_*_validate() path which is
   called from nf_tables_check_loops().

The user gets an -EOPNOTSUPP in both cases.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h  |  3 +++
 include/net/netfilter/nft_masq.h   |  3 +++
 net/ipv4/netfilter/nft_masq_ipv4.c |  1 +
 net/ipv6/netfilter/nft_masq_ipv6.c |  1 +
 net/netfilter/nf_tables_api.c      | 14 ++++++++++++++
 net/netfilter/nft_masq.c           | 12 ++++++++++++
 net/netfilter/nft_nat.c            | 12 ++++++++++++
 7 files changed, 46 insertions(+)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 3d7292392fac..845c596bf594 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -530,6 +530,9 @@ enum nft_chain_type {
 	NFT_CHAIN_T_MAX
 };
 
+int nft_chain_validate_dependency(const struct nft_chain *chain,
+				  enum nft_chain_type type);
+
 struct nft_stats {
 	u64			bytes;
 	u64			pkts;
diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h
index c72729f954f4..e2a518b60e19 100644
--- a/include/net/netfilter/nft_masq.h
+++ b/include/net/netfilter/nft_masq.h
@@ -13,4 +13,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
 
 int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr);
 
+int nft_masq_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		      const struct nft_data **data);
+
 #endif /* _NFT_MASQ_H_ */
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index 1c636d6b5b50..c1023c445920 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -39,6 +39,7 @@ static const struct nft_expr_ops nft_masq_ipv4_ops = {
 	.eval		= nft_masq_ipv4_eval,
 	.init		= nft_masq_init,
 	.dump		= nft_masq_dump,
+	.validate	= nft_masq_validate,
 };
 
 static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index 556262f40761..8a7ac685076d 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -39,6 +39,7 @@ static const struct nft_expr_ops nft_masq_ipv6_ops = {
 	.eval		= nft_masq_ipv6_eval,
 	.init		= nft_masq_init,
 	.dump		= nft_masq_dump,
+	.validate	= nft_masq_validate,
 };
 
 static struct nft_expr_type nft_masq_ipv6_type __read_mostly = {
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 556a0dfa4abc..65eb2a1160d5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3744,6 +3744,20 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.abort		= nf_tables_abort,
 };
 
+int nft_chain_validate_dependency(const struct nft_chain *chain,
+				  enum nft_chain_type type)
+{
+	const struct nft_base_chain *basechain;
+
+	if (chain->flags & NFT_BASE_CHAIN) {
+		basechain = nft_base_chain(chain);
+		if (basechain->type->type != type)
+			return -EOPNOTSUPP;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_chain_validate_dependency);
+
 /*
  * Loop detection - walk through the ruleset beginning at the destination chain
  * of a new jump until either the source chain is reached (loop) or all
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 6637bab00567..d1ffd5eb3a9b 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -26,6 +26,11 @@ int nft_masq_init(const struct nft_ctx *ctx,
 		  const struct nlattr * const tb[])
 {
 	struct nft_masq *priv = nft_expr_priv(expr);
+	int err;
+
+	err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+	if (err < 0)
+		return err;
 
 	if (tb[NFTA_MASQ_FLAGS] == NULL)
 		return 0;
@@ -55,5 +60,12 @@ nla_put_failure:
 }
 EXPORT_SYMBOL_GPL(nft_masq_dump);
 
+int nft_masq_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		      const struct nft_data **data)
+{
+	return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+}
+EXPORT_SYMBOL_GPL(nft_masq_validate);
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index 799550b476fb..0f0af6e86fb8 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -95,6 +95,10 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	u32 family;
 	int err;
 
+	err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+	if (err < 0)
+		return err;
+
 	if (tb[NFTA_NAT_TYPE] == NULL)
 		return -EINVAL;
 
@@ -205,6 +209,13 @@ nla_put_failure:
 	return -1;
 }
 
+static int nft_nat_validate(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nft_data **data)
+{
+	return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+}
+
 static struct nft_expr_type nft_nat_type;
 static const struct nft_expr_ops nft_nat_ops = {
 	.type           = &nft_nat_type,
@@ -212,6 +223,7 @@ static const struct nft_expr_ops nft_nat_ops = {
 	.eval           = nft_nat_eval,
 	.init           = nft_nat_init,
 	.dump           = nft_nat_dump,
+	.validate	= nft_nat_validate,
 };
 
 static struct nft_expr_type nft_nat_type __read_mostly = {
-- 
cgit v1.2.3-73-gaa49b


From 1e16aa3ddf863c6b9f37eddf52503230a62dedb3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 20 Oct 2014 13:49:16 +0200
Subject: net: gso: use feature flag argument in all protocol gso handlers

skb_gso_segment() has a 'features' argument representing offload features
available to the output path.

A few handlers, e.g. GRE, instead re-fetch the features of skb->dev and use
those instead of the provided ones when handing encapsulation/tunnels.

Depending on dev->hw_enc_features of the output device skb_gso_segment() can
then return NULL even when the caller has disabled all GSO feature bits,
as segmentation of inner header thinks device will take care of segmentation.

This e.g. affects the tbf scheduler, which will silently drop GRE-encap GSO skbs
that did not fit the remaining token quota as the segmentation does not work
when device supports corresponding hw offload capabilities.

Cc: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c     | 2 +-
 net/ipv4/gre_offload.c | 2 +-
 net/ipv4/udp_offload.c | 2 +-
 net/ipv6/ip6_offload.c | 2 +-
 net/mpls/mpls_gso.c    | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 92db7a69f2b9..8b7fe5b03906 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1246,7 +1246,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 
 	encap = SKB_GSO_CB(skb)->encap_level > 0;
 	if (encap)
-		features = skb->dev->hw_enc_features & netif_skb_features(skb);
+		features &= skb->dev->hw_enc_features;
 	SKB_GSO_CB(skb)->encap_level += ihl;
 
 	skb_reset_transport_header(skb);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index ccda09628de7..f6e345c0bc23 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -68,7 +68,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	skb->mac_len = skb_inner_network_offset(skb);
 
 	/* segment inner packet. */
-	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+	enc_features = skb->dev->hw_enc_features & features;
 	segs = skb_mac_gso_segment(skb, enc_features);
 	if (IS_ERR_OR_NULL(segs)) {
 		skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 507310ef4b56..6480cea7aa53 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -58,7 +58,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
 		skb->encap_hdr_csum = 1;
 
 	/* segment inner packet. */
-	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+	enc_features = skb->dev->hw_enc_features & features;
 	segs = gso_inner_segment(skb, enc_features);
 	if (IS_ERR_OR_NULL(segs)) {
 		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 91014d32488d..a071563a7e6e 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -90,7 +90,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 
 	encap = SKB_GSO_CB(skb)->encap_level > 0;
 	if (encap)
-		features = skb->dev->hw_enc_features & netif_skb_features(skb);
+		features &= skb->dev->hw_enc_features;
 	SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h);
 
 	ipv6h = ipv6_hdr(skb);
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index e28ed2ef5b06..f0f5309a2d72 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -48,7 +48,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
 	__skb_push(skb, skb->mac_len);
 
 	/* Segment inner packet. */
-	mpls_features = skb->dev->mpls_features & netif_skb_features(skb);
+	mpls_features = skb->dev->mpls_features & features;
 	segs = skb_mac_gso_segment(skb, mpls_features);
 
 
-- 
cgit v1.2.3-73-gaa49b


From 330966e501ffe282d7184fde4518d5e0c24bc7f8 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 20 Oct 2014 13:49:17 +0200
Subject: net: make skb_gso_segment error handling more robust

skb_gso_segment has three possible return values:
1. a pointer to the first segmented skb
2. an errno value (IS_ERR())
3. NULL.  This can happen when GSO is used for header verification.

However, several callers currently test IS_ERR instead of IS_ERR_OR_NULL
and would oops when NULL is returned.

Note that these call sites should never actually see such a NULL return
value; all callers mask out the GSO bits in the feature argument.

However, there have been issues with some protocol handlers erronously not
respecting the specified feature mask in some cases.

It is preferable to get 'have to turn off hw offloading, else slow' reports
rather than 'kernel crashes'.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c                 | 2 +-
 net/netfilter/nfnetlink_queue_core.c | 2 +-
 net/openvswitch/datapath.c           | 2 ++
 net/xfrm/xfrm_output.c               | 2 ++
 4 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 88e5ef2c7f51..bc6471d4abcd 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -231,7 +231,7 @@ static int ip_finish_output_gso(struct sk_buff *skb)
 	 */
 	features = netif_skb_features(skb);
 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
-	if (IS_ERR(segs)) {
+	if (IS_ERR_OR_NULL(segs)) {
 		kfree_skb(skb);
 		return -ENOMEM;
 	}
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index a82077d9f59b..7c60ccd61a3e 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -665,7 +665,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 	 * returned by nf_queue.  For instance, callers rely on -ECANCELED to
 	 * mean 'ignore this hook'.
 	 */
-	if (IS_ERR(segs))
+	if (IS_ERR_OR_NULL(segs))
 		goto out_err;
 	queued = 0;
 	err = 0;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2e31d9e7f4dc..e6d7255183eb 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -324,6 +324,8 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 	segs = __skb_gso_segment(skb, NETIF_F_SG, false);
 	if (IS_ERR(segs))
 		return PTR_ERR(segs);
+	if (segs == NULL)
+		return -EINVAL;
 
 	/* Queue all of the segments. */
 	skb = segs;
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 499d6c18a8ce..7c532856b398 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -157,6 +157,8 @@ static int xfrm_output_gso(struct sk_buff *skb)
 	kfree_skb(skb);
 	if (IS_ERR(segs))
 		return PTR_ERR(segs);
+	if (segs == NULL)
+		return -EINVAL;
 
 	do {
 		struct sk_buff *nskb = segs->next;
-- 
cgit v1.2.3-73-gaa49b


From 9e7ceb060754f134231f68cb29d5db31419fe1ed Mon Sep 17 00:00:00 2001
From: Sathya Perla <sathya.perla@emulex.com>
Date: Wed, 22 Oct 2014 21:42:01 +0530
Subject: net: fix saving TX flow hash in sock for outgoing connections

The commit "net: Save TX flow hash in sock and set in skbuf on xmit"
introduced the inet_set_txhash() and ip6_set_txhash() routines to calculate
and record flow hash(sk_txhash) in the socket structure. sk_txhash is used
to set skb->hash which is used to spread flows across multiple TXQs.

But, the above routines are invoked before the source port of the connection
is created. Because of this all outgoing connections that just differ in the
source port get hashed into the same TXQ.

This patch fixes this problem for IPv4/6 by invoking the the above routines
after the source port is available for the socket.

Fixes: b73c3d0e4("net: Save TX flow hash in sock and set in skbuf on xmit")

Signed-off-by: Sathya Perla <sathya.perla@emulex.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 4 ++--
 net/ipv6/tcp_ipv6.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 94d1a7757ff7..9c7d7621466b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -206,8 +206,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	inet->inet_dport = usin->sin_port;
 	inet->inet_daddr = daddr;
 
-	inet_set_txhash(sk);
-
 	inet_csk(sk)->icsk_ext_hdr_len = 0;
 	if (inet_opt)
 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
@@ -224,6 +222,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (err)
 		goto failure;
 
+	inet_set_txhash(sk);
+
 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
 			       inet->inet_sport, inet->inet_dport, sk);
 	if (IS_ERR(rt)) {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 831495529b82..ace29b60813c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -200,8 +200,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	sk->sk_v6_daddr = usin->sin6_addr;
 	np->flow_label = fl6.flowlabel;
 
-	ip6_set_txhash(sk);
-
 	/*
 	 *	TCP over IPv4
 	 */
@@ -297,6 +295,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (err)
 		goto late_failure;
 
+	ip6_set_txhash(sk);
+
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
 							     sk->sk_v6_daddr.s6_addr32,
-- 
cgit v1.2.3-73-gaa49b


From 349ce993ac706869d553a1816426d3a4bfda02b1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 23 Oct 2014 12:58:58 -0700
Subject: tcp: md5: do not use alloc_percpu()

percpu tcp_md5sig_pool contains memory blobs that ultimately
go through sg_set_buf().

-> sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));

This requires that whole area is in a physically contiguous portion
of memory. And that @buf is not backed by vmalloc().

Given that alloc_percpu() can use vmalloc() areas, this does not
fit the requirements.

Replace alloc_percpu() by a static DEFINE_PER_CPU() as tcp_md5sig_pool
is small anyway, there is no gain to dynamically allocate it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Fixes: 765cf9976e93 ("tcp: md5: remove one indirection level in tcp_md5sig_pool")
Reported-by: Crestez Dan Leonard <cdleonard@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 59 ++++++++++++++++++++--------------------------------------
 1 file changed, 20 insertions(+), 39 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1bec4e76d88c..39ec0c379545 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2868,61 +2868,42 @@ EXPORT_SYMBOL(compat_tcp_getsockopt);
 #endif
 
 #ifdef CONFIG_TCP_MD5SIG
-static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool __read_mostly;
+static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
 static DEFINE_MUTEX(tcp_md5sig_mutex);
-
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu);
-
-		if (p->md5_desc.tfm)
-			crypto_free_hash(p->md5_desc.tfm);
-	}
-	free_percpu(pool);
-}
+static bool tcp_md5sig_pool_populated = false;
 
 static void __tcp_alloc_md5sig_pool(void)
 {
 	int cpu;
-	struct tcp_md5sig_pool __percpu *pool;
-
-	pool = alloc_percpu(struct tcp_md5sig_pool);
-	if (!pool)
-		return;
 
 	for_each_possible_cpu(cpu) {
-		struct crypto_hash *hash;
-
-		hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
-		if (IS_ERR_OR_NULL(hash))
-			goto out_free;
+		if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) {
+			struct crypto_hash *hash;
 
-		per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash;
+			hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+			if (IS_ERR_OR_NULL(hash))
+				return;
+			per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash;
+		}
 	}
-	/* before setting tcp_md5sig_pool, we must commit all writes
-	 * to memory. See ACCESS_ONCE() in tcp_get_md5sig_pool()
+	/* before setting tcp_md5sig_pool_populated, we must commit all writes
+	 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
 	 */
 	smp_wmb();
-	tcp_md5sig_pool = pool;
-	return;
-out_free:
-	__tcp_free_md5sig_pool(pool);
+	tcp_md5sig_pool_populated = true;
 }
 
 bool tcp_alloc_md5sig_pool(void)
 {
-	if (unlikely(!tcp_md5sig_pool)) {
+	if (unlikely(!tcp_md5sig_pool_populated)) {
 		mutex_lock(&tcp_md5sig_mutex);
 
-		if (!tcp_md5sig_pool)
+		if (!tcp_md5sig_pool_populated)
 			__tcp_alloc_md5sig_pool();
 
 		mutex_unlock(&tcp_md5sig_mutex);
 	}
-	return tcp_md5sig_pool != NULL;
+	return tcp_md5sig_pool_populated;
 }
 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
 
@@ -2936,13 +2917,13 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
  */
 struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
 {
-	struct tcp_md5sig_pool __percpu *p;
-
 	local_bh_disable();
-	p = ACCESS_ONCE(tcp_md5sig_pool);
-	if (p)
-		return raw_cpu_ptr(p);
 
+	if (tcp_md5sig_pool_populated) {
+		/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
+		smp_rmb();
+		return this_cpu_ptr(&tcp_md5sig_pool);
+	}
 	local_bh_enable();
 	return NULL;
 }
-- 
cgit v1.2.3-73-gaa49b


From 65ba1f1ec0eff1c25933468e1d238201c0c2cb29 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@redhat.com>
Date: Tue, 28 Oct 2014 10:30:34 +0100
Subject: inet: frags: fix a race between inet_evict_bucket and inet_frag_kill

When the evictor is running it adds some chosen frags to a local list to
be evicted once the chain lock has been released but at the same time
the *frag_queue can be running for some of the same queues and it
may call inet_frag_kill which will wait on the chain lock and
will then delete the queue from the wrong list since it was added in the
eviction one. The fix is simple - check if the queue has the evict flag
set under the chain lock before deleting it, this is safe because the
evict flag is set only under that lock and having the flag set also means
that the queue has been detached from the chain list, so no need to delete
it again.
An important note to make is that we're safe w.r.t refcnt because
inet_frag_kill and inet_evict_bucket will sync on the del_timer operation
where only one of the two can succeed (or if the timer is executing -
none of them), the cases are:
1. inet_frag_kill succeeds in del_timer
 - then the timer ref is removed, but inet_evict_bucket will not add
   this queue to its expire list but will restart eviction in that chain
2. inet_evict_bucket succeeds in del_timer
 - then the timer ref is kept until the evictor "expires" the queue, but
   inet_frag_kill will remove the initial ref and will set
   INET_FRAG_COMPLETE which will make the frag_expire fn just to remove
   its ref.
In the end all of the queue users will do an inet_frag_put and the one
that reaches 0 will free it. The refcount balance should be okay.

CC: Florian Westphal <fw@strlen.de>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McLean <chutzpah@gentoo.org>

Fixes: b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue")
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: Patrick McLean <chutzpah@gentoo.org>
Tested-by: Patrick McLean <chutzpah@gentoo.org>
Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 9eb89f3f0ee4..894ec30c5896 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -285,7 +285,8 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 	struct inet_frag_bucket *hb;
 
 	hb = get_frag_bucket_locked(fq, f);
-	hlist_del(&fq->list);
+	if (!(fq->flags & INET_FRAG_EVICTED))
+		hlist_del(&fq->list);
 	spin_unlock(&hb->chain_lock);
 }
 
-- 
cgit v1.2.3-73-gaa49b


From d70127e8a942364de8dd140fe73893efda363293 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@redhat.com>
Date: Tue, 28 Oct 2014 10:44:01 +0100
Subject: inet: frags: remove the WARN_ON from inet_evict_bucket

The WARN_ON in inet_evict_bucket can be triggered by a valid case:
inet_frag_kill and inet_evict_bucket can be running in parallel on the
same queue which means that there has been at least one more ref added
by a previous inet_frag_find call, but inet_frag_kill can delete the
timer before inet_evict_bucket which will cause the WARN_ON() there to
trigger since we'll have refcnt!=1. Now, this case is valid because the
queue is being "killed" for some reason (removed from the chain list and
its timer deleted) so it will get destroyed in the end by one of the
inet_frag_put() calls which reaches 0 i.e. refcnt is still valid.

CC: Florian Westphal <fw@strlen.de>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McLean <chutzpah@gentoo.org>

Fixes: b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue")
Reported-by: Patrick McLean <chutzpah@gentoo.org>
Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 894ec30c5896..19419b60cb37 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -146,7 +146,6 @@ evict_again:
 			atomic_inc(&fq->refcnt);
 			spin_unlock(&hb->chain_lock);
 			del_timer_sync(&fq->timer);
-			WARN_ON(atomic_read(&fq->refcnt) != 1);
 			inet_frag_put(fq, f);
 			goto evict_again;
 		}
-- 
cgit v1.2.3-73-gaa49b


From fa19c2b050ab5254326f5fc07096dd3c6a8d5d58 Mon Sep 17 00:00:00 2001
From: Nicolas Cavallari <nicolas.cavallari@green-communications.fr>
Date: Thu, 30 Oct 2014 10:09:53 +0100
Subject: ipv4: Do not cache routing failures due to disabled forwarding.

If we cache them, the kernel will reuse them, independently of
whether forwarding is enabled or not.  Which means that if forwarding is
disabled on the input interface where the first routing request comes
from, then that unreachable result will be cached and reused for
other interfaces, even if forwarding is enabled on them.  The opposite
is also true.

This can be verified with two interfaces A and B and an output interface
C, where B has forwarding enabled, but not A and trying
ip route get $dst iif A from $src && ip route get $dst iif B from $src

Signed-off-by: Nicolas Cavallari <nicolas.cavallari@green-communications.fr>
Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2d4ae469b471..6a2155b02602 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1798,6 +1798,7 @@ local_input:
 no_route:
 	RT_CACHE_STAT_INC(in_no_route);
 	res.type = RTN_UNREACHABLE;
+	res.fi = NULL;
 	goto local_input;
 
 	/*
-- 
cgit v1.2.3-73-gaa49b


From 14051f0452a2c26a3f4791e6ad6a435e8f1945ff Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Thu, 30 Oct 2014 08:40:56 -0700
Subject: gre: Use inner mac length when computing tunnel length

Currently, skb_inner_network_header is used but this does not account
for Ethernet header for ETH_P_TEB. Use skb_inner_mac_header which
handles TEB and also should work with IP encapsulation in which case
inner mac and inner network headers are the same.

Tested: Ran TCP_STREAM over GRE, worked as expected.

Signed-off-by: Tom Herbert <therbert@google.com>
Acked-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/gre_offload.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index f6e345c0bc23..bb5947b0ce2d 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -47,7 +47,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 
 	greh = (struct gre_base_hdr *)skb_transport_header(skb);
 
-	ghl = skb_inner_network_header(skb) - skb_transport_header(skb);
+	ghl = skb_inner_mac_header(skb) - skb_transport_header(skb);
 	if (unlikely(ghl < sizeof(*greh)))
 		goto out;
 
-- 
cgit v1.2.3-73-gaa49b


From 39bb5e62867de82b269b07df900165029b928359 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 30 Oct 2014 10:32:34 -0700
Subject: net: skb_fclone_busy() needs to detect orphaned skb

Some drivers are unable to perform TX completions in a bound time.
They instead call skb_orphan()

Problem is skb_fclone_busy() has to detect this case, otherwise
we block TCP retransmits and can freeze unlucky tcp sessions on
mostly idle hosts.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Fixes: 1f3279ae0c13 ("tcp: avoid retransmits of TCP packets hanging in host queues")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 8 ++++++--
 net/ipv4/tcp_output.c  | 2 +-
 net/xfrm/xfrm_policy.c | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5884f95ff0e9..6c8b6f604e76 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -799,15 +799,19 @@ struct sk_buff_fclones {
  *	@skb: buffer
  *
  * Returns true is skb is a fast clone, and its clone is not freed.
+ * Some drivers call skb_orphan() in their ndo_start_xmit(),
+ * so we also check that this didnt happen.
  */
-static inline bool skb_fclone_busy(const struct sk_buff *skb)
+static inline bool skb_fclone_busy(const struct sock *sk,
+				   const struct sk_buff *skb)
 {
 	const struct sk_buff_fclones *fclones;
 
 	fclones = container_of(skb, struct sk_buff_fclones, skb1);
 
 	return skb->fclone == SKB_FCLONE_ORIG &&
-	       fclones->skb2.fclone == SKB_FCLONE_CLONE;
+	       fclones->skb2.fclone == SKB_FCLONE_CLONE &&
+	       fclones->skb2.sk == sk;
 }
 
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3af21296d967..a3d453b94747 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2126,7 +2126,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 static bool skb_still_in_host_queue(const struct sock *sk,
 				    const struct sk_buff *skb)
 {
-	if (unlikely(skb_fclone_busy(skb))) {
+	if (unlikely(skb_fclone_busy(sk, skb))) {
 		NET_INC_STATS_BH(sock_net(sk),
 				 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
 		return true;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 4c4e457e7888..88bf289abdc9 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1962,7 +1962,7 @@ static int xdst_queue_output(struct sock *sk, struct sk_buff *skb)
 	struct xfrm_policy *pol = xdst->pols[0];
 	struct xfrm_policy_queue *pq = &pol->polq;
 
-	if (unlikely(skb_fclone_busy(skb))) {
+	if (unlikely(skb_fclone_busy(sk, skb))) {
 		kfree_skb(skb);
 		return 0;
 	}
-- 
cgit v1.2.3-73-gaa49b


From 052b9498eea532deb5de75277a53f6e0623215dc Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 25 Oct 2014 18:24:57 +0200
Subject: netfilter: nf_reject_ipv4: split nf_send_reset() in smaller functions

That can be reused by the reject bridge expression to build the reject
packet. The new functions are:

* nf_reject_ip_tcphdr_get(): to sanitize and to obtain the TCP header.
* nf_reject_iphdr_put(): to build the IPv4 header.
* nf_reject_ip_tcphdr_put(): to build the TCP header.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_reject.h | 10 ++++
 net/ipv4/netfilter/nf_reject_ipv4.c    | 88 ++++++++++++++++++++++++----------
 2 files changed, 72 insertions(+), 26 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h
index e8427193c777..03e928a55229 100644
--- a/include/net/netfilter/ipv4/nf_reject.h
+++ b/include/net/netfilter/ipv4/nf_reject.h
@@ -1,6 +1,8 @@
 #ifndef _IPV4_NF_REJECT_H
 #define _IPV4_NF_REJECT_H
 
+#include <linux/skbuff.h>
+#include <net/ip.h>
 #include <net/icmp.h>
 
 static inline void nf_send_unreach(struct sk_buff *skb_in, int code)
@@ -10,4 +12,12 @@ static inline void nf_send_unreach(struct sk_buff *skb_in, int code)
 
 void nf_send_reset(struct sk_buff *oldskb, int hook);
 
+const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+					     struct tcphdr *_oth, int hook);
+struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+				  const struct sk_buff *oldskb,
+				  __be16 protocol, int ttl);
+void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+			     const struct tcphdr *oth);
+
 #endif /* _IPV4_NF_REJECT_H */
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 92b303dbd5fc..1baaa83dfe5c 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -12,43 +12,39 @@
 #include <net/route.h>
 #include <net/dst.h>
 #include <linux/netfilter_ipv4.h>
+#include <net/netfilter/ipv4/nf_reject.h>
 
-/* Send RST reply */
-void nf_send_reset(struct sk_buff *oldskb, int hook)
+const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+					     struct tcphdr *_oth, int hook)
 {
-	struct sk_buff *nskb;
-	const struct iphdr *oiph;
-	struct iphdr *niph;
 	const struct tcphdr *oth;
-	struct tcphdr _otcph, *tcph;
 
 	/* IP header checks: fragment. */
 	if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
-		return;
+		return NULL;
 
 	oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
-				 sizeof(_otcph), &_otcph);
+				 sizeof(struct tcphdr), _oth);
 	if (oth == NULL)
-		return;
+		return NULL;
 
 	/* No RST for RST. */
 	if (oth->rst)
-		return;
-
-	if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
-		return;
+		return NULL;
 
 	/* Check checksum */
 	if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
-		return;
-	oiph = ip_hdr(oldskb);
+		return NULL;
 
-	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
-			 LL_MAX_HEADER, GFP_ATOMIC);
-	if (!nskb)
-		return;
+	return oth;
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
 
-	skb_reserve(nskb, LL_MAX_HEADER);
+struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+				  const struct sk_buff *oldskb,
+				  __be16 protocol, int ttl)
+{
+	struct iphdr *niph, *oiph = ip_hdr(oldskb);
 
 	skb_reset_network_header(nskb);
 	niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
@@ -57,10 +53,23 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
 	niph->tos	= 0;
 	niph->id	= 0;
 	niph->frag_off	= htons(IP_DF);
-	niph->protocol	= IPPROTO_TCP;
+	niph->protocol	= protocol;
 	niph->check	= 0;
 	niph->saddr	= oiph->daddr;
 	niph->daddr	= oiph->saddr;
+	niph->ttl	= ttl;
+
+	nskb->protocol = htons(ETH_P_IP);
+
+	return niph;
+}
+EXPORT_SYMBOL_GPL(nf_reject_iphdr_put);
+
+void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+			  const struct tcphdr *oth)
+{
+	struct iphdr *niph = ip_hdr(nskb);
+	struct tcphdr *tcph;
 
 	skb_reset_transport_header(nskb);
 	tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
@@ -69,9 +78,9 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
 	tcph->dest	= oth->source;
 	tcph->doff	= sizeof(struct tcphdr) / 4;
 
-	if (oth->ack)
+	if (oth->ack) {
 		tcph->seq = oth->ack_seq;
-	else {
+	} else {
 		tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
 				      oldskb->len - ip_hdrlen(oldskb) -
 				      (oth->doff << 2));
@@ -84,16 +93,43 @@ void nf_send_reset(struct sk_buff *oldskb, int hook)
 	nskb->ip_summed = CHECKSUM_PARTIAL;
 	nskb->csum_start = (unsigned char *)tcph - nskb->head;
 	nskb->csum_offset = offsetof(struct tcphdr, check);
+}
+EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
+
+/* Send RST reply */
+void nf_send_reset(struct sk_buff *oldskb, int hook)
+{
+	struct sk_buff *nskb;
+	const struct iphdr *oiph;
+	struct iphdr *niph;
+	const struct tcphdr *oth;
+	struct tcphdr _oth;
+
+	oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
+	if (!oth)
+		return;
+
+	if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		return;
+
+	oiph = ip_hdr(oldskb);
+
+	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+			 LL_MAX_HEADER, GFP_ATOMIC);
+	if (!nskb)
+		return;
 
 	/* ip_route_me_harder expects skb->dst to be set */
 	skb_dst_set_noref(nskb, skb_dst(oldskb));
 
-	nskb->protocol = htons(ETH_P_IP);
+	skb_reserve(nskb, LL_MAX_HEADER);
+	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+				   ip4_dst_hoplimit(skb_dst(nskb)));
+	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+
 	if (ip_route_me_harder(nskb, RTN_UNSPEC))
 		goto free_nskb;
 
-	niph->ttl	= ip4_dst_hoplimit(skb_dst(nskb));
-
 	/* "Never happens" */
 	if (nskb->len > dst_mtu(skb_dst(nskb)))
 		goto free_nskb;
-- 
cgit v1.2.3-73-gaa49b