19 files changed, 155 insertions, 30 deletions
diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
new file mode 100644
index 000000000000..031ef4a63485
--- /dev/null
+++ b/Documentation/networking/vrf.txt
@@ -0,0 +1,96 @@
+Virtual Routing and Forwarding (VRF)
+====================================
+The VRF device combined with ip rules provides the ability to create virtual
+routing and forwarding domains (aka VRFs, VRF-lite to be specific) in the
+Linux network stack. One use case is the multi-tenancy problem where each
+tenant has their own unique routing tables and in the very least need
+different default gateways.
+
+Processes can be "VRF aware" by binding a socket to the VRF device. Packets
+through the socket then use the routing table associated with the VRF
+device. An important feature of the VRF device implementation is that it
+impacts only Layer 3 and above so L2 tools (e.g., LLDP) are not affected
+(ie., they do not need to be run in each VRF). The design also allows
+the use of higher priority ip rules (Policy Based Routing, PBR) to take
+precedence over the VRF device rules directing specific traffic as desired.
+
+In addition, VRF devices allow VRFs to be nested within namespaces. For
+example network namespaces provide separation of network interfaces at L1
+(Layer 1 separation), VLANs on the interfaces within a namespace provide
+L2 separation and then VRF devices provide L3 separation.
+
+Design
+------
+A VRF device is created with an associated route table. Network interfaces
+are then enslaved to a VRF device:
+
+         +-----------------------------+
+         |           vrf-blue          |  ===> route table 10
+         +-----------------------------+
+            |        |            |
+         +------+ +------+     +-------------+
+         | eth1 | | eth2 | ... |    bond1    |
+         +------+ +------+     +-------------+
+                                  |       |
+                              +------+ +------+
+                              | eth8 | | eth9 |
+                              +------+ +------+
+
+Packets received on an enslaved device and are switched to the VRF device
+using an rx_handler which gives the impression that packets flow through
+the VRF device. Similarly on egress routing rules are used to send packets
+to the VRF device driver before getting sent out the actual interface. This
+allows tcpdump on a VRF device to capture all packets into and out of the
+VRF as a whole.[1] Similiarly, netfilter [2] and tc rules can be applied
+using the VRF device to specify rules that apply to the VRF domain as a whole.
+
+[1] Packets in the forwarded state do not flow through the device, so those
+    packets are not seen by tcpdump. Will revisit this limitation in a
+    future release.
+
+[2] Iptables on ingress is limited to NF_INET_PRE_ROUTING only with skb->dev
+    set to real ingress device and egress is limited to NF_INET_POST_ROUTING.
+    Will revisit this limitation in a future release.
+
+
+Setup
+-----
+1. VRF device is created with an association to a FIB table.
+   e.g, ip link add vrf-blue type vrf table 10
+        ip link set dev vrf-blue up
+
+2. Rules are added that send lookups to the associated FIB table when the
+   iif or oif is the VRF device. e.g.,
+       ip ru add oif vrf-blue table 10
+       ip ru add iif vrf-blue table 10
+
+   Set the default route for the table (and hence default route for the VRF).
+   e.g, ip route add table 10 prohibit default
+
+3. Enslave L3 interfaces to a VRF device.
+   e.g,  ip link set dev eth1 master vrf-blue
+
+   Local and connected routes for enslaved devices are automatically moved to
+   the table associated with VRF device. Any additional routes depending on
+   the enslaved device will need to be reinserted following the enslavement.
+
+4. Additional VRF routes are added to associated table.
+   e.g., ip route add table 10 ...
+
+
+Applications
+------------
+Applications that are to work within a VRF need to bind their socket to the
+VRF device:
+
+    setsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, dev, strlen(dev)+1);
+
+or to specify the output device using cmsg and IP_PKTINFO.
+
+
+Limitations
+-----------
+VRF device currently only works for IPv4. Support for IPv6 is under development.
+
+Index of original ingress interface is not available via cmsg. Will address
+soon.
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 6294b5186ae5..809ab6efcc74 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -54,13 +54,15 @@ default_qdisc
 --------------
 
 The default queuing discipline to use for network devices. This allows
-overriding the default queue discipline of pfifo_fast with an
-alternative. Since the default queuing discipline is created with the
-no additional parameters so is best suited to queuing disciplines that
-work well without configuration like stochastic fair queue (sfq),
-CoDel (codel) or fair queue CoDel (fq_codel). Don't use queuing disciplines
-like Hierarchical Token Bucket or Deficit Round Robin which require setting
-up classes and bandwidths.
+overriding the default of pfifo_fast with an alternative. Since the default
+queuing discipline is created without additional parameters so is best suited
+to queuing disciplines that work well without configuration like stochastic
+fair queue (sfq), CoDel (codel) or fair queue CoDel (fq_codel). Don't use
+queuing disciplines like Hierarchical Token Bucket or Deficit Round Robin
+which require setting up classes and bandwidths. Note that physical multiqueue
+interfaces still use mq as root qdisc, which in turn uses this default for its
+leaves. Virtual devices (like e.g. lo or veth) ignore this setting and instead
+default to noqueue.
 Default: pfifo_fast
 
 busy_read
diff --git a/MAINTAINERS b/MAINTAINERS
index 310da4295c70..d4d9d4f6d271 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11243,6 +11243,7 @@ L:	[email protected]
 S:	Maintained
 F:	drivers/net/vrf.c
 F:	include/net/vrf.h
+F:	Documentation/networking/vrf.txt
 
 VT1211 HARDWARE MONITOR DRIVER
 M:	Juerg Haefliger <[email protected]>
diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
index 74e18b0a6d89..3d7fb6516f74 100644
--- a/drivers/atm/solos-pci.c
+++ b/drivers/atm/solos-pci.c
@@ -805,7 +805,12 @@ static void solos_bh(unsigned long card_arg)
 					continue;
 				}
 
-				skb = alloc_skb(size + 1, GFP_ATOMIC);
+				/* Use netdev_alloc_skb() because it adds NET_SKB_PAD of
+				 * headroom, and ensures we can route packets back out an
+				 * Ethernet interface (for example) without having to
+				 * reallocate. Adding NET_IP_ALIGN also ensures that both
+				 * PPPoATM and PPPoEoBR2684 packets end up aligned. */
+				skb = netdev_alloc_skb_ip_align(NULL, size + 1);
 				if (!skb) {
 					if (net_ratelimit())
 						dev_warn(&card->dev->dev, "Failed to allocate sk_buff for RX\n");
@@ -869,7 +874,10 @@ static void solos_bh(unsigned long card_arg)
 		/* Allocate RX skbs for any ports which need them */
 		if (card->using_dma && card->atmdev[port] &&
 		    !card->rx_skb[port]) {
-			struct sk_buff *skb = alloc_skb(RX_DMA_SIZE, GFP_ATOMIC);
+			/* Unlike the MMIO case (qv) we can't add NET_IP_ALIGN
+			 * here; the FPGA can only DMA to addresses which are
+			 * aligned to 4 bytes. */
+			struct sk_buff *skb = dev_alloc_skb(RX_DMA_SIZE);
 			if (skb) {
 				SKB_CB(skb)->dma_addr =
 					dma_map_single(&card->dev->dev, skb->data,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
index 8353a6cbfcc2..03ed00c49823 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
@@ -157,6 +157,11 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN
 	CH_PCI_ID_TABLE_FENTRY(0x5090),	/* Custom T540-CR */
 	CH_PCI_ID_TABLE_FENTRY(0x5091),	/* Custom T522-CR */
 	CH_PCI_ID_TABLE_FENTRY(0x5092),	/* Custom T520-CR */
+	CH_PCI_ID_TABLE_FENTRY(0x5093),	/* Custom T580-LP-CR */
+	CH_PCI_ID_TABLE_FENTRY(0x5094),	/* Custom T540-CR */
+	CH_PCI_ID_TABLE_FENTRY(0x5095),	/* Custom T540-CR-SO */
+	CH_PCI_ID_TABLE_FENTRY(0x5096),	/* Custom T580-CR */
+	CH_PCI_ID_TABLE_FENTRY(0x5097),	/* Custom T520-KR */
 
 	/* T6 adapters:
 	 */
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 4402a1e48c9b..0ce6ffe73ca8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1268,8 +1268,6 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
 		rss_context->hash_fn = MLX4_RSS_HASH_TOP;
 		memcpy(rss_context->rss_key, priv->rss_key,
 		       MLX4_EN_RSS_KEY_SIZE);
-		netdev_rss_key_fill(rss_context->rss_key,
-				    MLX4_EN_RSS_KEY_SIZE);
 	} else {
 		en_err(priv, "Unknown RSS hash function requested\n");
 		err = -EINVAL;
diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c
index 66d4ab703f45..60f43ec22175 100644
--- a/drivers/net/ethernet/micrel/ks8851.c
+++ b/drivers/net/ethernet/micrel/ks8851.c
@@ -1601,6 +1601,7 @@ static const struct of_device_id ks8851_match_table[] = {
 	{ .compatible = "micrel,ks8851" },
 	{ }
 };
+MODULE_DEVICE_TABLE(of, ks8851_match_table);
 
 static struct spi_driver ks8851_driver = {
 	.driver = {
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index e7094fbd7568..488c6f50df73 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -193,7 +193,8 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
 		.flowi4_oif = vrf_dev->ifindex,
 		.flowi4_iif = LOOPBACK_IFINDEX,
 		.flowi4_tos = RT_TOS(ip4h->tos),
-		.flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_VRFSRC,
+		.flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_VRFSRC |
+				FLOWI_FLAG_SKIP_NH_OIF,
 		.daddr = ip4h->daddr,
 	};
 
diff --git a/include/net/flow.h b/include/net/flow.h
index acd6a096250e..9b85db85f13c 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -35,6 +35,7 @@ struct flowi_common {
 #define FLOWI_FLAG_ANYSRC		0x01
 #define FLOWI_FLAG_KNOWN_NH		0x02
 #define FLOWI_FLAG_VRFSRC		0x04
+#define FLOWI_FLAG_SKIP_NH_OIF		0x08
 	__u32	flowic_secid;
 	struct flowi_tunnel flowic_tun_key;
 };
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 063d30474cf6..aaf9700fc9e5 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -275,7 +275,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 	     struct nl_info *info, struct mx6_config *mxc);
 int fib6_del(struct rt6_info *rt, struct nl_info *info);
 
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info);
+void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
+		     unsigned int flags);
 
 void fib6_run_gc(unsigned long expires, struct net *net, bool force);
 
diff --git a/include/net/route.h b/include/net/route.h
index cc61cb95f059..f46af256880c 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -255,7 +255,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
 		flow_flags |= FLOWI_FLAG_ANYSRC;
 
 	if (netif_index_is_vrf(sock_net(sk), oif))
-		flow_flags |= FLOWI_FLAG_VRFSRC;
+		flow_flags |= FLOWI_FLAG_VRFSRC | FLOWI_FLAG_SKIP_NH_OIF;
 
 	flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
 			   protocol, flow_flags, dst, src, dport, sport);
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 17e55dfecbe2..e07f551a863c 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -317,6 +317,9 @@ static int clip_constructor(struct neighbour *neigh)
 
 static int clip_encap(struct atm_vcc *vcc, int mode)
 {
+	if (!CLIP_VCC(vcc))
+		return -EBADFD;
+
 	CLIP_VCC(vcc)->encap = mode;
 	return 0;
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 26d6ffb6d23c..6c2af797f2f9 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1426,7 +1426,7 @@ found:
 			    nh->nh_flags & RTNH_F_LINKDOWN &&
 			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
 				continue;
-			if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) {
+			if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
 				if (flp->flowi4_oif &&
 				    flp->flowi4_oif != nh->nh_oif)
 					continue;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c0a15e7f359f..f7d1d5e19e95 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1024,7 +1024,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		if (netif_index_is_vrf(net, ipc.oif)) {
 			flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
 					   RT_SCOPE_UNIVERSE, sk->sk_protocol,
-					   (flow_flags | FLOWI_FLAG_VRFSRC),
+					   (flow_flags | FLOWI_FLAG_VRFSRC |
+					    FLOWI_FLAG_SKIP_NH_OIF),
 					   faddr, saddr, dport,
 					   inet->inet_sport);
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index bb919b28619f..c10a9ee68433 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -33,6 +33,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
 	if (saddr)
 		fl4->saddr = saddr->a4;
 
+	fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF;
+
 	rt = __ip_route_output_key(net, fl4);
 	if (!IS_ERR(rt))
 		return &rt->dst;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 8a9ec01f4d01..7d2e0023c72d 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -851,7 +851,7 @@ add:
 		*ins = rt;
 		rt->rt6i_node = fn;
 		atomic_inc(&rt->rt6i_ref);
-		inet6_rt_notify(RTM_NEWROUTE, rt, info);
+		inet6_rt_notify(RTM_NEWROUTE, rt, info, 0);
 		info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
 
 		if (!(fn->fn_flags & RTN_RTINFO)) {
@@ -877,7 +877,7 @@ add:
 		rt->rt6i_node = fn;
 		rt->dst.rt6_next = iter->dst.rt6_next;
 		atomic_inc(&rt->rt6i_ref);
-		inet6_rt_notify(RTM_NEWROUTE, rt, info);
+		inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
 		if (!(fn->fn_flags & RTN_RTINFO)) {
 			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
 			fn->fn_flags |= RTN_RTINFO;
@@ -1422,7 +1422,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 
 	fib6_purge_rt(rt, fn, net);
 
-	inet6_rt_notify(RTM_DELROUTE, rt, info);
+	inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
 	rt6_release(rt);
 }
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 26ea47930740..92b1aa38f121 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -586,20 +586,22 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 				    &ipv6_hdr(skb)->saddr);
 
+	hroom = LL_RESERVED_SPACE(rt->dst.dev);
 	if (skb_has_frag_list(skb)) {
 		int first_len = skb_pagelen(skb);
 		struct sk_buff *frag2;
 
 		if (first_len - hlen > mtu ||
 		    ((first_len - hlen) & 7) ||
-		    skb_cloned(skb))
+		    skb_cloned(skb) ||
+		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 			goto slow_path;
 
 		skb_walk_frags(skb, frag) {
 			/* Correct geometry. */
 			if (frag->len > mtu ||
 			    ((frag->len & 7) && frag->next) ||
-			    skb_headroom(frag) < hlen)
+			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 				goto slow_path_clean;
 
 			/* Partially cloned skb? */
@@ -616,8 +618,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 
 		err = 0;
 		offset = 0;
-		frag = skb_shinfo(skb)->frag_list;
-		skb_frag_list_init(skb);
 		/* BUILD HEADER */
 
 		*prevhdr = NEXTHDR_FRAGMENT;
@@ -625,8 +625,11 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 		if (!tmp_hdr) {
 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 				      IPSTATS_MIB_FRAGFAILS);
-			return -ENOMEM;
+			err = -ENOMEM;
+			goto fail;
 		}
+		frag = skb_shinfo(skb)->frag_list;
+		skb_frag_list_init(skb);
 
 		__skb_pull(skb, hlen);
 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
@@ -723,7 +726,6 @@ slow_path:
 	 */
 
 	*prevhdr = NEXTHDR_FRAGMENT;
-	hroom = LL_RESERVED_SPACE(rt->dst.dev);
 	troom = rt->dst.dev->needed_tailroom;
 
 	/*
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3d3c1b294725..d5fa50297f80 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3304,7 +3304,8 @@ errout:
 	return err;
 }
 
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
+void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
+		     unsigned int nlm_flags)
 {
 	struct sk_buff *skb;
 	struct net *net = info->nl_net;
@@ -3319,7 +3320,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
 		goto errout;
 
 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
-				event, info->portid, seq, 0, 0, 0);
+				event, info->portid, seq, 0, 0, nlm_flags);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index e8e524ad8a01..002a755fa07e 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -275,13 +275,15 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 	case NFPROTO_IPV6: {
 		u8 nexthdr = ipv6_hdr(skb)->nexthdr;
 		__be16 frag_off;
+		int ofs;
 
-		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
-					   &nexthdr, &frag_off);
-		if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+		ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+				       &frag_off);
+		if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
 			pr_debug("proto header not found\n");
 			return NF_ACCEPT;
 		}
+		protoff = ofs;
 		break;
 	}
 	default: