diff options
-rw-r--r-- | Documentation/networking/vrf.txt | 96 | ||||
-rw-r--r-- | Documentation/sysctl/net.txt | 16 | ||||
-rw-r--r-- | MAINTAINERS | 1 | ||||
-rw-r--r-- | drivers/atm/solos-pci.c | 12 | ||||
-rw-r--r-- | drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h | 5 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/micrel/ks8851.c | 1 | ||||
-rw-r--r-- | drivers/net/vrf.c | 3 | ||||
-rw-r--r-- | include/net/flow.h | 1 | ||||
-rw-r--r-- | include/net/ip6_fib.h | 3 | ||||
-rw-r--r-- | include/net/route.h | 2 | ||||
-rw-r--r-- | net/atm/clip.c | 3 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 2 | ||||
-rw-r--r-- | net/ipv4/udp.c | 3 | ||||
-rw-r--r-- | net/ipv4/xfrm4_policy.c | 2 | ||||
-rw-r--r-- | net/ipv6/ip6_fib.c | 6 | ||||
-rw-r--r-- | net/ipv6/ip6_output.c | 14 | ||||
-rw-r--r-- | net/ipv6/route.c | 5 | ||||
-rw-r--r-- | net/openvswitch/conntrack.c | 8 |
19 files changed, 155 insertions, 30 deletions
diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt new file mode 100644 index 000000000000..031ef4a63485 --- /dev/null +++ b/Documentation/networking/vrf.txt @@ -0,0 +1,96 @@ +Virtual Routing and Forwarding (VRF) +==================================== +The VRF device combined with ip rules provides the ability to create virtual +routing and forwarding domains (aka VRFs, VRF-lite to be specific) in the +Linux network stack. One use case is the multi-tenancy problem where each +tenant has their own unique routing tables and in the very least need +different default gateways. + +Processes can be "VRF aware" by binding a socket to the VRF device. Packets +through the socket then use the routing table associated with the VRF +device. An important feature of the VRF device implementation is that it +impacts only Layer 3 and above so L2 tools (e.g., LLDP) are not affected +(ie., they do not need to be run in each VRF). The design also allows +the use of higher priority ip rules (Policy Based Routing, PBR) to take +precedence over the VRF device rules directing specific traffic as desired. + +In addition, VRF devices allow VRFs to be nested within namespaces. For +example network namespaces provide separation of network interfaces at L1 +(Layer 1 separation), VLANs on the interfaces within a namespace provide +L2 separation and then VRF devices provide L3 separation. + +Design +------ +A VRF device is created with an associated route table. Network interfaces +are then enslaved to a VRF device: + + +-----------------------------+ + | vrf-blue | ===> route table 10 + +-----------------------------+ + | | | + +------+ +------+ +-------------+ + | eth1 | | eth2 | ... | bond1 | + +------+ +------+ +-------------+ + | | + +------+ +------+ + | eth8 | | eth9 | + +------+ +------+ + +Packets received on an enslaved device and are switched to the VRF device +using an rx_handler which gives the impression that packets flow through +the VRF device. Similarly on egress routing rules are used to send packets +to the VRF device driver before getting sent out the actual interface. This +allows tcpdump on a VRF device to capture all packets into and out of the +VRF as a whole.[1] Similiarly, netfilter [2] and tc rules can be applied +using the VRF device to specify rules that apply to the VRF domain as a whole. + +[1] Packets in the forwarded state do not flow through the device, so those + packets are not seen by tcpdump. Will revisit this limitation in a + future release. + +[2] Iptables on ingress is limited to NF_INET_PRE_ROUTING only with skb->dev + set to real ingress device and egress is limited to NF_INET_POST_ROUTING. + Will revisit this limitation in a future release. + + +Setup +----- +1. VRF device is created with an association to a FIB table. + e.g, ip link add vrf-blue type vrf table 10 + ip link set dev vrf-blue up + +2. Rules are added that send lookups to the associated FIB table when the + iif or oif is the VRF device. e.g., + ip ru add oif vrf-blue table 10 + ip ru add iif vrf-blue table 10 + + Set the default route for the table (and hence default route for the VRF). + e.g, ip route add table 10 prohibit default + +3. Enslave L3 interfaces to a VRF device. + e.g, ip link set dev eth1 master vrf-blue + + Local and connected routes for enslaved devices are automatically moved to + the table associated with VRF device. Any additional routes depending on + the enslaved device will need to be reinserted following the enslavement. + +4. Additional VRF routes are added to associated table. + e.g., ip route add table 10 ... + + +Applications +------------ +Applications that are to work within a VRF need to bind their socket to the +VRF device: + + setsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, dev, strlen(dev)+1); + +or to specify the output device using cmsg and IP_PKTINFO. + + +Limitations +----------- +VRF device currently only works for IPv4. Support for IPv6 is under development. + +Index of original ingress interface is not available via cmsg. Will address +soon. diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 6294b5186ae5..809ab6efcc74 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -54,13 +54,15 @@ default_qdisc -------------- The default queuing discipline to use for network devices. This allows -overriding the default queue discipline of pfifo_fast with an -alternative. Since the default queuing discipline is created with the -no additional parameters so is best suited to queuing disciplines that -work well without configuration like stochastic fair queue (sfq), -CoDel (codel) or fair queue CoDel (fq_codel). Don't use queuing disciplines -like Hierarchical Token Bucket or Deficit Round Robin which require setting -up classes and bandwidths. +overriding the default of pfifo_fast with an alternative. Since the default +queuing discipline is created without additional parameters so is best suited +to queuing disciplines that work well without configuration like stochastic +fair queue (sfq), CoDel (codel) or fair queue CoDel (fq_codel). Don't use +queuing disciplines like Hierarchical Token Bucket or Deficit Round Robin +which require setting up classes and bandwidths. Note that physical multiqueue +interfaces still use mq as root qdisc, which in turn uses this default for its +leaves. Virtual devices (like e.g. lo or veth) ignore this setting and instead +default to noqueue. Default: pfifo_fast busy_read diff --git a/MAINTAINERS b/MAINTAINERS index 310da4295c70..d4d9d4f6d271 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11243,6 +11243,7 @@ L: [email protected] S: Maintained F: drivers/net/vrf.c F: include/net/vrf.h +F: Documentation/networking/vrf.txt VT1211 HARDWARE MONITOR DRIVER M: Juerg Haefliger <[email protected]> diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c index 74e18b0a6d89..3d7fb6516f74 100644 --- a/drivers/atm/solos-pci.c +++ b/drivers/atm/solos-pci.c @@ -805,7 +805,12 @@ static void solos_bh(unsigned long card_arg) continue; } - skb = alloc_skb(size + 1, GFP_ATOMIC); + /* Use netdev_alloc_skb() because it adds NET_SKB_PAD of + * headroom, and ensures we can route packets back out an + * Ethernet interface (for example) without having to + * reallocate. Adding NET_IP_ALIGN also ensures that both + * PPPoATM and PPPoEoBR2684 packets end up aligned. */ + skb = netdev_alloc_skb_ip_align(NULL, size + 1); if (!skb) { if (net_ratelimit()) dev_warn(&card->dev->dev, "Failed to allocate sk_buff for RX\n"); @@ -869,7 +874,10 @@ static void solos_bh(unsigned long card_arg) /* Allocate RX skbs for any ports which need them */ if (card->using_dma && card->atmdev[port] && !card->rx_skb[port]) { - struct sk_buff *skb = alloc_skb(RX_DMA_SIZE, GFP_ATOMIC); + /* Unlike the MMIO case (qv) we can't add NET_IP_ALIGN + * here; the FPGA can only DMA to addresses which are + * aligned to 4 bytes. */ + struct sk_buff *skb = dev_alloc_skb(RX_DMA_SIZE); if (skb) { SKB_CB(skb)->dma_addr = dma_map_single(&card->dev->dev, skb->data, diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h index 8353a6cbfcc2..03ed00c49823 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h @@ -157,6 +157,11 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN CH_PCI_ID_TABLE_FENTRY(0x5090), /* Custom T540-CR */ CH_PCI_ID_TABLE_FENTRY(0x5091), /* Custom T522-CR */ CH_PCI_ID_TABLE_FENTRY(0x5092), /* Custom T520-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5093), /* Custom T580-LP-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5094), /* Custom T540-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5095), /* Custom T540-CR-SO */ + CH_PCI_ID_TABLE_FENTRY(0x5096), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x5097), /* Custom T520-KR */ /* T6 adapters: */ diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 4402a1e48c9b..0ce6ffe73ca8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -1268,8 +1268,6 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) rss_context->hash_fn = MLX4_RSS_HASH_TOP; memcpy(rss_context->rss_key, priv->rss_key, MLX4_EN_RSS_KEY_SIZE); - netdev_rss_key_fill(rss_context->rss_key, - MLX4_EN_RSS_KEY_SIZE); } else { en_err(priv, "Unknown RSS hash function requested\n"); err = -EINVAL; diff --git a/drivers/net/ethernet/micrel/ks8851.c b/drivers/net/ethernet/micrel/ks8851.c index 66d4ab703f45..60f43ec22175 100644 --- a/drivers/net/ethernet/micrel/ks8851.c +++ b/drivers/net/ethernet/micrel/ks8851.c @@ -1601,6 +1601,7 @@ static const struct of_device_id ks8851_match_table[] = { { .compatible = "micrel,ks8851" }, { } }; +MODULE_DEVICE_TABLE(of, ks8851_match_table); static struct spi_driver ks8851_driver = { .driver = { diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index e7094fbd7568..488c6f50df73 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -193,7 +193,8 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, .flowi4_oif = vrf_dev->ifindex, .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_VRFSRC, + .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_VRFSRC | + FLOWI_FLAG_SKIP_NH_OIF, .daddr = ip4h->daddr, }; diff --git a/include/net/flow.h b/include/net/flow.h index acd6a096250e..9b85db85f13c 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -35,6 +35,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_VRFSRC 0x04 +#define FLOWI_FLAG_SKIP_NH_OIF 0x08 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; }; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 063d30474cf6..aaf9700fc9e5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -275,7 +275,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc); int fib6_del(struct rt6_info *rt, struct nl_info *info); -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info); +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, + unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); diff --git a/include/net/route.h b/include/net/route.h index cc61cb95f059..f46af256880c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -255,7 +255,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 flow_flags |= FLOWI_FLAG_ANYSRC; if (netif_index_is_vrf(sock_net(sk), oif)) - flow_flags |= FLOWI_FLAG_VRFSRC; + flow_flags |= FLOWI_FLAG_VRFSRC | FLOWI_FLAG_SKIP_NH_OIF; flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, protocol, flow_flags, dst, src, dport, sport); diff --git a/net/atm/clip.c b/net/atm/clip.c index 17e55dfecbe2..e07f551a863c 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -317,6 +317,9 @@ static int clip_constructor(struct neighbour *neigh) static int clip_encap(struct atm_vcc *vcc, int mode) { + if (!CLIP_VCC(vcc)) + return -EBADFD; + CLIP_VCC(vcc)->encap = mode; return 0; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 26d6ffb6d23c..6c2af797f2f9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1426,7 +1426,7 @@ found: nh->nh_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; - if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) { + if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) continue; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c0a15e7f359f..f7d1d5e19e95 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1024,7 +1024,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (netif_index_is_vrf(net, ipc.oif)) { flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - (flow_flags | FLOWI_FLAG_VRFSRC), + (flow_flags | FLOWI_FLAG_VRFSRC | + FLOWI_FLAG_SKIP_NH_OIF), faddr, saddr, dport, inet->inet_sport); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index bb919b28619f..c10a9ee68433 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -33,6 +33,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, if (saddr) fl4->saddr = saddr->a4; + fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF; + rt = __ip_route_output_key(net, fl4); if (!IS_ERR(rt)) return &rt->dst; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 8a9ec01f4d01..7d2e0023c72d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -851,7 +851,7 @@ add: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { @@ -877,7 +877,7 @@ add: rt->rt6i_node = fn; rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; @@ -1422,7 +1422,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, fib6_purge_rt(rt, fn, net); - inet6_rt_notify(RTM_DELROUTE, rt, info); + inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 26ea47930740..92b1aa38f121 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -586,20 +586,22 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr); + hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); struct sk_buff *frag2; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || - skb_cloned(skb)) + skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || - skb_headroom(frag) < hlen) + skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path_clean; /* Partially cloned skb? */ @@ -616,8 +618,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, err = 0; offset = 0; - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); /* BUILD HEADER */ *prevhdr = NEXTHDR_FRAGMENT; @@ -625,8 +625,11 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, if (!tmp_hdr) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); - return -ENOMEM; + err = -ENOMEM; + goto fail; } + frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); __skb_pull(skb, hlen); fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); @@ -723,7 +726,6 @@ slow_path: */ *prevhdr = NEXTHDR_FRAGMENT; - hroom = LL_RESERVED_SPACE(rt->dst.dev); troom = rt->dst.dev->needed_tailroom; /* diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3d3c1b294725..d5fa50297f80 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3304,7 +3304,8 @@ errout: return err; } -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, + unsigned int nlm_flags) { struct sk_buff *skb; struct net *net = info->nl_net; @@ -3319,7 +3320,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, 0, 0, 0); + event, info->portid, seq, 0, 0, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e8e524ad8a01..002a755fa07e 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -275,13 +275,15 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto) case NFPROTO_IPV6: { u8 nexthdr = ipv6_hdr(skb)->nexthdr; __be16 frag_off; + int ofs; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("proto header not found\n"); return NF_ACCEPT; } + protoff = ofs; break; } default: |