From 499a24256862714539e902c0499b67da2bb3ab72 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:46 +0200 Subject: lwtunnel: infrastructure for handling light weight tunnels like mpls Provides infrastructure to parse/dump/store encap information for light weight tunnels like mpls. Encap information for such tunnels is associated with fib routes. This infrastructure is based on previous suggestions from Eric Biederman to follow the xfrm infrastructure. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/lwtunnel.h | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 include/linux/lwtunnel.h (limited to 'include/linux') diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h new file mode 100644 index 000000000000..97f32f8b4ae1 --- /dev/null +++ b/include/linux/lwtunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_LWTUNNEL_H_ +#define _LINUX_LWTUNNEL_H_ + +#include + +#endif /* _LINUX_LWTUNNEL_H_ */ -- cgit From e3e4712ec0961ed586a8db340bd994c4ad7f5dba Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:53 +0200 Subject: mpls: ip tunnel support This implementation uses lwtunnel infrastructure to register hooks for mpls tunnel encaps. It picks cues from iptunnel_encaps infrastructure and previous mpls iptunnel RFC patches from Eric W. Biederman and Robert Shearman Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/mpls_iptunnel.h | 6 + include/net/mpls_iptunnel.h | 29 +++++ include/uapi/linux/mpls_iptunnel.h | 28 +++++ net/mpls/Kconfig | 8 +- net/mpls/Makefile | 1 + net/mpls/mpls_iptunnel.c | 233 +++++++++++++++++++++++++++++++++++++ 6 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 include/linux/mpls_iptunnel.h create mode 100644 include/net/mpls_iptunnel.h create mode 100644 include/uapi/linux/mpls_iptunnel.h create mode 100644 net/mpls/mpls_iptunnel.c (limited to 'include/linux') diff --git a/include/linux/mpls_iptunnel.h b/include/linux/mpls_iptunnel.h new file mode 100644 index 000000000000..ef29eb2d6dfd --- /dev/null +++ b/include/linux/mpls_iptunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_MPLS_IPTUNNEL_H +#define _LINUX_MPLS_IPTUNNEL_H + +#include + +#endif /* _LINUX_MPLS_IPTUNNEL_H */ diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h new file mode 100644 index 000000000000..4757997f76ed --- /dev/null +++ b/include/net/mpls_iptunnel.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2015 Cumulus Networks, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _NET_MPLS_IPTUNNEL_H +#define _NET_MPLS_IPTUNNEL_H 1 + +#define MAX_NEW_LABELS 2 + +struct mpls_iptunnel_encap { + u32 label[MAX_NEW_LABELS]; + u32 labels; +}; + +static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate) +{ + return (struct mpls_iptunnel_encap *)lwtstate->data; +} + +#endif diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h new file mode 100644 index 000000000000..d80a0498f77e --- /dev/null +++ b/include/uapi/linux/mpls_iptunnel.h @@ -0,0 +1,28 @@ +/* + * mpls tunnel api + * + * Authors: + * Roopa Prabhu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_MPLS_IPTUNNEL_H +#define _UAPI_LINUX_MPLS_IPTUNNEL_H + +/* MPLS tunnel attributes + * [RTA_ENCAP] = { + * [MPLS_IPTUNNEL_DST] + * } + */ +enum { + MPLS_IPTUNNEL_UNSPEC, + MPLS_IPTUNNEL_DST, + __MPLS_IPTUNNEL_MAX, +}; +#define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1) + +#endif /* _UAPI_LINUX_MPLS_IPTUNNEL_H */ diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 17bde799c854..5c467ef97311 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -24,7 +24,13 @@ config NET_MPLS_GSO config MPLS_ROUTING tristate "MPLS: routing support" - help + ---help--- Add support for forwarding of mpls packets. +config MPLS_IPTUNNEL + tristate "MPLS: IP over MPLS tunnel support" + depends on LWTUNNEL && MPLS_ROUTING + ---help--- + mpls ip tunnel support. + endif # MPLS diff --git a/net/mpls/Makefile b/net/mpls/Makefile index 65bbe68c72e6..9ca923625016 100644 --- a/net/mpls/Makefile +++ b/net/mpls/Makefile @@ -3,5 +3,6 @@ # obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o +obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o mpls_router-y := af_mpls.o diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c new file mode 100644 index 000000000000..eea096f21ba5 --- /dev/null +++ b/net/mpls/mpls_iptunnel.c @@ -0,0 +1,233 @@ +/* + * mpls tunnels An implementation mpls tunnels using the light weight tunnel + * infrastructure + * + * Authors: Roopa Prabhu, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { + [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 }, +}; + +static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) +{ + /* The size of the layer 2.5 labels to be added for this route */ + return en->labels * sizeof(struct mpls_shim_hdr); +} + +int mpls_output(struct sock *sk, struct sk_buff *skb) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct mpls_shim_hdr *hdr; + struct net_device *out_dev; + unsigned int hh_len; + unsigned int new_header_size; + unsigned int mtu; + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = NULL; + struct rt6_info *rt6 = NULL; + struct lwtunnel_state *lwtstate = NULL; + int err = 0; + bool bos; + int i; + unsigned int ttl; + + /* Obtain the ttl */ + if (skb->protocol == htons(ETH_P_IP)) { + ttl = ip_hdr(skb)->ttl; + rt = (struct rtable *)dst; + lwtstate = rt->rt_lwtstate; + } else if (skb->protocol == htons(ETH_P_IPV6)) { + ttl = ipv6_hdr(skb)->hop_limit; + rt6 = (struct rt6_info *)dst; + lwtstate = rt6->rt6i_lwtstate; + } else { + goto drop; + } + + skb_orphan(skb); + + /* Find the output device */ + out_dev = rcu_dereference(dst->dev); + if (!mpls_output_possible(out_dev) || + !lwtstate || skb_warn_if_lro(skb)) + goto drop; + + skb_forward_csum(skb); + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + /* Verify the destination can hold the packet */ + new_header_size = mpls_encap_size(tun_encap_info); + mtu = mpls_dev_mtu(out_dev); + if (mpls_pkt_too_big(skb, mtu - new_header_size)) + goto drop; + + hh_len = LL_RESERVED_SPACE(out_dev); + if (!out_dev->header_ops) + hh_len = 0; + + /* Ensure there is enough space for the headers in the skb */ + if (skb_cow(skb, hh_len + new_header_size)) + goto drop; + + skb_push(skb, new_header_size); + skb_reset_network_header(skb); + + skb->dev = out_dev; + skb->protocol = htons(ETH_P_MPLS_UC); + + /* Push the new labels */ + hdr = mpls_hdr(skb); + bos = true; + for (i = tun_encap_info->labels - 1; i >= 0; i--) { + hdr[i] = mpls_entry_encode(tun_encap_info->label[i], + ttl, 0, bos); + bos = false; + } + + if (rt) + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, + skb); + else if (rt6) + err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, + skb); + if (err) + net_dbg_ratelimited("%s: packet transmission failed: %d\n", + __func__, err); + + return 0; + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int mpls_build_state(struct net_device *dev, struct nlattr *nla, + struct lwtunnel_state **ts) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; + struct lwtunnel_state *newts; + int tun_encap_info_len; + int ret; + + ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla, + mpls_iptunnel_policy); + if (ret < 0) + return ret; + + if (!tb[MPLS_IPTUNNEL_DST]) + return -EINVAL; + + tun_encap_info_len = sizeof(*tun_encap_info); + + newts = lwtunnel_state_alloc(tun_encap_info_len); + if (!newts) + return -ENOMEM; + + newts->len = tun_encap_info_len; + tun_encap_info = mpls_lwtunnel_encap(newts); + ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, + &tun_encap_info->labels, tun_encap_info->label); + if (ret) + goto errout; + newts->type = LWTUNNEL_ENCAP_MPLS; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = newts; + + return 0; + +errout: + kfree(newts); + *ts = NULL; + + return ret; +} + +static int mpls_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, + tun_encap_info->label)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + return nla_total_size(tun_encap_info->labels * 4); +} + +static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a); + struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); + int l; + + if (a_hdr->labels != b_hdr->labels) + return 1; + + for (l = 0; l < MAX_NEW_LABELS; l++) + if (a_hdr->label[l] != b_hdr->label[l]) + return 1; + return 0; +} + +static const struct lwtunnel_encap_ops mpls_iptun_ops = { + .build_state = mpls_build_state, + .output = mpls_output, + .fill_encap = mpls_fill_encap_info, + .get_encap_size = mpls_encap_nlsize, + .cmp_encap = mpls_encap_cmp, +}; + +static int __init mpls_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_init(mpls_iptunnel_init); + +static void __exit mpls_iptunnel_exit(void) +{ + lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_exit(mpls_iptunnel_exit); + +MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); +MODULE_LICENSE("GPL v2"); -- cgit From ee122c79d4227f6ec642157834b6a90fcffa4382 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:58 +0200 Subject: vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 149 ++++++++++++++++++++++++++++++++++++------- include/linux/skbuff.h | 1 + include/net/dst_metadata.h | 13 ++++ include/net/ip_tunnels.h | 14 ++++ include/net/vxlan.h | 10 ++- include/uapi/linux/if_link.h | 1 + 6 files changed, 165 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ec86a11743fd..06c092b05a51 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -49,6 +49,7 @@ #include #include #endif +#include #define VXLAN_VERSION "0.1" @@ -140,6 +141,11 @@ struct vxlan_dev { static u32 vxlan_salt __read_mostly; static struct workqueue_struct *vxlan_wq; +static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) +{ + return vs->flags & VXLAN_F_COLLECT_METADATA; +} + #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) @@ -1164,10 +1170,13 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { + struct metadata_dst *tun_dst = NULL; + struct ip_tunnel_info *info; struct vxlan_sock *vs; struct vxlanhdr *vxh; u32 flags, vni; - struct vxlan_metadata md = {0}; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; /* Need Vxlan and inner Ethernet header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) @@ -1202,6 +1211,33 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) vni &= VXLAN_VNI_MASK; } + if (vxlan_collect_metadata(vs)) { + const struct iphdr *iph = ip_hdr(skb); + + tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC); + if (!tun_dst) + goto drop; + + info = &tun_dst->u.tun_info; + info->key.ipv4_src = iph->saddr; + info->key.ipv4_dst = iph->daddr; + info->key.ipv4_tos = iph->tos; + info->key.ipv4_ttl = iph->ttl; + info->key.tp_src = udp_hdr(skb)->source; + info->key.tp_dst = udp_hdr(skb)->dest; + + info->mode = IP_TUNNEL_INFO_RX; + info->key.tun_flags = TUNNEL_KEY; + info->key.tun_id = cpu_to_be64(vni >> 8); + if (udp_hdr(skb)->check != 0) + info->key.tun_flags |= TUNNEL_CSUM; + + md = ip_tunnel_info_opts(info, sizeof(*md)); + md->tun_dst = tun_dst; + } else { + memset(md, 0, sizeof(*md)); + } + /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. */ @@ -1209,13 +1245,16 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct vxlanhdr_gbp *gbp; gbp = (struct vxlanhdr_gbp *)vxh; - md.gbp = ntohs(gbp->policy_id); + md->gbp = ntohs(gbp->policy_id); + + if (tun_dst) + info->key.tun_flags |= TUNNEL_VXLAN_OPT; if (gbp->dont_learn) - md.gbp |= VXLAN_GBP_DONT_LEARN; + md->gbp |= VXLAN_GBP_DONT_LEARN; if (gbp->policy_applied) - md.gbp |= VXLAN_GBP_POLICY_APPLIED; + md->gbp |= VXLAN_GBP_POLICY_APPLIED; flags &= ~VXLAN_GBP_USED_BITS; } @@ -1233,8 +1272,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto bad_flags; } - md.vni = vxh->vx_vni; - vs->rcv(vs, skb, &md); + md->vni = vxh->vx_vni; + vs->rcv(vs, skb, md); return 0; drop: @@ -1247,6 +1286,9 @@ bad_flags: ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); error: + if (tun_dst) + dst_release((struct dst_entry *)tun_dst); + /* Return non vxlan pkt */ return 1; } @@ -1263,7 +1305,12 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, int err = 0; union vxlan_addr *remote_ip; - vni = ntohl(md->vni) >> 8; + /* For flow based devices, map all packets to VNI 0 */ + if (vs->flags & VXLAN_F_FLOW_BASED) + vni = 0; + else + vni = ntohl(md->vni) >> 8; + /* Is this VNI defined? */ vxlan = vxlan_vs_find_vni(vs, vni); if (!vxlan) @@ -1292,12 +1339,19 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, #endif } + if (md->tun_dst) { + skb_dst_set(skb, (struct dst_entry *)md->tun_dst); + md->tun_dst = NULL; + } + if ((vxlan->flags & VXLAN_F_LEARN) && vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) goto drop; skb_reset_network_header(skb); - skb->mark = md->gbp; + /* In flow-based mode, GBP is carried in dst_metadata */ + if (!(vs->flags & VXLAN_F_FLOW_BASED)) + skb->mark = md->gbp; if (oip6) err = IP6_ECN_decapsulate(oip6, skb); @@ -1330,6 +1384,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, return; drop: + if (md->tun_dst) + dst_release((struct dst_entry *)md->tun_dst); + /* Consume bad packet */ kfree_skb(skb); } @@ -1878,22 +1935,40 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { + struct ip_tunnel_info *info = skb_tunnel_info(skb); struct vxlan_dev *vxlan = netdev_priv(dev); struct sock *sk = vxlan->vn_sock->sock->sk; struct rtable *rt = NULL; const struct iphdr *old_iph; struct flowi4 fl4; union vxlan_addr *dst; - struct vxlan_metadata md; + union vxlan_addr remote_ip; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; __be16 src_port = 0, dst_port; u32 vni; __be16 df = 0; __u8 tos, ttl; int err; + u32 flags = vxlan->flags; - dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; - vni = rdst->remote_vni; - dst = &rdst->remote_ip; + if (rdst) { + dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; + vni = rdst->remote_vni; + dst = &rdst->remote_ip; + } else { + if (!info) { + WARN_ONCE(1, "%s: Missing encapsulation instructions\n", + dev->name); + goto drop; + } + + dst_port = info->key.tp_dst ? : vxlan->dst_port; + vni = be64_to_cpu(info->key.tun_id); + remote_ip.sin.sin_family = AF_INET; + remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst; + dst = &remote_ip; + } if (vxlan_addr_any(dst)) { if (did_rsc) { @@ -1918,8 +1993,25 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vxlan->port_max, true); if (dst->sa.sa_family == AF_INET) { + if (info) { + if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + if (info->key.tun_flags & TUNNEL_CSUM) + flags |= VXLAN_F_UDP_CSUM; + else + flags &= ~VXLAN_F_UDP_CSUM; + + ttl = info->key.ipv4_ttl; + tos = info->key.ipv4_tos; + + if (info->options_len) + md = ip_tunnel_info_opts(info, sizeof(*md)); + } else { + md->gbp = skb->mark; + } + memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = rdst->remote_ifindex; + fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0; fl4.flowi4_tos = RT_TOS(tos); fl4.flowi4_mark = skb->mark; fl4.flowi4_proto = IPPROTO_UDP; @@ -1958,14 +2050,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - md.vni = htonl(vni << 8); - md.gbp = skb->mark; - + md->vni = htonl(vni << 8); err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr, dst->sin.sin_addr.s_addr, tos, ttl, df, - src_port, dst_port, &md, + src_port, dst_port, md, !net_eq(vxlan->net, dev_net(vxlan->dev)), - vxlan->flags); + flags); if (err < 0) { /* skb is already freed. */ skb = NULL; @@ -1980,7 +2070,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, u32 flags; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = rdst->remote_ifindex; + fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0; fl6.daddr = dst->sin6.sin6_addr; fl6.saddr = vxlan->saddr.sin6.sin6_addr; fl6.flowi6_mark = skb->mark; @@ -2018,11 +2108,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ttl = ttl ? : ip6_dst_hoplimit(ndst); - md.vni = htonl(vni << 8); - md.gbp = skb->mark; + md->vni = htonl(vni << 8); + md->gbp = skb->mark; err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr, - 0, ttl, src_port, dst_port, &md, + 0, ttl, src_port, dst_port, md, !net_eq(vxlan->net, dev_net(vxlan->dev)), vxlan->flags); #endif @@ -2051,6 +2141,7 @@ tx_free: static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); + const struct ip_tunnel_info *info = skb_tunnel_info(skb); struct ethhdr *eth; bool did_rsc = false; struct vxlan_rdst *rdst, *fdst = NULL; @@ -2078,6 +2169,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) #endif } + if (vxlan->flags & VXLAN_F_FLOW_BASED && + info && info->mode == IP_TUNNEL_INFO_TX) { + vxlan_xmit_one(skb, dev, NULL, false); + return NETDEV_TX_OK; + } + f = vxlan_find_mac(vxlan, eth->h_dest); did_rsc = false; @@ -2405,6 +2502,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, + [IFLA_VXLAN_FLOWBASED] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, @@ -2681,6 +2779,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, if (data[IFLA_VXLAN_LIMIT]) vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); + if (data[IFLA_VXLAN_FLOWBASED] && + nla_get_u8(data[IFLA_VXLAN_FLOWBASED])) + vxlan->flags |= VXLAN_F_FLOW_BASED; + if (data[IFLA_VXLAN_PORT_RANGE]) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); @@ -2777,6 +2879,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_FLOWBASED */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ nla_total_size(sizeof(struct ifla_vxlan_port_range)) + @@ -2843,6 +2946,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(vxlan->flags & VXLAN_F_L2MISS)) || nla_put_u8(skb, IFLA_VXLAN_L3MISS, !!(vxlan->flags & VXLAN_F_L3MISS)) || + nla_put_u8(skb, IFLA_VXLAN_FLOWBASED, + !!(vxlan->flags & VXLAN_F_FLOW_BASED)) || nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) || nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) || nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) || diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6bd96fe9416a..648a2c241993 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3469,5 +3469,6 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) skb_network_header(skb); return hdr_len + skb_gso_transport_seglen(skb); } + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 4f7694f3c7d0..e843937fb30a 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -8,6 +8,9 @@ struct metadata_dst { struct dst_entry dst; size_t opts_len; + union { + struct ip_tunnel_info tun_info; + } u; }; static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) @@ -20,6 +23,16 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + + if (md_dst) + return &md_dst->u.tun_info; + + return NULL; +} + static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6b9d559ce5f5..d11530f1c1e2 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -38,10 +38,19 @@ struct ip_tunnel_key { __be16 tp_dst; } __packed __aligned(4); /* Minimize padding. */ +/* Indicates whether the tunnel info structure represents receive + * or transmit tunnel parameters. + */ +enum { + IP_TUNNEL_INFO_RX, + IP_TUNNEL_INFO_TX, +}; + struct ip_tunnel_info { struct ip_tunnel_key key; const void *options; u8 options_len; + u8 mode; }; /* 6rd prefix/relay information */ @@ -284,6 +293,11 @@ static inline void iptunnel_xmit_stats(int err, } } +static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n) +{ + return info + 1; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 0082b5d33d7d..80a2da29e088 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -7,6 +7,7 @@ #include #include #include +#include #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1<