about summary refs log tree commit diff
path: root/drivers/net/vrf.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/vrf.c')
-rw-r--r-- drivers/net/vrf.c 277
1 files changed, 226 insertions, 51 deletions
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 454f907d419a..8a1eaf3c302a 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -36,12 +36,14 @@
#include <net/addrconf.h>
#include <net/l3mdev.h>
#include <net/fib_rules.h>
+#include <net/netns/generic.h>
#define DRV_NAME "vrf"
#define DRV_VERSION "1.0"
#define FIB_RULE_PREF 1000 /* default preference for FIB rules */
-static bool add_fib_rules = true;
+
+static unsigned int vrf_net_id;
struct net_vrf {
struct rtable __rcu *rth;
@@ -77,8 +79,8 @@ static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
kfree_skb(skb);
}
-static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *stats)
+static void vrf_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
{
int i;
@@ -102,7 +104,23 @@ static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev,
stats->rx_bytes += rbytes;
stats->rx_packets += rpkts;
}
- return stats;
+}
+
+/* Report whether the VRF device is still in its default tx setup.
+ *
+ * By default VRF devices do not have a qdisc and are expected
+ * to be created with only a single queue.
+ *
+ * Returns false when the device has more than one tx queue. Otherwise
+ * returns true only if the qdisc attached to tx queue 0 has no enqueue
+ * handler.
+ *
+ * NOTE(review): the qdisc pointer is read with rcu_access_pointer() and
+ * then dereferenced -- confirm every caller runs inside an RCU read-side
+ * section.
+ */
+static bool qdisc_tx_is_default(const struct net_device *dev)
+{
+ struct netdev_queue *txq;
+ struct Qdisc *qdisc;
+
+ if (dev->num_tx_queues > 1)
+ return false;
+
+ txq = netdev_get_tx_queue(dev, 0);
+ qdisc = rcu_access_pointer(txq->qdisc);
+
+ return !qdisc->enqueue;
}
/* Local traffic destined to local address. Reinsert the packet to rx
@@ -341,6 +359,7 @@ static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev)
static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
{
+ int len = skb->len;
netdev_tx_t ret = is_ip_tx_frame(skb, dev);
if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
@@ -348,7 +367,7 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
u64_stats_update_begin(&dstats->syncp);
dstats->tx_pkts++;
- dstats->tx_bytes += skb->len;
+ dstats->tx_bytes += len;
u64_stats_update_end(&dstats->syncp);
} else {
this_cpu_inc(dev->dstats->tx_drps);
@@ -357,6 +376,29 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
return ret;
}
+static int vrf_finish_direct(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct net_device *vrf_dev = skb->dev;
+
+ if (!list_empty(&vrf_dev->ptype_all) &&
+ likely(skb_headroom(skb) >= ETH_HLEN)) {
+ struct ethhdr *eth = skb_push(skb, ETH_HLEN);
+
+ ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
+ eth_zero_addr(eth->h_dest);
+ eth->h_proto = skb->protocol;
+ /* A temporary Ethernet header (source = VRF device address,
+ * zeroed destination) is now in place: pass the frame to
+ * dev_queue_xmit_nit() for the VRF device, then pull the
+ * header off again so the skb leaves this branch with the
+ * same length it came in with.
+ */
+ rcu_read_lock_bh();
+ dev_queue_xmit_nit(skb, vrf_dev);
+ rcu_read_unlock_bh();
+
+ skb_pull(skb, ETH_HLEN);
+ }
+ /* Always report 1: the direct-output helpers in this file hand the
+ * skb back to their caller only when they see 1.
+ */
+ return 1;
+}
+
#if IS_ENABLED(CONFIG_IPV6)
/* modelled after ip6_finish_output2 */
static int vrf_finish_output6(struct net *net, struct sock *sk,
@@ -379,7 +421,8 @@ static int vrf_finish_output6(struct net *net, struct sock *sk,
if (unlikely(!neigh))
neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
if (!IS_ERR(neigh)) {
- ret = dst_neigh_output(dst, neigh, skb);
+ sock_confirm_neigh(skb, neigh);
+ ret = neigh_output(neigh, skb);
rcu_read_unlock_bh();
return ret;
}
@@ -404,18 +447,13 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
* packet to go through device based features such as qdisc, netfilter
* hooks and packet sockets with skb->dev set to vrf device.
*/
-static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
- struct sock *sk,
- struct sk_buff *skb)
+static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev,
+ struct sk_buff *skb)
{
struct net_vrf *vrf = netdev_priv(vrf_dev);
struct dst_entry *dst = NULL;
struct rt6_info *rt6;
- /* don't divert link scope packets */
- if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
- return skb;
-
rcu_read_lock();
rt6 = rcu_dereference(vrf->rt6);
@@ -437,6 +475,55 @@ static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
return skb;
}
+static int vrf_output6_direct(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ skb->protocol = htons(ETH_P_IPV6);
+ /* Run the IPv6 POST_ROUTING netfilter hook with skb->dev as the
+ * output device and vrf_finish_direct() as the continuation. The
+ * hook is conditional on the skb not being flagged IPSKB_REROUTED.
+ *
+ * NOTE(review): this IPv6 path tests IPCB()/IPSKB_REROUTED, while
+ * the IPv6 receive path in this file uses IP6CB() -- confirm that
+ * the IPv4 control-block accessor is intended here.
+ */
+ return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, skb->dev,
+ vrf_finish_direct,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
+}
+
+static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev,
+ struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct net *net = dev_net(vrf_dev);
+ int err;
+
+ skb->dev = vrf_dev;
+ /* Run the IPv6 LOCAL_OUT netfilter hook with the VRF device as the
+ * output device. When nf_hook() returns 1 the POST_ROUTING stage
+ * is invoked explicitly just below.
+ */
+ err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk,
+ skb, NULL, vrf_dev, vrf_output6_direct);
+
+ if (likely(err == 1))
+ err = vrf_output6_direct(net, sk, skb);
+
+ /* err == 1: call nf_reset() on the skb and hand it back to the
+ * caller. Any other result is reported to the caller as NULL.
+ * Note that skb->dev is left pointing at the VRF device here.
+ */
+ if (likely(err == 1))
+ nf_reset(skb);
+ else
+ skb = NULL;
+
+ return skb;
+}
+
+static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
+ struct sock *sk,
+ struct sk_buff *skb)
+{
+ /* don't divert link scope packets: destinations for which
+ * rt6_need_strict() is true are returned untouched
+ */
+ if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
+ return skb;
+ /* VRF device still has its default tx setup (single queue, no
+ * enqueueing qdisc): take the direct path; otherwise fall back to
+ * vrf_ip6_out_redirect().
+ */
+ if (qdisc_tx_is_default(vrf_dev))
+ return vrf_ip6_out_direct(vrf_dev, sk, skb);
+
+ return vrf_ip6_out_redirect(vrf_dev, skb);
+}
+
/* holding rtnl */
static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
{
@@ -461,8 +548,10 @@ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
}
if (rt6_local) {
- if (rt6_local->rt6i_idev)
+ if (rt6_local->rt6i_idev) {
in6_dev_put(rt6_local->rt6i_idev);
+ rt6_local->rt6i_idev = NULL;
+ }
dst = &rt6_local->dst;
dev_put(dst->dev);
@@ -474,7 +563,7 @@ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
static int vrf_rt6_create(struct net_device *dev)
{
- int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE;
+ int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM;
struct net_vrf *vrf = netdev_priv(dev);
struct net *net = dev_net(dev);
struct fib6_table *rt6i_table;
@@ -494,8 +583,6 @@ static int vrf_rt6_create(struct net_device *dev)
if (!rt6)
goto out;
- dst_hold(&rt6->dst);
-
rt6->rt6i_table = rt6i_table;
rt6->dst.output = vrf_output6;
@@ -508,8 +595,6 @@ static int vrf_rt6_create(struct net_device *dev)
goto out;
}
- dst_hold(&rt6_local->dst);
-
rt6_local->rt6i_idev = in6_dev_get(dev);
rt6_local->rt6i_flags = RTF_UP | RTF_NONEXTHOP | RTF_LOCAL;
rt6_local->rt6i_table = rt6i_table;
@@ -575,8 +660,10 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
- if (!IS_ERR(neigh))
- ret = dst_neigh_output(dst, neigh, skb);
+ if (!IS_ERR(neigh)) {
+ sock_confirm_neigh(skb, neigh);
+ ret = neigh_output(neigh, skb);
+ }
rcu_read_unlock_bh();
err:
@@ -604,18 +691,13 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
* packet to go through device based features such as qdisc, netfilter
* hooks and packet sockets with skb->dev set to vrf device.
*/
-static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev,
- struct sock *sk,
- struct sk_buff *skb)
+static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev,
+ struct sk_buff *skb)
{
struct net_vrf *vrf = netdev_priv(vrf_dev);
struct dst_entry *dst = NULL;
struct rtable *rth;
- /* don't divert multicast */
- if (ipv4_is_multicast(ip_hdr(skb)->daddr))
- return skb;
-
rcu_read_lock();
rth = rcu_dereference(vrf->rth);
@@ -637,6 +719,55 @@ static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev,
return skb;
}
+static int vrf_output_direct(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ skb->protocol = htons(ETH_P_IP);
+ /* Run the IPv4 POST_ROUTING netfilter hook with skb->dev as the
+ * output device and vrf_finish_direct() as the continuation. The
+ * hook is conditional on the skb not being flagged IPSKB_REROUTED.
+ */
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, NULL, skb->dev,
+ vrf_finish_direct,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
+}
+
+static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev,
+ struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct net *net = dev_net(vrf_dev);
+ int err;
+
+ skb->dev = vrf_dev;
+ /* Run the IPv4 LOCAL_OUT netfilter hook with the VRF device as the
+ * output device. When nf_hook() returns 1 the POST_ROUTING stage
+ * is invoked explicitly just below.
+ */
+ err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
+ skb, NULL, vrf_dev, vrf_output_direct);
+
+ if (likely(err == 1))
+ err = vrf_output_direct(net, sk, skb);
+
+ /* err == 1: call nf_reset() on the skb and hand it back to the
+ * caller. Any other result is reported to the caller as NULL.
+ * Note that skb->dev is left pointing at the VRF device here.
+ */
+ if (likely(err == 1))
+ nf_reset(skb);
+ else
+ skb = NULL;
+
+ return skb;
+}
+
+static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev,
+ struct sock *sk,
+ struct sk_buff *skb)
+{
+ /* don't divert multicast: packets with a multicast destination
+ * address are returned untouched
+ */
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr))
+ return skb;
+ /* VRF device still has its default tx setup (single queue, no
+ * enqueueing qdisc): take the direct path; otherwise fall back to
+ * vrf_ip_out_redirect().
+ */
+ if (qdisc_tx_is_default(vrf_dev))
+ return vrf_ip_out_direct(vrf_dev, sk, skb);
+
+ return vrf_ip_out_redirect(vrf_dev, skb);
+}
+
/* called with rcu lock held */
static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev,
struct sock *sk,
@@ -744,14 +875,24 @@ static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
{
int ret;
+ /* do not allow loopback device to be enslaved to a VRF.
+ * The vrf device acts as the loopback for the vrf.
+ */
+ if (port_dev == dev_net(dev)->loopback_dev)
+ return -EOPNOTSUPP;
+
+ port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL);
if (ret < 0)
- return ret;
+ goto err;
- port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
cycle_netdev(port_dev);
return 0;
+
+err:
+ port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
+ return ret;
}
static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev)
@@ -781,15 +922,10 @@ static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
static void vrf_dev_uninit(struct net_device *dev)
{
struct net_vrf *vrf = netdev_priv(dev);
- struct net_device *port_dev;
- struct list_head *iter;
vrf_rtable_release(dev, vrf);
vrf_rt6_release(dev, vrf);
- netdev_for_each_lower_dev(dev, port_dev, iter)
- vrf_del_slave(dev, port_dev);
-
free_percpu(dev->dstats);
dev->dstats = NULL;
}
@@ -846,6 +982,7 @@ static u32 vrf_fib_table(const struct net_device *dev)
static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ kfree_skb(skb);
return 0;
}
@@ -855,7 +992,7 @@ static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook,
{
struct net *net = dev_net(dev);
- if (NF_HOOK(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) < 0)
+ if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1)
skb = NULL; /* kfree_skb(skb) handled by nf code */
return skb;
@@ -973,9 +1110,11 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
- skb_push(skb, skb->mac_len);
- dev_queue_xmit_nit(skb, vrf_dev);
- skb_pull(skb, skb->mac_len);
+ if (!list_empty(&vrf_dev->ptype_all)) {
+ skb_push(skb, skb->mac_len);
+ dev_queue_xmit_nit(skb, vrf_dev);
+ skb_pull(skb, skb->mac_len);
+ }
IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
}
@@ -1016,9 +1155,11 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
vrf_rx_stats(vrf_dev, skb->len);
- skb_push(skb, skb->mac_len);
- dev_queue_xmit_nit(skb, vrf_dev);
- skb_pull(skb, skb->mac_len);
+ if (!list_empty(&vrf_dev->ptype_all)) {
+ skb_push(skb, skb->mac_len);
+ dev_queue_xmit_nit(skb, vrf_dev);
+ skb_pull(skb, skb->mac_len);
+ }
skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev);
out:
@@ -1123,7 +1264,7 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it)
goto nla_put_failure;
/* rule only needs to appear once */
- nlh->nlmsg_flags &= NLM_F_EXCL;
+ nlh->nlmsg_flags |= NLM_F_EXCL;
frh = nlmsg_data(nlh);
memset(frh, 0, sizeof(*frh));
@@ -1141,11 +1282,11 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it)
/* fib_nl_{new,del}rule handling looks for net from skb->sk */
skb->sk = dev_net(dev)->rtnl;
if (add_it) {
- err = fib_nl_newrule(skb, nlh);
+ err = fib_nl_newrule(skb, nlh, NULL);
if (err == -EEXIST)
err = 0;
} else {
- err = fib_nl_delrule(skb, nlh);
+ err = fib_nl_delrule(skb, nlh, NULL);
if (err == -ENOENT)
err = 0;
}
@@ -1200,7 +1341,7 @@ static void vrf_setup(struct net_device *dev)
dev->netdev_ops = &vrf_netdev_ops;
dev->l3mdev_ops = &vrf_l3mdev_ops;
dev->ethtool_ops = &vrf_ethtool_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
/* Fill in device structure with ethernet-generic values. */
eth_hw_addr_random(dev);
@@ -1226,7 +1367,8 @@ static void vrf_setup(struct net_device *dev)
dev->priv_flags |= IFF_NO_QUEUE;
}
-static int vrf_validate(struct nlattr *tb[], struct nlattr *data[])
+static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
if (tb[IFLA_ADDRESS]) {
if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
@@ -1239,13 +1381,22 @@ static int vrf_validate(struct nlattr *tb[], struct nlattr *data[])
static void vrf_dellink(struct net_device *dev, struct list_head *head)
{
+ struct net_device *port_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(dev, port_dev, iter)
+ vrf_del_slave(dev, port_dev);
+
unregister_netdevice_queue(dev, head);
}
static int vrf_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
struct net_vrf *vrf = netdev_priv(dev);
+ bool *add_fib_rules;
+ struct net *net;
int err;
if (!data || !data[IFLA_VRF_TABLE])
@@ -1261,13 +1412,15 @@ static int vrf_newlink(struct net *src_net, struct net_device *dev,
if (err)
goto out;
- if (add_fib_rules) {
+ net = dev_net(dev);
+ add_fib_rules = net_generic(net, vrf_net_id);
+ if (*add_fib_rules) {
err = vrf_add_fib_rules(dev);
if (err) {
unregister_netdevice(dev);
goto out;
}
- add_fib_rules = false;
+ *add_fib_rules = false;
}
out:
@@ -1350,16 +1503,38 @@ static struct notifier_block vrf_notifier_block __read_mostly = {
.notifier_call = vrf_device_event,
};
+/* Initialize per network namespace state.
+ *
+ * The per-namespace area registered under vrf_net_id is a single bool.
+ * Setting it to true makes vrf_newlink() add the FIB rules when the
+ * next VRF device is created in this namespace; vrf_newlink() clears
+ * the flag once the rules have been added.
+ *
+ * Always returns 0.
+ */
+static int __net_init vrf_netns_init(struct net *net)
+{
+ bool *add_fib_rules = net_generic(net, vrf_net_id);
+
+ *add_fib_rules = true;
+
+ return 0;
+}
+
+static struct pernet_operations vrf_net_ops __net_initdata = {
+ .init = vrf_netns_init,
+ .id = &vrf_net_id,
+ .size = sizeof(bool),
+};
+
static int __init vrf_init_module(void)
{
int rc;
register_netdevice_notifier(&vrf_notifier_block);
- rc = rtnl_link_register(&vrf_link_ops);
+ rc = register_pernet_subsys(&vrf_net_ops);
if (rc < 0)
goto error;
+ rc = rtnl_link_register(&vrf_link_ops);
+ if (rc < 0) {
+ unregister_pernet_subsys(&vrf_net_ops);
+ goto error;
+ }
+
return 0;
error: