aboutsummaryrefslogtreecommitdiff
path: root/net/openvswitch
diff options
context:
space:
mode:
Diffstat (limited to 'net/openvswitch')
-rw-r--r--net/openvswitch/actions.c316
-rw-r--r--net/openvswitch/conntrack.c409
-rw-r--r--net/openvswitch/conntrack.h14
-rw-r--r--net/openvswitch/datapath.c87
-rw-r--r--net/openvswitch/datapath.h4
-rw-r--r--net/openvswitch/flow.c44
-rw-r--r--net/openvswitch/flow.h55
-rw-r--r--net/openvswitch/flow_netlink.c245
-rw-r--r--net/openvswitch/flow_netlink.h7
-rw-r--r--net/openvswitch/vport-internal_dev.c10
-rw-r--r--net/openvswitch/vport-vxlan.c7
11 files changed, 837 insertions, 361 deletions
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 514f7bcf7c63..e4610676299b 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -44,13 +44,10 @@
#include "conntrack.h"
#include "vport.h"
-static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
- struct sw_flow_key *key,
- const struct nlattr *attr, int len);
-
struct deferred_action {
struct sk_buff *skb;
const struct nlattr *actions;
+ int actions_len;
/* Store pkt_key clone when creating deferred action. */
struct sw_flow_key pkt_key;
@@ -82,14 +79,31 @@ struct action_fifo {
struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
};
-struct recirc_keys {
+struct action_flow_keys {
struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
};
static struct action_fifo __percpu *action_fifos;
-static struct recirc_keys __percpu *recirc_keys;
+static struct action_flow_keys __percpu *flow_keys;
static DEFINE_PER_CPU(int, exec_actions_level);
+/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
+ * space. Return NULL if out of key spaces.
+ */
+static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
+{
+ struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
+ int level = this_cpu_read(exec_actions_level);
+ struct sw_flow_key *key = NULL;
+
+ if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
+ key = &keys->key[level - 1];
+ *key = *key_;
+ }
+
+ return key;
+}
+
static void action_fifo_init(struct action_fifo *fifo)
{
fifo->head = 0;
@@ -119,8 +133,9 @@ static struct deferred_action *action_fifo_put(struct action_fifo *fifo)
/* Return true if fifo is not full */
static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
- const struct sw_flow_key *key,
- const struct nlattr *attr)
+ const struct sw_flow_key *key,
+ const struct nlattr *actions,
+ const int actions_len)
{
struct action_fifo *fifo;
struct deferred_action *da;
@@ -129,7 +144,8 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
da = action_fifo_put(fifo);
if (da) {
da->skb = skb;
- da->actions = attr;
+ da->actions = actions;
+ da->actions_len = actions_len;
da->pkt_key = *key;
}
@@ -146,6 +162,12 @@ static bool is_flow_key_valid(const struct sw_flow_key *key)
return !(key->mac_proto & SW_FLOW_KEY_INVALID);
}
+static int clone_execute(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key,
+ u32 recirc_id,
+ const struct nlattr *actions, int len,
+ bool last, bool clone_flow_key);
+
static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
__be16 ethertype)
{
@@ -796,9 +818,8 @@ static void ovs_fragment(struct net *net, struct vport *vport,
unsigned long orig_dst;
struct rt6_info ovs_rt;
- if (!v6ops) {
+ if (!v6ops)
goto err;
- }
prepare_frag(vport, skb, orig_network_offset,
ovs_key_mac_proto(key));
@@ -909,72 +930,35 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
return ovs_dp_upcall(dp, skb, key, &upcall, cutlen);
}
+/* When 'last' is true, sample() should always consume the 'skb'.
+ * Otherwise, sample() should keep 'skb' intact regardless what
+ * actions are executed within sample().
+ */
static int sample(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key, const struct nlattr *attr,
- const struct nlattr *actions, int actions_len)
+ bool last)
{
- const struct nlattr *acts_list = NULL;
- const struct nlattr *a;
- int rem;
- u32 cutlen = 0;
-
- for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
- a = nla_next(a, &rem)) {
- u32 probability;
-
- switch (nla_type(a)) {
- case OVS_SAMPLE_ATTR_PROBABILITY:
- probability = nla_get_u32(a);
- if (!probability || prandom_u32() > probability)
- return 0;
- break;
-
- case OVS_SAMPLE_ATTR_ACTIONS:
- acts_list = a;
- break;
- }
- }
-
- rem = nla_len(acts_list);
- a = nla_data(acts_list);
-
- /* Actions list is empty, do nothing */
- if (unlikely(!rem))
+ struct nlattr *actions;
+ struct nlattr *sample_arg;
+ int rem = nla_len(attr);
+ const struct sample_arg *arg;
+ bool clone_flow_key;
+
+ /* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */
+ sample_arg = nla_data(attr);
+ arg = nla_data(sample_arg);
+ actions = nla_next(sample_arg, &rem);
+
+ if ((arg->probability != U32_MAX) &&
+ (!arg->probability || prandom_u32() > arg->probability)) {
+ if (last)
+ consume_skb(skb);
return 0;
-
- /* The only known usage of sample action is having a single user-space
- * action, or having a truncate action followed by a single user-space
- * action. Treat this usage as a special case.
- * The output_userspace() should clone the skb to be sent to the
- * user space. This skb will be consumed by its caller.
- */
- if (unlikely(nla_type(a) == OVS_ACTION_ATTR_TRUNC)) {
- struct ovs_action_trunc *trunc = nla_data(a);
-
- if (skb->len > trunc->max_len)
- cutlen = skb->len - trunc->max_len;
-
- a = nla_next(a, &rem);
}
- if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE &&
- nla_is_last(a, rem)))
- return output_userspace(dp, skb, key, a, actions,
- actions_len, cutlen);
-
- skb = skb_clone(skb, GFP_ATOMIC);
- if (!skb)
- /* Skip the sample action when out of memory. */
- return 0;
-
- if (!add_deferred_actions(skb, key, a)) {
- if (net_ratelimit())
- pr_warn("%s: deferred actions limit reached, dropping sample action\n",
- ovs_dp_name(dp));
-
- kfree_skb(skb);
- }
- return 0;
+ clone_flow_key = !arg->exec;
+ return clone_execute(dp, skb, key, 0, actions, rem, last,
+ clone_flow_key);
}
static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
@@ -1074,6 +1058,8 @@ static int execute_masked_set_action(struct sk_buff *skb,
case OVS_KEY_ATTR_CT_ZONE:
case OVS_KEY_ATTR_CT_MARK:
case OVS_KEY_ATTR_CT_LABELS:
+ case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4:
+ case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6:
err = -EINVAL;
break;
}
@@ -1083,10 +1069,9 @@ static int execute_masked_set_action(struct sk_buff *skb,
static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
- const struct nlattr *a, int rem)
+ const struct nlattr *a, bool last)
{
- struct deferred_action *da;
- int level;
+ u32 recirc_id;
if (!is_flow_key_valid(key)) {
int err;
@@ -1097,43 +1082,8 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
}
BUG_ON(!is_flow_key_valid(key));
- if (!nla_is_last(a, rem)) {
- /* Recirc action is the not the last action
- * of the action list, need to clone the skb.
- */
- skb = skb_clone(skb, GFP_ATOMIC);
-
- /* Skip the recirc action when out of memory, but
- * continue on with the rest of the action list.
- */
- if (!skb)
- return 0;
- }
-
- level = this_cpu_read(exec_actions_level);
- if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
- struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
- struct sw_flow_key *recirc_key = &rks->key[level - 1];
-
- *recirc_key = *key;
- recirc_key->recirc_id = nla_get_u32(a);
- ovs_dp_process_packet(skb, recirc_key);
-
- return 0;
- }
-
- da = add_deferred_actions(skb, key, NULL);
- if (da) {
- da->pkt_key.recirc_id = nla_get_u32(a);
- } else {
- kfree_skb(skb);
-
- if (net_ratelimit())
- pr_warn("%s: deferred action limit reached, drop recirc action\n",
- ovs_dp_name(dp));
- }
-
- return 0;
+ recirc_id = nla_get_u32(a);
+ return clone_execute(dp, skb, key, recirc_id, NULL, 0, last, true);
}
/* Execute a list of actions against 'skb'. */
@@ -1141,12 +1091,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
const struct nlattr *attr, int len)
{
- /* Every output action needs a separate clone of 'skb', but the common
- * case is just a single output action, so that doing a clone and
- * then freeing the original skbuff is wasteful. So the following code
- * is slightly obscure just to avoid that.
- */
- int prev_port = -1;
const struct nlattr *a;
int rem;
@@ -1154,20 +1098,28 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
a = nla_next(a, &rem)) {
int err = 0;
- if (unlikely(prev_port != -1)) {
- struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
-
- if (out_skb)
- do_output(dp, out_skb, prev_port, key);
+ switch (nla_type(a)) {
+ case OVS_ACTION_ATTR_OUTPUT: {
+ int port = nla_get_u32(a);
+ struct sk_buff *clone;
+
+ /* Every output action needs a separate clone
+ * of 'skb', In case the output action is the
+ * last action, cloning can be avoided.
+ */
+ if (nla_is_last(a, rem)) {
+ do_output(dp, skb, port, key);
+ /* 'skb' has been used for output.
+ */
+ return 0;
+ }
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (clone)
+ do_output(dp, clone, port, key);
OVS_CB(skb)->cutlen = 0;
- prev_port = -1;
- }
-
- switch (nla_type(a)) {
- case OVS_ACTION_ATTR_OUTPUT:
- prev_port = nla_get_u32(a);
break;
+ }
case OVS_ACTION_ATTR_TRUNC: {
struct ovs_action_trunc *trunc = nla_data(a);
@@ -1203,9 +1155,11 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
err = pop_vlan(skb, key);
break;
- case OVS_ACTION_ATTR_RECIRC:
- err = execute_recirc(dp, skb, key, a, rem);
- if (nla_is_last(a, rem)) {
+ case OVS_ACTION_ATTR_RECIRC: {
+ bool last = nla_is_last(a, rem);
+
+ err = execute_recirc(dp, skb, key, a, last);
+ if (last) {
/* If this is the last action, the skb has
* been consumed or freed.
* Return immediately.
@@ -1213,6 +1167,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
return err;
}
break;
+ }
case OVS_ACTION_ATTR_SET:
err = execute_set_action(skb, key, nla_data(a));
@@ -1223,9 +1178,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
err = execute_masked_set_action(skb, key, nla_data(a));
break;
- case OVS_ACTION_ATTR_SAMPLE:
- err = sample(dp, skb, key, a, attr, len);
+ case OVS_ACTION_ATTR_SAMPLE: {
+ bool last = nla_is_last(a, rem);
+
+ err = sample(dp, skb, key, a, last);
+ if (last)
+ return err;
+
break;
+ }
case OVS_ACTION_ATTR_CT:
if (!is_flow_key_valid(key)) {
@@ -1257,11 +1218,80 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
}
}
- if (prev_port != -1)
- do_output(dp, skb, prev_port, key);
- else
- consume_skb(skb);
+ consume_skb(skb);
+ return 0;
+}
+/* Execute the actions on the clone of the packet. The effect of the
+ * execution does not affect the original 'skb' nor the original 'key'.
+ *
+ * The execution may be deferred in case the actions can not be executed
+ * immediately.
+ */
+static int clone_execute(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key, u32 recirc_id,
+ const struct nlattr *actions, int len,
+ bool last, bool clone_flow_key)
+{
+ struct deferred_action *da;
+ struct sw_flow_key *clone;
+
+ skb = last ? skb : skb_clone(skb, GFP_ATOMIC);
+ if (!skb) {
+ /* Out of memory, skip this action.
+ */
+ return 0;
+ }
+
+ /* When clone_flow_key is false, the 'key' will not be change
+ * by the actions, then the 'key' can be used directly.
+ * Otherwise, try to clone key from the next recursion level of
+ * 'flow_keys'. If clone is successful, execute the actions
+ * without deferring.
+ */
+ clone = clone_flow_key ? clone_key(key) : key;
+ if (clone) {
+ int err = 0;
+
+ if (actions) { /* Sample action */
+ if (clone_flow_key)
+ __this_cpu_inc(exec_actions_level);
+
+ err = do_execute_actions(dp, skb, clone,
+ actions, len);
+
+ if (clone_flow_key)
+ __this_cpu_dec(exec_actions_level);
+ } else { /* Recirc action */
+ clone->recirc_id = recirc_id;
+ ovs_dp_process_packet(skb, clone);
+ }
+ return err;
+ }
+
+ /* Out of 'flow_keys' space. Defer actions */
+ da = add_deferred_actions(skb, key, actions, len);
+ if (da) {
+ if (!actions) { /* Recirc action */
+ key = &da->pkt_key;
+ key->recirc_id = recirc_id;
+ }
+ } else {
+ /* Out of per CPU action FIFO space. Drop the 'skb' and
+ * log an error.
+ */
+ kfree_skb(skb);
+
+ if (net_ratelimit()) {
+ if (actions) { /* Sample action */
+ pr_warn("%s: deferred action limit reached, drop sample action\n",
+ ovs_dp_name(dp));
+ } else { /* Recirc action */
+ pr_warn("%s: deferred action limit reached, drop recirc action\n",
+ ovs_dp_name(dp));
+ }
+ }
+ }
return 0;
}
@@ -1279,10 +1309,10 @@ static void process_deferred_actions(struct datapath *dp)
struct sk_buff *skb = da->skb;
struct sw_flow_key *key = &da->pkt_key;
const struct nlattr *actions = da->actions;
+ int actions_len = da->actions_len;
if (actions)
- do_execute_actions(dp, skb, key, actions,
- nla_len(actions));
+ do_execute_actions(dp, skb, key, actions, actions_len);
else
ovs_dp_process_packet(skb, key);
} while (!action_fifo_is_empty(fifo));
@@ -1324,8 +1354,8 @@ int action_fifos_init(void)
if (!action_fifos)
return -ENOMEM;
- recirc_keys = alloc_percpu(struct recirc_keys);
- if (!recirc_keys) {
+ flow_keys = alloc_percpu(struct action_flow_keys);
+ if (!flow_keys) {
free_percpu(action_fifos);
return -ENOMEM;
}
@@ -1336,5 +1366,5 @@ int action_fifos_init(void)
void action_fifos_exit(void)
{
free_percpu(action_fifos);
- free_percpu(recirc_keys);
+ free_percpu(flow_keys);
}
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 54253ea5976e..e3c4c6c3fef7 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -65,7 +65,10 @@ struct ovs_conntrack_info {
struct nf_conn *ct;
u8 commit : 1;
u8 nat : 3; /* enum ovs_ct_nat */
+ u8 force : 1;
+ u8 have_eventmask : 1;
u16 family;
+ u32 eventmask; /* Mask of 1 << IPCT_*. */
struct md_mark mark;
struct md_labels labels;
#ifdef CONFIG_NF_NAT_NEEDED
@@ -73,6 +76,8 @@ struct ovs_conntrack_info {
#endif
};
+static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
+
static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
static u16 key_to_nfproto(const struct sw_flow_key *key)
@@ -129,21 +134,33 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct)
#endif
}
+/* Guard against conntrack labels max size shrinking below 128 bits. */
+#if NF_CT_LABELS_MAX_SIZE < 16
+#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes
+#endif
+
static void ovs_ct_get_labels(const struct nf_conn *ct,
struct ovs_key_ct_labels *labels)
{
struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
- if (cl) {
- size_t len = sizeof(cl->bits);
+ if (cl)
+ memcpy(labels, cl->bits, OVS_CT_LABELS_LEN);
+ else
+ memset(labels, 0, OVS_CT_LABELS_LEN);
+}
- if (len > OVS_CT_LABELS_LEN)
- len = OVS_CT_LABELS_LEN;
- else if (len < OVS_CT_LABELS_LEN)
- memset(labels, 0, OVS_CT_LABELS_LEN);
- memcpy(labels, cl->bits, len);
+static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key,
+ const struct nf_conntrack_tuple *orig,
+ u8 icmp_proto)
+{
+ key->ct_orig_proto = orig->dst.protonum;
+ if (orig->dst.protonum == icmp_proto) {
+ key->ct.orig_tp.src = htons(orig->dst.u.icmp.type);
+ key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code);
} else {
- memset(labels, 0, OVS_CT_LABELS_LEN);
+ key->ct.orig_tp.src = orig->src.u.all;
+ key->ct.orig_tp.dst = orig->dst.u.all;
}
}
@@ -151,13 +168,42 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
const struct nf_conntrack_zone *zone,
const struct nf_conn *ct)
{
- key->ct.state = state;
- key->ct.zone = zone->id;
+ key->ct_state = state;
+ key->ct_zone = zone->id;
key->ct.mark = ovs_ct_get_mark(ct);
ovs_ct_get_labels(ct, &key->ct.labels);
+
+ if (ct) {
+ const struct nf_conntrack_tuple *orig;
+
+ /* Use the master if we have one. */
+ if (ct->master)
+ ct = ct->master;
+ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+
+ /* IP version must match with the master connection. */
+ if (key->eth.type == htons(ETH_P_IP) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4) {
+ key->ipv4.ct_orig.src = orig->src.u3.ip;
+ key->ipv4.ct_orig.dst = orig->dst.u3.ip;
+ __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP);
+ return;
+ } else if (key->eth.type == htons(ETH_P_IPV6) &&
+ !sw_flow_key_is_nd(key) &&
+ nf_ct_l3num(ct) == NFPROTO_IPV6) {
+ key->ipv6.ct_orig.src = orig->src.u3.in6;
+ key->ipv6.ct_orig.dst = orig->dst.u3.in6;
+ __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP);
+ return;
+ }
+ }
+ /* Clear 'ct_orig_proto' to mark the non-existence of conntrack
+ * original direction key fields.
+ */
+ key->ct_orig_proto = 0;
}
-/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
+/* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has
* previously sent the packet to conntrack via the ct action. If
* 'keep_nat_flags' is true, the existing NAT flags retained, else they are
* initialized from the connection status.
@@ -184,7 +230,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
if (ct->master)
state |= OVS_CS_F_RELATED;
if (keep_nat_flags) {
- state |= key->ct.state & OVS_CS_F_NAT_MASK;
+ state |= key->ct_state & OVS_CS_F_NAT_MASK;
} else {
if (ct->status & IPS_SRC_NAT)
state |= OVS_CS_F_SRC_NAT;
@@ -208,44 +254,69 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
ovs_ct_update_key(skb, NULL, key, false, false);
}
-int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
+#define IN6_ADDR_INITIALIZER(ADDR) \
+ { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \
+ (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] }
+
+int ovs_ct_put_key(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output, struct sk_buff *skb)
{
- if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state))
+ if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
- nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone))
+ nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
- nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark))
+ nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
- nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels),
- &key->ct.labels))
+ nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels),
+ &output->ct.labels))
return -EMSGSIZE;
+ if (swkey->ct_orig_proto) {
+ if (swkey->eth.type == htons(ETH_P_IP)) {
+ struct ovs_key_ct_tuple_ipv4 orig = {
+ output->ipv4.ct_orig.src,
+ output->ipv4.ct_orig.dst,
+ output->ct.orig_tp.src,
+ output->ct.orig_tp.dst,
+ output->ct_orig_proto,
+ };
+ if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
+ sizeof(orig), &orig))
+ return -EMSGSIZE;
+ } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+ struct ovs_key_ct_tuple_ipv6 orig = {
+ IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src),
+ IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
+ output->ct.orig_tp.src,
+ output->ct.orig_tp.dst,
+ output->ct_orig_proto,
+ };
+ if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
+ sizeof(orig), &orig))
+ return -EMSGSIZE;
+ }
+ }
+
return 0;
}
-static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
+static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
u32 ct_mark, u32 mask)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
u32 new_mark;
- /* The connection could be invalid, in which case set_mark is no-op. */
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct)
- return 0;
-
new_mark = ct_mark | (ct->mark & ~(mask));
if (ct->mark != new_mark) {
ct->mark = new_mark;
- nf_conntrack_event_cache(IPCT_MARK, ct);
+ if (nf_ct_is_confirmed(ct))
+ nf_conntrack_event_cache(IPCT_MARK, ct);
key->ct.mark = new_mark;
}
@@ -255,34 +326,83 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
#endif
}
-static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key,
- const struct ovs_key_ct_labels *labels,
- const struct ovs_key_ct_labels *mask)
+static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct)
{
- enum ip_conntrack_info ctinfo;
struct nf_conn_labels *cl;
- struct nf_conn *ct;
- int err;
-
- /* The connection could be invalid, in which case set_label is no-op.*/
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct)
- return 0;
cl = nf_ct_labels_find(ct);
if (!cl) {
nf_ct_labels_ext_add(ct);
cl = nf_ct_labels_find(ct);
}
- if (!cl || sizeof(cl->bits) < OVS_CT_LABELS_LEN)
+
+ return cl;
+}
+
+/* Initialize labels for a new, yet to be committed conntrack entry. Note that
+ * since the new connection is not yet confirmed, and thus no-one else has
+ * access to it's labels, we simply write them over.
+ */
+static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
+ const struct ovs_key_ct_labels *labels,
+ const struct ovs_key_ct_labels *mask)
+{
+ struct nf_conn_labels *cl, *master_cl;
+ bool have_mask = labels_nonzero(mask);
+
+ /* Inherit master's labels to the related connection? */
+ master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL;
+
+ if (!master_cl && !have_mask)
+ return 0; /* Nothing to do. */
+
+ cl = ovs_ct_get_conn_labels(ct);
+ if (!cl)
+ return -ENOSPC;
+
+ /* Inherit the master's labels, if any. */
+ if (master_cl)
+ *cl = *master_cl;
+
+ if (have_mask) {
+ u32 *dst = (u32 *)cl->bits;
+ int i;
+
+ for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
+ dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
+ (labels->ct_labels_32[i]
+ & mask->ct_labels_32[i]);
+ }
+
+ /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the
+ * IPCT_LABEL bit is set in the event cache.
+ */
+ nf_conntrack_event_cache(IPCT_LABEL, ct);
+
+ memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
+
+ return 0;
+}
+
+static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
+ const struct ovs_key_ct_labels *labels,
+ const struct ovs_key_ct_labels *mask)
+{
+ struct nf_conn_labels *cl;
+ int err;
+
+ cl = ovs_ct_get_conn_labels(ct);
+ if (!cl)
return -ENOSPC;
- err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask,
- OVS_CT_LABELS_LEN / sizeof(u32));
+ err = nf_connlabels_replace(ct, labels->ct_labels_32,
+ mask->ct_labels_32,
+ OVS_CT_LABELS_LEN_32);
if (err)
return err;
- ovs_ct_get_labels(ct, &key->ct.labels);
+ memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
+
return 0;
}
@@ -367,7 +487,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
} else if (key->eth.type == htons(ETH_P_IPV6)) {
enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
- skb_orphan(skb);
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
err = nf_ct_frag6_gather(net, skb, user);
if (err) {
@@ -397,10 +516,38 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
u16 proto, const struct sk_buff *skb)
{
struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_expect *exp;
if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
return NULL;
- return __nf_ct_expect_find(net, zone, &tuple);
+
+ exp = __nf_ct_expect_find(net, zone, &tuple);
+ if (exp) {
+ struct nf_conntrack_tuple_hash *h;
+
+ /* Delete existing conntrack entry, if it clashes with the
+ * expectation. This can happen since conntrack ALGs do not
+ * check for clashes between (new) expectations and existing
+ * conntrack entries. nf_conntrack_in() will check the
+ * expectations only if a conntrack entry can not be found,
+ * which can lead to OVS finding the expectation (here) in the
+ * init direction, but which will not be removed by the
+ * nf_conntrack_in() call, if a matching conntrack entry is
+ * found instead. In this case all init direction packets
+ * would be reported as new related packets, while reply
+ * direction packets would be reported as un-related
+ * established packets.
+ */
+ h = nf_conntrack_find_get(net, zone, &tuple);
+ if (h) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ nf_ct_delete(ct, 0, 0);
+ nf_conntrack_put(&ct->ct_general);
+ }
+ }
+
+ return exp;
}
/* This replicates logic from nf_conntrack_core.c that is not exported. */
@@ -421,16 +568,16 @@ ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
/* Find an existing connection which this packet belongs to without
* re-attributing statistics or modifying the connection state. This allows an
- * skb->nfct lost due to an upcall to be recovered during actions execution.
+ * skb->_nfct lost due to an upcall to be recovered during actions execution.
*
* Must be called with rcu_read_lock.
*
- * On success, populates skb->nfct and skb->nfctinfo, and returns the
- * connection. Returns NULL if there is no existing entry.
+ * On success, populates skb->_nfct and returns the connection. Returns NULL
+ * if there is no existing entry.
*/
static struct nf_conn *
ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
- u8 l3num, struct sk_buff *skb)
+ u8 l3num, struct sk_buff *skb, bool natted)
{
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
@@ -453,6 +600,17 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
return NULL;
}
+ /* Must invert the tuple if skb has been transformed by NAT. */
+ if (natted) {
+ struct nf_conntrack_tuple inverse;
+
+ if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
+ pr_debug("ovs_ct_find_existing: Inversion failed!\n");
+ return NULL;
+ }
+ tuple = inverse;
+ }
+
/* look for tuple match */
h = nf_conntrack_find_get(net, zone, &tuple);
if (!h)
@@ -460,12 +618,46 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
ct = nf_ct_tuplehash_to_ctrack(h);
- skb->nfct = &ct->ct_general;
- skb->nfctinfo = ovs_ct_get_info(h);
+ /* Inverted packet tuple matches the reverse direction conntrack tuple,
+ * select the other tuplehash to get the right 'ctinfo' bits for this
+ * packet.
+ */
+ if (natted)
+ h = &ct->tuplehash[!h->tuple.dst.dir];
+
+ nf_ct_set(skb, ct, ovs_ct_get_info(h));
return ct;
}
-/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
+static
+struct nf_conn *ovs_ct_executed(struct net *net,
+ const struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info,
+ struct sk_buff *skb,
+ bool *ct_executed)
+{
+ struct nf_conn *ct = NULL;
+
+ /* If no ct, check if we have evidence that an existing conntrack entry
+ * might be found for this skb. This happens when we lose a skb->_nfct
+ * due to an upcall, or if the direction is being forced. If the
+ * connection was not confirmed, it is not cached and needs to be run
+ * through conntrack again.
+ */
+ *ct_executed = (key->ct_state & OVS_CS_F_TRACKED) &&
+ !(key->ct_state & OVS_CS_F_INVALID) &&
+ (key->ct_zone == info->zone.id);
+
+ if (*ct_executed || (!key->ct_state && info->force)) {
+ ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
+ !!(key->ct_state &
+ OVS_CS_F_NAT_MASK));
+ }
+
+ return ct;
+}
+
+/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
static bool skb_nfct_cached(struct net *net,
const struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
@@ -473,19 +665,17 @@ static bool skb_nfct_cached(struct net *net,
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
+ bool ct_executed = true;
ct = nf_ct_get(skb, &ctinfo);
- /* If no ct, check if we have evidence that an existing conntrack entry
- * might be found for this skb. This happens when we lose a skb->nfct
- * due to an upcall. If the connection was not confirmed, it is not
- * cached and needs to be run through conntrack again.
- */
- if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
- !(key->ct.state & OVS_CS_F_INVALID) &&
- key->ct.zone == info->zone.id)
- ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
if (!ct)
+ ct = ovs_ct_executed(net, key, info, skb, &ct_executed);
+
+ if (ct)
+ nf_ct_get(skb, &ctinfo);
+ else
return false;
+
if (!net_eq(net, read_pnet(&ct->ct_net)))
return false;
if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct)))
@@ -497,8 +687,20 @@ static bool skb_nfct_cached(struct net *net,
if (help && rcu_access_pointer(help->helper) != info->helper)
return false;
}
+ /* Force conntrack entry direction to the current packet? */
+ if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+ /* Delete the conntrack entry if confirmed, else just release
+ * the reference.
+ */
+ if (nf_ct_is_confirmed(ct))
+ nf_ct_delete(ct, 0, 0);
+
+ nf_conntrack_put(&ct->ct_general);
+ nf_ct_set(skb, NULL, 0);
+ return false;
+ }
- return true;
+ return ct_executed;
}
#ifdef CONFIG_NF_NAT_NEEDED
@@ -591,7 +793,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
if (maniptype == NF_NAT_MANIP_SRC) {
__be16 src;
- key->ct.state |= OVS_CS_F_SRC_NAT;
+ key->ct_state |= OVS_CS_F_SRC_NAT;
if (key->eth.type == htons(ETH_P_IP))
key->ipv4.addr.src = ip_hdr(skb)->saddr;
else if (key->eth.type == htons(ETH_P_IPV6))
@@ -613,7 +815,7 @@ static void ovs_nat_update_key(struct sw_flow_key *key,
} else {
__be16 dst;
- key->ct.state |= OVS_CS_F_DST_NAT;
+ key->ct_state |= OVS_CS_F_DST_NAT;
if (key->eth.type == htons(ETH_P_IP))
key->ipv4.addr.dst = ip_hdr(skb)->daddr;
else if (key->eth.type == htons(ETH_P_IPV6))
@@ -644,11 +846,6 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
enum nf_nat_manip_type maniptype;
int err;
- if (nf_ct_is_untracked(ct)) {
- /* A NAT action may only be performed on tracked packets. */
- return NF_ACCEPT;
- }
-
/* Add NAT extension if not confirmed yet. */
if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
return NF_ACCEPT; /* Can't NAT. */
@@ -700,7 +897,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
* not done already. Update key with new CT state after passing the packet
* through conntrack.
- * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
+ * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be
* set to NULL and 0 will be returned.
*/
static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
@@ -722,11 +919,10 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
/* Associate skb with specified zone. */
if (tmpl) {
- if (skb->nfct)
- nf_conntrack_put(skb->nfct);
+ if (skb_nfct(skb))
+ nf_conntrack_put(skb_nfct(skb));
nf_conntrack_get(&tmpl->ct_general);
- skb->nfct = &tmpl->ct_general;
- skb->nfctinfo = IP_CT_NEW;
+ nf_ct_set(skb, tmpl, IP_CT_NEW);
}
err = nf_conntrack_in(net, info->family,
@@ -738,7 +934,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
* NAT after the nf_conntrack_in() call. We can actually clear
* the whole state, as it will be re-initialized below.
*/
- key->ct.state = 0;
+ key->ct_state = 0;
/* Update the key, but keep the NAT flags. */
ovs_ct_update_key(skb, info, key, true, true);
@@ -754,9 +950,9 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
*
* NAT will be done only if the CT action has NAT, and only
* once per packet (per zone), as guarded by the NAT bits in
- * the key->ct.state.
+ * the key->ct_state.
*/
- if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+ if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) &&
(nf_ct_is_confirmed(ct) || info->commit) &&
ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
return -EINVAL;
@@ -820,7 +1016,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
if (err)
return err;
- ct = (struct nf_conn *)skb->nfct;
+ ct = (struct nf_conn *)skb_nfct(skb);
if (ct)
nf_ct_deliver_cached_events(ct);
}
@@ -832,8 +1028,8 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
{
size_t i;
- for (i = 0; i < sizeof(*labels); i++)
- if (labels->ct_labels[i])
+ for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
+ if (labels->ct_labels_32[i])
return true;
return false;
@@ -844,24 +1040,50 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
struct sk_buff *skb)
{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
int err;
err = __ovs_ct_lookup(net, key, info, skb);
if (err)
return err;
+ /* The connection could be invalid, in which case this is a no-op.*/
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return 0;
+
+ /* Set the conntrack event mask if given. NEW and DELETE events have
+ * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener
+ * typically would receive many kinds of updates. Setting the event
+ * mask allows those events to be filtered. The set event mask will
+ * remain in effect for the lifetime of the connection unless changed
+ * by a further CT action with both the commit flag and the eventmask
+ * option. */
+ if (info->have_eventmask) {
+ struct nf_conntrack_ecache *cache = nf_ct_ecache_find(ct);
+
+ if (cache)
+ cache->ctmask = info->eventmask;
+ }
+
/* Apply changes before confirming the connection so that the initial
* conntrack NEW netlink event carries the values given in the CT
* action.
*/
if (info->mark.mask) {
- err = ovs_ct_set_mark(skb, key, info->mark.value,
+ err = ovs_ct_set_mark(ct, key, info->mark.value,
info->mark.mask);
if (err)
return err;
}
- if (labels_nonzero(&info->labels.mask)) {
- err = ovs_ct_set_labels(skb, key, &info->labels.value,
+ if (!nf_ct_is_confirmed(ct)) {
+ err = ovs_ct_init_labels(ct, key, &info->labels.value,
+ &info->labels.mask);
+ if (err)
+ return err;
+ } else if (labels_nonzero(&info->labels.mask)) {
+ err = ovs_ct_set_labels(ct, key, &info->labels.value,
&info->labels.mask);
if (err)
return err;
@@ -922,7 +1144,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
if (!help) {
- module_put(helper->me);
+ nf_conntrack_helper_put(helper);
return -ENOMEM;
}
@@ -1063,6 +1285,7 @@ static int parse_nat(const struct nlattr *attr,
static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
[OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
+ [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 },
[OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
.maxlen = sizeof(u16) },
[OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark),
@@ -1075,6 +1298,8 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
/* NAT length is checked when parsing the nested attributes. */
[OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX },
#endif
+ [OVS_CT_ATTR_EVENTMASK] = { .minlen = sizeof(u32),
+ .maxlen = sizeof(u32) },
};
static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -1102,6 +1327,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
}
switch (type) {
+ case OVS_CT_ATTR_FORCE_COMMIT:
+ info->force = true;
+ /* fall through. */
case OVS_CT_ATTR_COMMIT:
info->commit = true;
break;
@@ -1150,6 +1378,11 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
break;
}
#endif
+ case OVS_CT_ATTR_EVENTMASK:
+ info->have_eventmask = true;
+ info->eventmask = nla_get_u32(a);
+ break;
+
default:
OVS_NLERR(log, "Unknown conntrack attr (%d)",
type);
@@ -1328,7 +1561,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
if (!start)
return -EMSGSIZE;
- if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT))
+ if (ct_info->commit && nla_put_flag(skb, ct_info->force
+ ? OVS_CT_ATTR_FORCE_COMMIT
+ : OVS_CT_ATTR_COMMIT))
return -EMSGSIZE;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))
@@ -1347,6 +1582,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
ct_info->helper->name))
return -EMSGSIZE;
}
+ if (ct_info->have_eventmask &&
+ nla_put_u32(skb, OVS_CT_ATTR_EVENTMASK, ct_info->eventmask))
+ return -EMSGSIZE;
+
#ifdef CONFIG_NF_NAT_NEEDED
if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
return -EMSGSIZE;
@@ -1366,7 +1605,7 @@ void ovs_ct_free_action(const struct nlattr *a)
static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
{
if (ct_info->helper)
- module_put(ct_info->helper->me);
+ nf_conntrack_helper_put(ct_info->helper);
if (ct_info->ct)
nf_ct_tmpl_free(ct_info->ct);
}
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 8f6230bd6183..bc7efd1867ab 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -32,7 +32,8 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
const struct ovs_conntrack_info *);
void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb);
+int ovs_ct_put_key(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output, struct sk_buff *skb);
void ovs_ct_free_action(const struct nlattr *a);
#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
@@ -75,13 +76,18 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
static inline void ovs_ct_fill_key(const struct sk_buff *skb,
struct sw_flow_key *key)
{
- key->ct.state = 0;
- key->ct.zone = 0;
+ key->ct_state = 0;
+ key->ct_zone = 0;
key->ct.mark = 0;
memset(&key->ct.labels, 0, sizeof(key->ct.labels));
+ /* Clear 'ct_orig_proto' to mark the non-existence of original
+ * direction key fields.
+ */
+ key->ct_orig_proto = 0;
}
-static inline int ovs_ct_put_key(const struct sw_flow_key *key,
+static inline int ovs_ct_put_key(const struct sw_flow_key *swkey,
+ const struct sw_flow_key *output,
struct sk_buff *skb)
{
return 0;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 9c62b6325f7a..45fe8c8a884d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -413,7 +413,7 @@ static void pad_packet(struct datapath *dp, struct sk_buff *skb)
size_t plen = NLA_ALIGN(skb->len) - skb->len;
if (plen > 0)
- memset(skb_put(skb, plen), 0, plen);
+ skb_put_zero(skb, plen);
}
}
@@ -453,7 +453,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
/* Complete checksum if needed */
if (skb->ip_summed == CHECKSUM_PARTIAL &&
- (err = skb_checksum_help(skb)))
+ (err = skb_csum_hwoffload_help(skb, 0)))
goto out;
/* Older versions of OVS user space enforce alignment of the last
@@ -1090,6 +1090,58 @@ static struct sw_flow_actions *get_flow_actions(struct net *net,
return acts;
}
+/* Factor out match-init and action-copy to avoid
+ * "Wframe-larger-than=1024" warning. Because mask is only
+ * used to get actions, we new a function to save some
+ * stack space.
+ *
+ * If there are not key and action attrs, we return 0
+ * directly. In the case, the caller will also not use the
+ * match as before. If there is action attr, we try to get
+ * actions and save them to *acts. Before returning from
+ * the function, we reset the match->mask pointer. Because
+ * we should not to return match object with dangling reference
+ * to mask.
+ * */
+static int ovs_nla_init_match_and_action(struct net *net,
+ struct sw_flow_match *match,
+ struct sw_flow_key *key,
+ struct nlattr **a,
+ struct sw_flow_actions **acts,
+ bool log)
+{
+ struct sw_flow_mask mask;
+ int error = 0;
+
+ if (a[OVS_FLOW_ATTR_KEY]) {
+ ovs_match_init(match, key, true, &mask);
+ error = ovs_nla_get_match(net, match, a[OVS_FLOW_ATTR_KEY],
+ a[OVS_FLOW_ATTR_MASK], log);
+ if (error)
+ goto error;
+ }
+
+ if (a[OVS_FLOW_ATTR_ACTIONS]) {
+ if (!a[OVS_FLOW_ATTR_KEY]) {
+ OVS_NLERR(log,
+ "Flow key attribute not present in set flow.");
+ return -EINVAL;
+ }
+
+ *acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key,
+ &mask, log);
+ if (IS_ERR(*acts)) {
+ error = PTR_ERR(*acts);
+ goto error;
+ }
+ }
+
+ /* On success, error is 0. */
+error:
+ match->mask = NULL;
+ return error;
+}
+
static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = sock_net(skb->sk);
@@ -1097,7 +1149,6 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sw_flow *flow;
- struct sw_flow_mask mask;
struct sk_buff *reply = NULL;
struct datapath *dp;
struct sw_flow_actions *old_acts = NULL, *acts = NULL;
@@ -1109,34 +1160,18 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
bool ufid_present;
ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
- if (a[OVS_FLOW_ATTR_KEY]) {
- ovs_match_init(&match, &key, true, &mask);
- error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
- a[OVS_FLOW_ATTR_MASK], log);
- } else if (!ufid_present) {
+ if (!a[OVS_FLOW_ATTR_KEY] && !ufid_present) {
OVS_NLERR(log,
"Flow set message rejected, Key attribute missing.");
- error = -EINVAL;
+ return -EINVAL;
}
+
+ error = ovs_nla_init_match_and_action(net, &match, &key, a,
+ &acts, log);
if (error)
goto error;
- /* Validate actions. */
- if (a[OVS_FLOW_ATTR_ACTIONS]) {
- if (!a[OVS_FLOW_ATTR_KEY]) {
- OVS_NLERR(log,
- "Flow key attribute not present in set flow.");
- error = -EINVAL;
- goto error;
- }
-
- acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
- &mask, log);
- if (IS_ERR(acts)) {
- error = PTR_ERR(acts);
- goto error;
- }
-
+ if (acts) {
/* Can allocate before locking if have acts. */
reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false,
ufid_flags);
@@ -1353,7 +1388,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
int err;
err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a,
- OVS_FLOW_ATTR_MAX, flow_policy);
+ OVS_FLOW_ATTR_MAX, flow_policy, NULL);
if (err)
return err;
ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 1c6e9377436d..5d8dcd88815f 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -34,8 +34,6 @@
#define DP_MAX_PORTS USHRT_MAX
#define DP_VPORT_HASH_BUCKETS 1024
-#define SAMPLE_ACTION_DEPTH 3
-
/**
* struct dp_stats_percpu - per-cpu packet processing statistics for a given
* datapath.
@@ -100,8 +98,8 @@ struct datapath {
* @input_vport: The original vport packet came in on. This value is cached
* when a packet is received by OVS.
* @mru: The maximum received fragement size; 0 if the packet is not
- * @cutlen: The number of bytes from the packet end to be removed.
* fragmented.
+ * @cutlen: The number of bytes from the packet end to be removed.
*/
struct ovs_skb_cb {
struct vport *input_vport;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 2c0a00f7f1b7..3f76cb765e5b 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -527,7 +527,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
/* Link layer. */
clear_vlan(key);
- if (key->mac_proto == MAC_PROTO_NONE) {
+ if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
if (unlikely(eth_type_vlan(skb->protocol)))
return -EINVAL;
@@ -745,7 +745,13 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
{
- return key_extract(skb, key);
+ int res;
+
+ res = key_extract(skb, key);
+ if (!res)
+ key->mac_proto &= ~SW_FLOW_KEY_INVALID;
+
+ return res;
}
static int key_extract_mac_proto(struct sk_buff *skb)
@@ -765,7 +771,7 @@ static int key_extract_mac_proto(struct sk_buff *skb)
int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb, struct sw_flow_key *key)
{
- int res;
+ int res, err;
/* Extract metadata from packet. */
if (tun_info) {
@@ -792,7 +798,6 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
key->phy.priority = skb->priority;
key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
key->phy.skb_mark = skb->mark;
- ovs_ct_fill_key(skb, key);
key->ovs_flow_hash = 0;
res = key_extract_mac_proto(skb);
if (res < 0)
@@ -800,17 +805,26 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
key->mac_proto = res;
key->recirc_id = 0;
- return key_extract(skb, key);
+ err = key_extract(skb, key);
+ if (!err)
+ ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */
+ return err;
}
int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
struct sk_buff *skb,
struct sw_flow_key *key, bool log)
{
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+ u64 attrs = 0;
int err;
+ err = parse_flow_nlattrs(attr, a, &attrs, log);
+ if (err)
+ return -EINVAL;
+
/* Extract metadata from netlink attributes. */
- err = ovs_nla_get_flow_metadata(net, attr, key, log);
+ err = ovs_nla_get_flow_metadata(net, a, attrs, key, log);
if (err)
return err;
@@ -824,5 +838,21 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
*/
skb->protocol = key->eth.type;
- return key_extract(skb, key);
+ err = key_extract(skb, key);
+ if (err)
+ return err;
+
+ /* Check that we have conntrack original direction tuple metadata only
+ * for packets for which it makes sense. Otherwise the key may be
+ * corrupted due to overlapping key fields.
+ */
+ if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) &&
+ key->eth.type != htons(ETH_P_IP))
+ return -EINVAL;
+ if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) &&
+ (key->eth.type != htons(ETH_P_IPV6) ||
+ sw_flow_key_is_nd(key)))
+ return -EINVAL;
+
+ return 0;
}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index f61cae7f9030..a9bc1c875965 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -85,6 +85,11 @@ struct sw_flow_key {
struct vlan_head cvlan;
__be16 type; /* Ethernet frame type. */
} eth;
+ /* Filling a hole of two bytes. */
+ u8 ct_state;
+ u8 ct_orig_proto; /* CT original direction tuple IP
+ * protocol.
+ */
union {
struct {
__be32 top_lse; /* top label stack entry */
@@ -96,6 +101,7 @@ struct sw_flow_key {
u8 frag; /* One of OVS_FRAG_TYPE_*. */
} ip;
};
+ u16 ct_zone; /* Conntrack zone. */
struct {
__be16 src; /* TCP/UDP/SCTP source port. */
__be16 dst; /* TCP/UDP/SCTP destination port. */
@@ -107,10 +113,16 @@ struct sw_flow_key {
__be32 src; /* IP source address. */
__be32 dst; /* IP destination address. */
} addr;
- struct {
- u8 sha[ETH_ALEN]; /* ARP source hardware address. */
- u8 tha[ETH_ALEN]; /* ARP target hardware address. */
- } arp;
+ union {
+ struct {
+ __be32 src;
+ __be32 dst;
+ } ct_orig; /* Conntrack original direction fields. */
+ struct {
+ u8 sha[ETH_ALEN]; /* ARP source hardware address. */
+ u8 tha[ETH_ALEN]; /* ARP target hardware address. */
+ } arp;
+ };
} ipv4;
struct {
struct {
@@ -118,23 +130,40 @@ struct sw_flow_key {
struct in6_addr dst; /* IPv6 destination address. */
} addr;
__be32 label; /* IPv6 flow label. */
- struct {
- struct in6_addr target; /* ND target address. */
- u8 sll[ETH_ALEN]; /* ND source link layer address. */
- u8 tll[ETH_ALEN]; /* ND target link layer address. */
- } nd;
+ union {
+ struct {
+ struct in6_addr src;
+ struct in6_addr dst;
+ } ct_orig; /* Conntrack original direction fields. */
+ struct {
+ struct in6_addr target; /* ND target address. */
+ u8 sll[ETH_ALEN]; /* ND source link layer address. */
+ u8 tll[ETH_ALEN]; /* ND target link layer address. */
+ } nd;
+ };
} ipv6;
};
struct {
- /* Connection tracking fields. */
- u16 zone;
+ /* Connection tracking fields not packed above. */
+ struct {
+ __be16 src; /* CT orig tuple tp src port. */
+ __be16 dst; /* CT orig tuple tp dst port. */
+ } orig_tp;
u32 mark;
- u8 state;
struct ovs_key_ct_labels labels;
} ct;
} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */
+static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key)
+{
+ return key->eth.type == htons(ETH_P_IPV6) &&
+ key->ip.proto == NEXTHDR_ICMP &&
+ key->tp.dst == 0 &&
+ (key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+ key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT));
+}
+
struct sw_flow_key_range {
unsigned short int start;
unsigned short int end;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index c87d359b9b37..f07d10ac35d8 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2017 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -59,6 +59,39 @@ struct ovs_len_tbl {
#define OVS_ATTR_NESTED -1
#define OVS_ATTR_VARIABLE -2
+static bool actions_may_change_flow(const struct nlattr *actions)
+{
+ struct nlattr *nla;
+ int rem;
+
+ nla_for_each_nested(nla, actions, rem) {
+ u16 action = nla_type(nla);
+
+ switch (action) {
+ case OVS_ACTION_ATTR_OUTPUT:
+ case OVS_ACTION_ATTR_RECIRC:
+ case OVS_ACTION_ATTR_TRUNC:
+ case OVS_ACTION_ATTR_USERSPACE:
+ break;
+
+ case OVS_ACTION_ATTR_CT:
+ case OVS_ACTION_ATTR_HASH:
+ case OVS_ACTION_ATTR_POP_ETH:
+ case OVS_ACTION_ATTR_POP_MPLS:
+ case OVS_ACTION_ATTR_POP_VLAN:
+ case OVS_ACTION_ATTR_PUSH_ETH:
+ case OVS_ACTION_ATTR_PUSH_MPLS:
+ case OVS_ACTION_ATTR_PUSH_VLAN:
+ case OVS_ACTION_ATTR_SAMPLE:
+ case OVS_ACTION_ATTR_SET:
+ case OVS_ACTION_ATTR_SET_MASKED:
+ default:
+ return true;
+ }
+ }
+ return false;
+}
+
static void update_range(struct sw_flow_match *match,
size_t offset, size_t size, bool is_mask)
{
@@ -129,7 +162,9 @@ static bool match_validate(const struct sw_flow_match *match,
/* The following mask attributes allowed only if they
* pass the validation tests. */
mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+ | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)
| (1 << OVS_KEY_ATTR_IPV6)
+ | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)
| (1 << OVS_KEY_ATTR_TCP)
| (1 << OVS_KEY_ATTR_TCP_FLAGS)
| (1 << OVS_KEY_ATTR_UDP)
@@ -161,8 +196,10 @@ static bool match_validate(const struct sw_flow_match *match,
if (match->key->eth.type == htons(ETH_P_IP)) {
key_expected |= 1 << OVS_KEY_ATTR_IPV4;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+ mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4;
+ }
if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
if (match->key->ip.proto == IPPROTO_UDP) {
@@ -196,8 +233,10 @@ static bool match_validate(const struct sw_flow_match *match,
if (match->key->eth.type == htons(ETH_P_IPV6)) {
key_expected |= 1 << OVS_KEY_ATTR_IPV6;
- if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ if (match->mask && match->mask->key.eth.type == htons(0xffff)) {
mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+ mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6;
+ }
if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
if (match->key->ip.proto == IPPROTO_UDP) {
@@ -230,6 +269,12 @@ static bool match_validate(const struct sw_flow_match *match,
htons(NDISC_NEIGHBOUR_SOLICITATION) ||
match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
key_expected |= 1 << OVS_KEY_ATTR_ND;
+ /* Original direction conntrack tuple
+ * uses the same space as the ND fields
+ * in the key, so both are not allowed
+ * at the same time.
+ */
+ mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
if (match->mask && (match->mask->key.tp.src == htons(0xff)))
mask_allowed |= 1 << OVS_KEY_ATTR_ND;
}
@@ -282,7 +327,7 @@ size_t ovs_key_attr_size(void)
/* Whenever adding new OVS_KEY_ FIELDS, we should consider
* updating this function.
*/
- BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26);
+ BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 28);
return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
+ nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */
@@ -295,6 +340,7 @@ size_t ovs_key_attr_size(void)
+ nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */
+ nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */
+ nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */
+ + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */
+ nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
+ nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ nla_total_size(4) /* OVS_KEY_ATTR_VLAN */
@@ -355,6 +401,10 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
[OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) },
[OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) },
[OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) },
+ [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = {
+ .len = sizeof(struct ovs_key_ct_tuple_ipv4) },
+ [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = {
+ .len = sizeof(struct ovs_key_ct_tuple_ipv6) },
};
static bool check_attr_len(unsigned int attr_len, unsigned int expected_len)
@@ -430,9 +480,8 @@ static int parse_flow_mask_nlattrs(const struct nlattr *attr,
return __parse_flow_nlattrs(attr, a, attrsp, log, true);
}
-static int parse_flow_nlattrs(const struct nlattr *attr,
- const struct nlattr *a[], u64 *attrsp,
- bool log)
+int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
+ u64 *attrsp, bool log)
{
return __parse_flow_nlattrs(attr, a, attrsp, log, false);
}
@@ -588,7 +637,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
ipv4 = true;
break;
case OVS_TUNNEL_KEY_ATTR_IPV6_SRC:
- SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst,
+ SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src,
nla_get_in6_addr(a), is_mask);
ipv6 = true;
break;
@@ -649,6 +698,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
tun_flags |= TUNNEL_VXLAN_OPT;
opts_type = type;
break;
+ case OVS_TUNNEL_KEY_ATTR_PAD:
+ break;
default:
OVS_NLERR(log, "Unknown IP tunnel attribute %d",
type);
@@ -1056,14 +1107,14 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
return -EINVAL;
}
- SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask);
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE);
}
if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) &&
ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) {
u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]);
- SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask);
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE);
}
if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) &&
@@ -1082,6 +1133,34 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
sizeof(*cl), is_mask);
*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS);
}
+ if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) {
+ const struct ovs_key_ct_tuple_ipv4 *ct;
+
+ ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]);
+
+ SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask);
+ SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask);
+ *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4);
+ }
+ if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) {
+ const struct ovs_key_ct_tuple_ipv6 *ct;
+
+ ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]);
+
+ SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src,
+ sizeof(match->key->ipv6.ct_orig.src),
+ is_mask);
+ SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst,
+ sizeof(match->key->ipv6.ct_orig.dst),
+ is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask);
+ SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask);
+ *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6);
+ }
/* For layer 3 packets the Ethernet type is provided
* and treated as metadata but no MAC addresses are provided.
@@ -1493,9 +1572,12 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
/**
* ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key.
- * @key: Receives extracted in_port, priority, tun_key and skb_mark.
- * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
+ * @net: Network namespace.
+ * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack
+ * metadata.
+ * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink
+ * attributes.
+ * @attrs: Bit mask for the netlink attributes included in @a.
* @log: Boolean to allow kernel error logging. Normally true, but when
* probing for feature compatibility this should be passed in as false to
* suppress unnecessary error logging.
@@ -1504,25 +1586,26 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr)
* take the same form accepted by flow_from_nlattrs(), but only enough of it to
* get the metadata, that is, the parts of the flow key that cannot be
* extracted from the packet itself.
+ *
+ * This must be called before the packet key fields are filled in 'key'.
*/
-int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr,
- struct sw_flow_key *key,
- bool log)
+int ovs_nla_get_flow_metadata(struct net *net,
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
+ u64 attrs, struct sw_flow_key *key, bool log)
{
- const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
struct sw_flow_match match;
- u64 attrs = 0;
- int err;
-
- err = parse_flow_nlattrs(attr, a, &attrs, log);
- if (err)
- return -EINVAL;
memset(&match, 0, sizeof(match));
match.key = key;
+ key->ct_state = 0;
+ key->ct_zone = 0;
+ key->ct_orig_proto = 0;
memset(&key->ct, 0, sizeof(key->ct));
+ memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig));
+ memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig));
+
key->phy.in_port = DP_MAX_PORTS;
return metadata_from_nlattrs(net, &match, &attrs, a, false, log);
@@ -1584,7 +1667,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
goto nla_put_failure;
- if (ovs_ct_put_key(output, skb))
+ if (ovs_ct_put_key(swkey, output, skb))
goto nla_put_failure;
if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) {
@@ -1973,18 +2056,20 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa,
static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
const struct sw_flow_key *key,
- int depth, struct sw_flow_actions **sfa,
+ struct sw_flow_actions **sfa,
__be16 eth_type, __be16 vlan_tci, bool log);
static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
- const struct sw_flow_key *key, int depth,
+ const struct sw_flow_key *key,
struct sw_flow_actions **sfa,
- __be16 eth_type, __be16 vlan_tci, bool log)
+ __be16 eth_type, __be16 vlan_tci,
+ bool log, bool last)
{
const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
const struct nlattr *probability, *actions;
const struct nlattr *a;
- int rem, start, err, st_acts;
+ int rem, start, err;
+ struct sample_arg arg;
memset(attrs, 0, sizeof(attrs));
nla_for_each_nested(a, attr, rem) {
@@ -2008,20 +2093,32 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log);
if (start < 0)
return start;
- err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY,
- nla_data(probability), sizeof(u32), log);
+
+ /* When both skb and flow may be changed, put the sample
+ * into a deferred fifo. On the other hand, if only skb
+ * may be modified, the actions can be executed in place.
+ *
+ * Do this analysis at the flow installation time.
+ * Set 'clone_action->exec' to true if the actions can be
+ * executed without being deferred.
+ *
+ * If the sample is the last action, it can always be excuted
+ * rather than deferred.
+ */
+ arg.exec = last || !actions_may_change_flow(actions);
+ arg.probability = nla_get_u32(probability);
+
+ err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_ARG, &arg, sizeof(arg),
+ log);
if (err)
return err;
- st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log);
- if (st_acts < 0)
- return st_acts;
- err = __ovs_nla_copy_actions(net, actions, key, depth + 1, sfa,
+ err = __ovs_nla_copy_actions(net, actions, key, sfa,
eth_type, vlan_tci, log);
+
if (err)
return err;
- add_nested_action_end(*sfa, st_acts);
add_nested_action_end(*sfa, start);
return 0;
@@ -2105,7 +2202,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
if (start < 0)
return start;
- tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL);
+ tun_dst = metadata_dst_alloc(key.tun_opts_len, METADATA_IP_TUNNEL,
+ GFP_KERNEL);
+
if (!tun_dst)
return -ENOMEM;
@@ -2330,8 +2429,8 @@ static int validate_userspace(const struct nlattr *attr)
struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
int error;
- error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
- attr, userspace_policy);
+ error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, attr,
+ userspace_policy, NULL);
if (error)
return error;
@@ -2358,16 +2457,13 @@ static int copy_action(const struct nlattr *from,
static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
const struct sw_flow_key *key,
- int depth, struct sw_flow_actions **sfa,
+ struct sw_flow_actions **sfa,
__be16 eth_type, __be16 vlan_tci, bool log)
{
u8 mac_proto = ovs_key_mac_proto(key);
const struct nlattr *a;
int rem, err;
- if (depth >= SAMPLE_ACTION_DEPTH)
- return -EOVERFLOW;
-
nla_for_each_nested(a, attr, rem) {
/* Expected argument lengths, (u32)-1 for variable length. */
static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
@@ -2505,13 +2601,17 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
return err;
break;
- case OVS_ACTION_ATTR_SAMPLE:
- err = validate_and_copy_sample(net, a, key, depth, sfa,
- eth_type, vlan_tci, log);
+ case OVS_ACTION_ATTR_SAMPLE: {
+ bool last = nla_is_last(a, rem);
+
+ err = validate_and_copy_sample(net, a, key, sfa,
+ eth_type, vlan_tci,
+ log, last);
if (err)
return err;
skip_copy = true;
break;
+ }
case OVS_ACTION_ATTR_CT:
err = ovs_ct_copy_action(net, a, key, sfa, log);
@@ -2565,7 +2665,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
return PTR_ERR(*sfa);
(*sfa)->orig_len = nla_len(attr);
- err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type,
+ err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type,
key->eth.vlan.tci, log);
if (err)
ovs_nla_free_flow_actions(*sfa);
@@ -2573,39 +2673,44 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
return err;
}
-static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
+static int sample_action_to_attr(const struct nlattr *attr,
+ struct sk_buff *skb)
{
- const struct nlattr *a;
- struct nlattr *start;
- int err = 0, rem;
+ struct nlattr *start, *ac_start = NULL, *sample_arg;
+ int err = 0, rem = nla_len(attr);
+ const struct sample_arg *arg;
+ struct nlattr *actions;
start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE);
if (!start)
return -EMSGSIZE;
- nla_for_each_nested(a, attr, rem) {
- int type = nla_type(a);
- struct nlattr *st_sample;
+ sample_arg = nla_data(attr);
+ arg = nla_data(sample_arg);
+ actions = nla_next(sample_arg, &rem);
- switch (type) {
- case OVS_SAMPLE_ATTR_PROBABILITY:
- if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY,
- sizeof(u32), nla_data(a)))
- return -EMSGSIZE;
- break;
- case OVS_SAMPLE_ATTR_ACTIONS:
- st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS);
- if (!st_sample)
- return -EMSGSIZE;
- err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb);
- if (err)
- return err;
- nla_nest_end(skb, st_sample);
- break;
- }
+ if (nla_put_u32(skb, OVS_SAMPLE_ATTR_PROBABILITY, arg->probability)) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ ac_start = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS);
+ if (!ac_start) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ err = ovs_nla_put_actions(actions, rem, skb);
+
+out:
+ if (err) {
+ nla_nest_cancel(skb, ac_start);
+ nla_nest_cancel(skb, start);
+ } else {
+ nla_nest_end(skb, ac_start);
+ nla_nest_end(skb, start);
}
- nla_nest_end(skb, start);
return err;
}
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 45f9769e5aac..929c665ac3aa 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -46,8 +46,11 @@ void ovs_match_init(struct sw_flow_match *match,
int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *,
int attr, bool is_mask, struct sk_buff *);
-int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *,
- struct sw_flow_key *, bool log);
+int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[],
+ u64 *attrsp, bool log);
+int ovs_nla_get_flow_metadata(struct net *net,
+ const struct nlattr *a[OVS_KEY_ATTR_MAX + 1],
+ u64 attrs, struct sw_flow_key *key, bool log);
int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb);
int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb);
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index d5d6caecd072..04a3128adcf0 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -94,10 +94,9 @@ static void internal_dev_destructor(struct net_device *dev)
struct vport *vport = ovs_internal_dev_get_vport(dev);
ovs_vport_free(vport);
- free_netdev(dev);
}
-static struct rtnl_link_stats64 *
+static void
internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
int i;
@@ -125,8 +124,6 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
stats->tx_bytes += local_stats.tx_bytes;
stats->tx_packets += local_stats.tx_packets;
}
-
- return stats;
}
static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
@@ -151,12 +148,15 @@ static void do_setup(struct net_device *netdev)
{
ether_setup(netdev);
+ netdev->max_mtu = ETH_MAX_MTU;
+
netdev->netdev_ops = &internal_dev_netdev_ops;
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
IFF_PHONY_HEADROOM | IFF_NO_QUEUE;
- netdev->destructor = internal_dev_destructor;
+ netdev->needs_free_netdev = true;
+ netdev->priv_destructor = internal_dev_destructor;
netdev->ethtool_ops = &internal_dev_ethtool_ops;
netdev->rtnl_link_ops = &internal_dev_link_ops;
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 7eb955e453e6..7e6301b2ec4d 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -40,14 +40,14 @@ static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
return -EMSGSIZE;
- if (vxlan->flags & VXLAN_F_GBP) {
+ if (vxlan->cfg.flags & VXLAN_F_GBP) {
struct nlattr *exts;
exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
if (!exts)
return -EMSGSIZE;
- if (vxlan->flags & VXLAN_F_GBP &&
+ if (vxlan->cfg.flags & VXLAN_F_GBP &&
nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
return -EMSGSIZE;
@@ -70,7 +70,8 @@ static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr,
if (nla_len(attr) < sizeof(struct nlattr))
return -EINVAL;
- err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy);
+ err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy,
+ NULL);
if (err < 0)
return err;