diff options
-rw-r--r-- | Documentation/networking/device_drivers/mellanox/mlx5.rst | 46 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h | 54 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c | 58 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h | 114 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 224 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 11 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 152 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 9 | ||||
-rw-r--r-- | include/net/flow_offload.h | 1 |
10 files changed, 545 insertions, 126 deletions
diff --git a/Documentation/networking/device_drivers/mellanox/mlx5.rst b/Documentation/networking/device_drivers/mellanox/mlx5.rst index cfda464e52de..b30a63dbf4b7 100644 --- a/Documentation/networking/device_drivers/mellanox/mlx5.rst +++ b/Documentation/networking/device_drivers/mellanox/mlx5.rst @@ -12,6 +12,7 @@ Contents - `Enabling the driver and kconfig options`_ - `Devlink info`_ - `Devlink health reporters`_ +- `mlx5 tracepoints`_ Enabling the driver and kconfig options ================================================ @@ -219,3 +220,48 @@ User commands examples: $ devlink health dump show pci/0000:82:00.1 reporter fw_fatal NOTE: This command can run only on PF. + +mlx5 tracepoints +================ + +mlx5 driver provides internal trace points for tracking and debugging using +kernel tracepoints interfaces (refer to Documentation/trace/ftrace.rst). + +For the list of supported mlx5 events, check /sys/kernel/debug/tracing/events/mlx5/ + +tc and eswitch offloads tracepoints: + +- mlx5e_configure_flower: trace flower filter actions and cookies offloaded to mlx5:: + + $ echo mlx5:mlx5e_configure_flower >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + tc-6535 [019] ...1 2672.404466: mlx5e_configure_flower: cookie=0000000067874a55 actions= REDIRECT + +- mlx5e_delete_flower: trace flower filter actions and cookies deleted from mlx5:: + + $ echo mlx5:mlx5e_delete_flower >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + tc-6569 [010] .N.1 2686.379075: mlx5e_delete_flower: cookie=0000000067874a55 actions= NULL + +- mlx5e_stats_flower: trace flower stats request:: + + $ echo mlx5:mlx5e_stats_flower >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... 
+ tc-6546 [010] ...1 2679.704889: mlx5e_stats_flower: cookie=0000000060eb3d6a bytes=0 packets=0 lastused=4295560217 + +- mlx5e_tc_update_neigh_used_value: trace tunnel rule neigh update value offloaded to mlx5:: + + $ echo mlx5:mlx5e_tc_update_neigh_used_value >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + kworker/u48:4-8806 [009] ...1 55117.882428: mlx5e_tc_update_neigh_used_value: netdev: ens1f0 IPv4: 1.1.1.10 IPv6: ::ffff:1.1.1.10 neigh_used=1 + +- mlx5e_rep_neigh_update: trace neigh update tasks scheduled due to neigh state change events:: + + $ echo mlx5:mlx5e_rep_neigh_update >> /sys/kernel/debug/tracing/set_event + $ cat /sys/kernel/debug/tracing/trace + ... + kworker/u48:7-2221 [009] ...1 1475.387435: mlx5e_rep_neigh_update: netdev: ens1f0 MAC: 24:8a:07:9a:17:9a IPv4: 1.1.1.10 IPv6: ::ffff:1.1.1.10 neigh_connected=1 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index a3b9659649a8..bcf36552f069 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -35,7 +35,7 @@ mlx5_core-$(CONFIG_MLX5_EN_RXNFC) += en_fs_ethtool.o mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o mlx5_core-$(CONFIG_MLX5_ESWITCH) += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o \ lib/geneve.o en/tc_tun_vxlan.o en/tc_tun_gre.o \ - en/tc_tun_geneve.o + en/tc_tun_geneve.o diag/en_tc_tracepoint.o # # Core extra diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h new file mode 100644 index 000000000000..1177860a2ee4 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#if !defined(_MLX5_EN_REP_TP_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_EN_REP_TP_ + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> +#include "en_rep.h" + +TRACE_EVENT(mlx5e_rep_neigh_update, + TP_PROTO(const struct mlx5e_neigh_hash_entry *nhe, const u8 *ha, + bool neigh_connected), + TP_ARGS(nhe, ha, neigh_connected), + TP_STRUCT__entry(__string(devname, nhe->m_neigh.dev->name) + __array(u8, ha, ETH_ALEN) + __array(u8, v4, 4) + __array(u8, v6, 16) + __field(bool, neigh_connected) + ), + TP_fast_assign(const struct mlx5e_neigh *mn = &nhe->m_neigh; + struct in6_addr *pin6; + __be32 *p32; + + __assign_str(devname, mn->dev->name); + __entry->neigh_connected = neigh_connected; + memcpy(__entry->ha, ha, ETH_ALEN); + + p32 = (__be32 *)__entry->v4; + pin6 = (struct in6_addr *)__entry->v6; + if (mn->family == AF_INET) { + *p32 = mn->dst_ip.v4; + ipv6_addr_set_v4mapped(*p32, pin6); + } else if (mn->family == AF_INET6) { + *pin6 = mn->dst_ip.v6; + } + ), + TP_printk("netdev: %s MAC: %pM IPv4: %pI4 IPv6: %pI6c neigh_connected=%d\n", + __get_str(devname), __entry->ha, + __entry->v4, __entry->v6, __entry->neigh_connected + ) +); + +#endif /* _MLX5_EN_REP_TP_ */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ./diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE en_rep_tracepoint +#include <trace/define_trace.h> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c new file mode 100644 index 000000000000..c5dc6c50fa87 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#define CREATE_TRACE_POINTS +#include "en_tc_tracepoint.h" + +void put_ids_to_array(int *ids, + const struct flow_action_entry *entries, + unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + ids[i] = entries[i].id; +} + +#define NAME_SIZE 16 + +static const char FLOWACT2STR[NUM_FLOW_ACTIONS][NAME_SIZE] = { + [FLOW_ACTION_ACCEPT] = "ACCEPT", + [FLOW_ACTION_DROP] = "DROP", + [FLOW_ACTION_TRAP] = "TRAP", + [FLOW_ACTION_GOTO] = "GOTO", + [FLOW_ACTION_REDIRECT] = "REDIRECT", + [FLOW_ACTION_MIRRED] = "MIRRED", + [FLOW_ACTION_VLAN_PUSH] = "VLAN_PUSH", + [FLOW_ACTION_VLAN_POP] = "VLAN_POP", + [FLOW_ACTION_VLAN_MANGLE] = "VLAN_MANGLE", + [FLOW_ACTION_TUNNEL_ENCAP] = "TUNNEL_ENCAP", + [FLOW_ACTION_TUNNEL_DECAP] = "TUNNEL_DECAP", + [FLOW_ACTION_MANGLE] = "MANGLE", + [FLOW_ACTION_ADD] = "ADD", + [FLOW_ACTION_CSUM] = "CSUM", + [FLOW_ACTION_MARK] = "MARK", + [FLOW_ACTION_WAKE] = "WAKE", + [FLOW_ACTION_QUEUE] = "QUEUE", + [FLOW_ACTION_SAMPLE] = "SAMPLE", + [FLOW_ACTION_POLICE] = "POLICE", + [FLOW_ACTION_CT] = "CT", +}; + +const char *parse_action(struct trace_seq *p, + int *ids, + unsigned int num) +{ + const char *ret = trace_seq_buffer_ptr(p); + unsigned int i; + + for (i = 0; i < num; i++) { + if (ids[i] < NUM_FLOW_ACTIONS) + trace_seq_printf(p, "%s ", FLOWACT2STR[ids[i]]); + else + trace_seq_printf(p, "UNKNOWN "); + } + + trace_seq_putc(p, 0); + return ret; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h new file mode 100644 index 000000000000..d4e6cfaaade3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mlx5 + +#if !defined(_MLX5_TC_TP_) || defined(TRACE_HEADER_MULTI_READ) +#define _MLX5_TC_TP_ + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> +#include <net/flow_offload.h> +#include "en_rep.h" + +#define __parse_action(ids, num) parse_action(p, ids, num) + +void put_ids_to_array(int *ids, + const struct flow_action_entry *entries, + unsigned int num); + +const char *parse_action(struct trace_seq *p, + int *ids, + unsigned int num); + +DECLARE_EVENT_CLASS(mlx5e_flower_template, + TP_PROTO(const struct flow_cls_offload *f), + TP_ARGS(f), + TP_STRUCT__entry(__field(void *, cookie) + __field(unsigned int, num) + __dynamic_array(int, ids, f->rule ? + f->rule->action.num_entries : 0) + ), + TP_fast_assign(__entry->cookie = (void *)f->cookie; + __entry->num = (f->rule ? + f->rule->action.num_entries : 0); + if (__entry->num) + put_ids_to_array(__get_dynamic_array(ids), + f->rule->action.entries, + f->rule->action.num_entries); + ), + TP_printk("cookie=%p actions= %s\n", + __entry->cookie, __entry->num ? 
+ __parse_action(__get_dynamic_array(ids), + __entry->num) : "NULL" + ) +); + +DEFINE_EVENT(mlx5e_flower_template, mlx5e_configure_flower, + TP_PROTO(const struct flow_cls_offload *f), + TP_ARGS(f) + ); + +DEFINE_EVENT(mlx5e_flower_template, mlx5e_delete_flower, + TP_PROTO(const struct flow_cls_offload *f), + TP_ARGS(f) + ); + +TRACE_EVENT(mlx5e_stats_flower, + TP_PROTO(const struct flow_cls_offload *f), + TP_ARGS(f), + TP_STRUCT__entry(__field(void *, cookie) + __field(u64, bytes) + __field(u64, packets) + __field(u64, lastused) + ), + TP_fast_assign(__entry->cookie = (void *)f->cookie; + __entry->bytes = f->stats.bytes; + __entry->packets = f->stats.pkts; + __entry->lastused = f->stats.lastused; + ), + TP_printk("cookie=%p bytes=%llu packets=%llu lastused=%llu\n", + __entry->cookie, __entry->bytes, + __entry->packets, __entry->lastused + ) +); + +TRACE_EVENT(mlx5e_tc_update_neigh_used_value, + TP_PROTO(const struct mlx5e_neigh_hash_entry *nhe, bool neigh_used), + TP_ARGS(nhe, neigh_used), + TP_STRUCT__entry(__string(devname, nhe->m_neigh.dev->name) + __array(u8, v4, 4) + __array(u8, v6, 16) + __field(bool, neigh_used) + ), + TP_fast_assign(const struct mlx5e_neigh *mn = &nhe->m_neigh; + struct in6_addr *pin6; + __be32 *p32; + + __assign_str(devname, mn->dev->name); + __entry->neigh_used = neigh_used; + + p32 = (__be32 *)__entry->v4; + pin6 = (struct in6_addr *)__entry->v6; + if (mn->family == AF_INET) { + *p32 = mn->dst_ip.v4; + ipv6_addr_set_v4mapped(*p32, pin6); + } else if (mn->family == AF_INET6) { + *pin6 = mn->dst_ip.v6; + } + ), + TP_printk("netdev: %s IPv4: %pI4 IPv6: %pI6c neigh_used=%d\n", + __get_str(devname), __entry->v4, __entry->v6, + __entry->neigh_used + ) +); + +#endif /* _MLX5_TC_TP_ */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ./diag +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE en_tc_tracepoint +#include <trace/define_trace.h> diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 7ce5cb6e527e..3c0d36b2b91c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -46,6 +46,8 @@ #include "en/tc_tun.h" #include "fs_core.h" #include "lib/port_tun.h" +#define CREATE_TRACE_POINTS +#include "diag/en_rep_tracepoint.h" #define MLX5E_REP_PARAMS_DEF_LOG_SQ_SIZE \ max(0x7, MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE) @@ -524,47 +526,97 @@ void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv) neigh_update->min_interval); } +static bool mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe) +{ + return refcount_inc_not_zero(&nhe->refcnt); +} + +static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe); + +static void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe) +{ + if (refcount_dec_and_test(&nhe->refcnt)) { + mlx5e_rep_neigh_entry_remove(nhe); + kfree_rcu(nhe, rcu); + } +} + +static struct mlx5e_neigh_hash_entry * +mlx5e_get_next_nhe(struct mlx5e_rep_priv *rpriv, + struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_neigh_hash_entry *next = NULL; + + rcu_read_lock(); + + for (next = nhe ? 
+ list_next_or_null_rcu(&rpriv->neigh_update.neigh_list, + &nhe->neigh_list, + struct mlx5e_neigh_hash_entry, + neigh_list) : + list_first_or_null_rcu(&rpriv->neigh_update.neigh_list, + struct mlx5e_neigh_hash_entry, + neigh_list); + next; + next = list_next_or_null_rcu(&rpriv->neigh_update.neigh_list, + &next->neigh_list, + struct mlx5e_neigh_hash_entry, + neigh_list)) + if (mlx5e_rep_neigh_entry_hold(next)) + break; + + rcu_read_unlock(); + + if (nhe) + mlx5e_rep_neigh_entry_release(nhe); + + return next; +} + static void mlx5e_rep_neigh_stats_work(struct work_struct *work) { struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv, neigh_update.neigh_stats_work.work); struct net_device *netdev = rpriv->netdev; struct mlx5e_priv *priv = netdev_priv(netdev); - struct mlx5e_neigh_hash_entry *nhe; + struct mlx5e_neigh_hash_entry *nhe = NULL; rtnl_lock(); if (!list_empty(&rpriv->neigh_update.neigh_list)) mlx5e_rep_queue_neigh_stats_work(priv); - list_for_each_entry(nhe, &rpriv->neigh_update.neigh_list, neigh_list) + while ((nhe = mlx5e_get_next_nhe(rpriv, nhe)) != NULL) mlx5e_tc_update_neigh_used_value(nhe); rtnl_unlock(); } -static void mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe) -{ - refcount_inc(&nhe->refcnt); -} - -static void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe) -{ - if (refcount_dec_and_test(&nhe->refcnt)) - kfree(nhe); -} - static void mlx5e_rep_update_flows(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e, bool neigh_connected, unsigned char ha[ETH_ALEN]) { struct ethhdr *eth = (struct ethhdr *)e->encap_header; + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + bool encap_connected; + LIST_HEAD(flow_list); ASSERT_RTNL(); + /* wait for encap to be fully initialized */ + wait_for_completion(&e->res_ready); + + mutex_lock(&esw->offloads.encap_tbl_lock); + encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID); + if (e->compl_result || (encap_connected == neigh_connected && + 
ether_addr_equal(e->h_dest, ha))) + goto unlock; + + mlx5e_take_all_encap_flows(e, &flow_list); + if ((e->flags & MLX5_ENCAP_ENTRY_VALID) && (!neigh_connected || !ether_addr_equal(e->h_dest, ha))) - mlx5e_tc_encap_flows_del(priv, e); + mlx5e_tc_encap_flows_del(priv, e, &flow_list); if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) { ether_addr_copy(e->h_dest, ha); @@ -574,8 +626,11 @@ static void mlx5e_rep_update_flows(struct mlx5e_priv *priv, */ ether_addr_copy(eth->h_source, e->route_dev->dev_addr); - mlx5e_tc_encap_flows_add(priv, e); + mlx5e_tc_encap_flows_add(priv, e, &flow_list); } +unlock: + mutex_unlock(&esw->offloads.encap_tbl_lock); + mlx5e_put_encap_flow_list(priv, &flow_list); } static void mlx5e_rep_neigh_update(struct work_struct *work) @@ -587,7 +642,6 @@ static void mlx5e_rep_neigh_update(struct work_struct *work) unsigned char ha[ETH_ALEN]; struct mlx5e_priv *priv; bool neigh_connected; - bool encap_connected; u8 nud_state, dead; rtnl_lock(); @@ -605,17 +659,14 @@ static void mlx5e_rep_neigh_update(struct work_struct *work) neigh_connected = (nud_state & NUD_VALID) && !dead; + trace_mlx5e_rep_neigh_update(nhe, ha, neigh_connected); + list_for_each_entry(e, &nhe->encap_list, encap_list) { if (!mlx5e_encap_take(e)) continue; - encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID); priv = netdev_priv(e->out_dev); - - if (encap_connected != neigh_connected || - !ether_addr_equal(e->h_dest, ha)) - mlx5e_rep_update_flows(priv, e, neigh_connected, ha); - + mlx5e_rep_update_flows(priv, e, neigh_connected, ha); mlx5e_encap_put(priv, e); } mlx5e_rep_neigh_entry_release(nhe); @@ -821,6 +872,28 @@ static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb, return NOTIFY_OK; } +static void +mlx5e_rep_queue_neigh_update_work(struct mlx5e_priv *priv, + struct mlx5e_neigh_hash_entry *nhe, + struct neighbour *n) +{ + /* Take a reference to ensure the neighbour and mlx5 encap + * entry won't be destructed until we drop the reference in + * 
delayed work. + */ + neigh_hold(n); + + /* This assignment is valid as long as the neigh reference + * is taken + */ + nhe->n = n; + + if (!queue_work(priv->wq, &nhe->neigh_update_work)) { + mlx5e_rep_neigh_entry_release(nhe); + neigh_release(n); + } +} + static struct mlx5e_neigh_hash_entry * mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv, struct mlx5e_neigh *m_neigh); @@ -853,34 +926,13 @@ static int mlx5e_rep_netevent_event(struct notifier_block *nb, m_neigh.family = n->ops->family; memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len); - /* We are in atomic context and can't take RTNL mutex, so use - * spin_lock_bh to lookup the neigh table. bh is used since - * netevent can be called from a softirq context. - */ - spin_lock_bh(&neigh_update->encap_lock); + rcu_read_lock(); nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh); - if (!nhe) { - spin_unlock_bh(&neigh_update->encap_lock); + rcu_read_unlock(); + if (!nhe) return NOTIFY_DONE; - } - - /* This assignment is valid as long as the the neigh reference - * is taken - */ - nhe->n = n; - - /* Take a reference to ensure the neighbour and mlx5 encap - * entry won't be destructed until we drop the reference in - * delayed work. - */ - neigh_hold(n); - mlx5e_rep_neigh_entry_hold(nhe); - if (!queue_work(priv->wq, &nhe->neigh_update_work)) { - mlx5e_rep_neigh_entry_release(nhe); - neigh_release(n); - } - spin_unlock_bh(&neigh_update->encap_lock); + mlx5e_rep_queue_neigh_update_work(priv, nhe, n); break; case NETEVENT_DELAY_PROBE_TIME_UPDATE: @@ -897,19 +949,15 @@ static int mlx5e_rep_netevent_event(struct notifier_block *nb, #endif return NOTIFY_DONE; - /* We are in atomic context and can't take RTNL mutex, - * so use spin_lock_bh to walk the neigh list and look for - * the relevant device. bh is used since netevent can be - * called from a softirq context. 
- */ - spin_lock_bh(&neigh_update->encap_lock); - list_for_each_entry(nhe, &neigh_update->neigh_list, neigh_list) { + rcu_read_lock(); + list_for_each_entry_rcu(nhe, &neigh_update->neigh_list, + neigh_list) { if (p->dev == nhe->m_neigh.dev) { found = true; break; } } - spin_unlock_bh(&neigh_update->encap_lock); + rcu_read_unlock(); if (!found) return NOTIFY_DONE; @@ -940,7 +988,7 @@ static int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv) return err; INIT_LIST_HEAD(&neigh_update->neigh_list); - spin_lock_init(&neigh_update->encap_lock); + mutex_init(&neigh_update->encap_lock); INIT_DELAYED_WORK(&neigh_update->neigh_stats_work, mlx5e_rep_neigh_stats_work); mlx5e_rep_neigh_update_init_interval(rpriv); @@ -967,6 +1015,7 @@ static void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv) cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work); + mutex_destroy(&neigh_update->encap_lock); rhashtable_destroy(&neigh_update->neigh_ht); } @@ -982,28 +1031,27 @@ static int mlx5e_rep_neigh_entry_insert(struct mlx5e_priv *priv, if (err) return err; - list_add(&nhe->neigh_list, &rpriv->neigh_update.neigh_list); + list_add_rcu(&nhe->neigh_list, &rpriv->neigh_update.neigh_list); return err; } -static void mlx5e_rep_neigh_entry_remove(struct mlx5e_priv *priv, - struct mlx5e_neigh_hash_entry *nhe) +static void mlx5e_rep_neigh_entry_remove(struct mlx5e_neigh_hash_entry *nhe) { - struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5e_rep_priv *rpriv = nhe->priv->ppriv; - spin_lock_bh(&rpriv->neigh_update.encap_lock); + mutex_lock(&rpriv->neigh_update.encap_lock); - list_del(&nhe->neigh_list); + list_del_rcu(&nhe->neigh_list); rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht, &nhe->rhash_node, mlx5e_neigh_ht_params); - spin_unlock_bh(&rpriv->neigh_update.encap_lock); + mutex_unlock(&rpriv->neigh_update.encap_lock); } -/* This function must only be called under RTNL lock or under the - * representor's encap_lock in case RTNL mutex can't be held. 
+/* This function must only be called under the representor's encap_lock or + * inside rcu read lock section. */ static struct mlx5e_neigh_hash_entry * mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv, @@ -1011,9 +1059,11 @@ mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv, { struct mlx5e_rep_priv *rpriv = priv->ppriv; struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + struct mlx5e_neigh_hash_entry *nhe; - return rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh, - mlx5e_neigh_ht_params); + nhe = rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh, + mlx5e_neigh_ht_params); + return nhe && mlx5e_rep_neigh_entry_hold(nhe) ? nhe : NULL; } static int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv, @@ -1026,8 +1076,10 @@ static int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv, if (!*nhe) return -ENOMEM; + (*nhe)->priv = priv; memcpy(&(*nhe)->m_neigh, &e->m_neigh, sizeof(e->m_neigh)); INIT_WORK(&(*nhe)->neigh_update_work, mlx5e_rep_neigh_update); + spin_lock_init(&(*nhe)->encap_list_lock); INIT_LIST_HEAD(&(*nhe)->encap_list); refcount_set(&(*nhe)->refcnt, 1); @@ -1041,19 +1093,6 @@ out_free: return err; } -static void mlx5e_rep_neigh_entry_destroy(struct mlx5e_priv *priv, - struct mlx5e_neigh_hash_entry *nhe) -{ - /* The neigh hash entry must be removed from the hash table regardless - * of the reference count value, so it won't be found by the next - * neigh notification call. The neigh hash entry reference count is - * incremented only during creation and neigh notification calls and - * protects from freeing the nhe struct. 
- */ - mlx5e_rep_neigh_entry_remove(priv, nhe); - mlx5e_rep_neigh_entry_release(nhe); -} - int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) { @@ -1066,16 +1105,26 @@ int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv, err = mlx5_tun_entropy_refcount_inc(tun_entropy, e->reformat_type); if (err) return err; + + mutex_lock(&rpriv->neigh_update.encap_lock); nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh); if (!nhe) { err = mlx5e_rep_neigh_entry_create(priv, e, &nhe); if (err) { + mutex_unlock(&rpriv->neigh_update.encap_lock); mlx5_tun_entropy_refcount_dec(tun_entropy, e->reformat_type); return err; } } - list_add(&e->encap_list, &nhe->encap_list); + + e->nhe = nhe; + spin_lock(&nhe->encap_list_lock); + list_add_rcu(&e->encap_list, &nhe->encap_list); + spin_unlock(&nhe->encap_list_lock); + + mutex_unlock(&rpriv->neigh_update.encap_lock); + return 0; } @@ -1085,13 +1134,16 @@ void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, struct mlx5e_rep_priv *rpriv = priv->ppriv; struct mlx5_rep_uplink_priv *uplink_priv = &rpriv->uplink_priv; struct mlx5_tun_entropy *tun_entropy = &uplink_priv->tun_entropy; - struct mlx5e_neigh_hash_entry *nhe; - list_del(&e->encap_list); - nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh); + if (!e->nhe) + return; + + spin_lock(&e->nhe->encap_list_lock); + list_del_rcu(&e->encap_list); + spin_unlock(&e->nhe->encap_list_lock); - if (list_empty(&nhe->encap_list)) - mlx5e_rep_neigh_entry_destroy(priv, nhe); + mlx5e_rep_neigh_entry_release(e->nhe); + e->nhe = NULL; mlx5_tun_entropy_refcount_dec(tun_entropy, e->reformat_type); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index 8ac96727cad8..a0ae5069d8c3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -35,6 +35,7 @@ #include <net/ip_tunnels.h> #include <linux/rhashtable.h> +#include <linux/mutex.h> 
#include "eswitch.h" #include "en.h" #include "lib/port_tun.h" @@ -48,7 +49,7 @@ struct mlx5e_neigh_update_table { */ struct list_head neigh_list; /* protect lookup/remove operations */ - spinlock_t encap_lock; + struct mutex encap_lock; struct notifier_block netevent_nb; struct delayed_work neigh_stats_work; unsigned long min_interval; /* jiffies */ @@ -110,6 +111,7 @@ struct mlx5e_neigh { struct mlx5e_neigh_hash_entry { struct rhash_head rhash_node; struct mlx5e_neigh m_neigh; + struct mlx5e_priv *priv; /* Save the neigh hash entry in a list on the representor in * addition to the hash table. In order to iterate easily over the @@ -117,6 +119,8 @@ struct mlx5e_neigh_hash_entry { */ struct list_head neigh_list; + /* protects encap list */ + spinlock_t encap_list_lock; /* encap list sharing the same neigh */ struct list_head encap_list; @@ -137,6 +141,8 @@ struct mlx5e_neigh_hash_entry { * 'used' value and avoid neigh deleting by the kernel. */ unsigned long reported_lastuse; + + struct rcu_head rcu; }; enum { @@ -145,6 +151,8 @@ enum { }; struct mlx5e_encap_entry { + /* attached neigh hash entry */ + struct mlx5e_neigh_hash_entry *nhe; /* neigh hash entry list of encaps sharing the same neigh */ struct list_head encap_list; struct mlx5e_neigh m_neigh; @@ -167,6 +175,7 @@ struct mlx5e_encap_entry { refcount_t refcnt; struct completion res_ready; int compl_result; + struct rcu_head rcu; }; struct mlx5e_rep_sq { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 3917834b48ff..5581a8045ede 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -56,6 +56,7 @@ #include "en/tc_tun.h" #include "lib/devcom.h" #include "lib/geneve.h" +#include "diag/en_tc_tracepoint.h" struct mlx5_nic_flow_attr { u32 action; @@ -126,8 +127,11 @@ struct mlx5e_tc_flow { struct list_head hairpin; /* flows sharing the same hairpin */ struct list_head peer; /* flows 
with peer flow */ struct list_head unready; /* flows not ready to be offloaded (e.g due to missing route) */ + int tmp_efi_index; + struct list_head tmp_list; /* temporary flow list used by neigh update */ refcount_t refcnt; struct rcu_head rcu_head; + struct completion init_done; union { struct mlx5_esw_flow_attr esw_attr[0]; struct mlx5_nic_flow_attr nic_attr[0]; @@ -1290,11 +1294,11 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, } void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e) + struct mlx5e_encap_entry *e, + struct list_head *flow_list) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5_esw_flow_attr slow_attr, *esw_attr; - struct encap_flow_item *efi, *tmp; struct mlx5_flow_handle *rule; struct mlx5_flow_spec *spec; struct mlx5e_tc_flow *flow; @@ -1313,19 +1317,17 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(priv); - list_for_each_entry_safe(efi, tmp, &e->flows, list) { + list_for_each_entry(flow, flow_list, tmp_list) { bool all_flow_encaps_valid = true; int i; - flow = container_of(efi, struct mlx5e_tc_flow, encaps[efi->index]); - if (IS_ERR(mlx5e_flow_get(flow))) + if (!mlx5e_is_offloaded_flow(flow)) continue; - esw_attr = flow->esw_attr; spec = &esw_attr->parse_attr->spec; - esw_attr->dests[efi->index].encap_id = e->encap_id; - esw_attr->dests[efi->index].flags |= MLX5_ESW_DEST_ENCAP_VALID; + esw_attr->dests[flow->tmp_efi_index].encap_id = e->encap_id; + esw_attr->dests[flow->tmp_efi_index].flags |= MLX5_ESW_DEST_ENCAP_VALID; /* Flow can be associated with multiple encap entries. * Before offloading the flow verify that all of them have * a valid neighbour. 
@@ -1340,63 +1342,55 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, } /* Do not offload flows with unresolved neighbors */ if (!all_flow_encaps_valid) - goto loop_cont; + continue; /* update from slow path rule to encap rule */ rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, esw_attr); if (IS_ERR(rule)) { err = PTR_ERR(rule); mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n", err); - goto loop_cont; + continue; } mlx5e_tc_unoffload_from_slow_path(esw, flow, &slow_attr); flow->rule[0] = rule; /* was unset when slow path rule removed */ flow_flag_set(flow, OFFLOADED); - -loop_cont: - mlx5e_flow_put(priv, flow); } } void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e) + struct mlx5e_encap_entry *e, + struct list_head *flow_list) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5_esw_flow_attr slow_attr; - struct encap_flow_item *efi, *tmp; struct mlx5_flow_handle *rule; struct mlx5_flow_spec *spec; struct mlx5e_tc_flow *flow; int err; - list_for_each_entry_safe(efi, tmp, &e->flows, list) { - flow = container_of(efi, struct mlx5e_tc_flow, encaps[efi->index]); - if (IS_ERR(mlx5e_flow_get(flow))) + list_for_each_entry(flow, flow_list, tmp_list) { + if (!mlx5e_is_offloaded_flow(flow)) continue; - spec = &flow->esw_attr->parse_attr->spec; /* update from encap rule to slow path rule */ rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec, &slow_attr); /* mark the flow's encap dest as non-valid */ - flow->esw_attr->dests[efi->index].flags &= ~MLX5_ESW_DEST_ENCAP_VALID; + flow->esw_attr->dests[flow->tmp_efi_index].flags &= ~MLX5_ESW_DEST_ENCAP_VALID; if (IS_ERR(rule)) { err = PTR_ERR(rule); mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n", err); - goto loop_cont; + continue; } mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->esw_attr); flow->rule[0] = rule; /* was unset when fast path rule removed */ flow_flag_set(flow, OFFLOADED); - -loop_cont: - 
mlx5e_flow_put(priv, flow); } /* we know that the encap is valid */ @@ -1412,11 +1406,84 @@ static struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow) return flow->nic_attr->counter; } +/* Takes reference to all flows attached to encap and adds the flows to + * flow_list using 'tmp_list' list_head in mlx5e_tc_flow. + */ +void mlx5e_take_all_encap_flows(struct mlx5e_encap_entry *e, struct list_head *flow_list) +{ + struct encap_flow_item *efi; + struct mlx5e_tc_flow *flow; + + list_for_each_entry(efi, &e->flows, list) { + flow = container_of(efi, struct mlx5e_tc_flow, encaps[efi->index]); + if (IS_ERR(mlx5e_flow_get(flow))) + continue; + wait_for_completion(&flow->init_done); + + flow->tmp_efi_index = efi->index; + list_add(&flow->tmp_list, flow_list); + } +} + +/* Iterate over tmp_list of flows attached to flow_list head. */ +void mlx5e_put_encap_flow_list(struct mlx5e_priv *priv, struct list_head *flow_list) +{ + struct mlx5e_tc_flow *flow, *tmp; + + list_for_each_entry_safe(flow, tmp, flow_list, tmp_list) + mlx5e_flow_put(priv, flow); +} + +static struct mlx5e_encap_entry * +mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe, + struct mlx5e_encap_entry *e) +{ + struct mlx5e_encap_entry *next = NULL; + +retry: + rcu_read_lock(); + + /* find encap with non-zero reference counter value */ + for (next = e ? 
+ list_next_or_null_rcu(&nhe->encap_list, + &e->encap_list, + struct mlx5e_encap_entry, + encap_list) : + list_first_or_null_rcu(&nhe->encap_list, + struct mlx5e_encap_entry, + encap_list); + next; + next = list_next_or_null_rcu(&nhe->encap_list, + &next->encap_list, + struct mlx5e_encap_entry, + encap_list)) + if (mlx5e_encap_take(next)) + break; + + rcu_read_unlock(); + + /* release starting encap */ + if (e) + mlx5e_encap_put(netdev_priv(e->out_dev), e); + if (!next) + return next; + + /* wait for encap to be fully initialized */ + wait_for_completion(&next->res_ready); + /* continue searching if encap entry is not in valid state after completion */ + if (!(next->flags & MLX5_ENCAP_ENTRY_VALID)) { + e = next; + goto retry; + } + + return next; +} + void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) { struct mlx5e_neigh *m_neigh = &nhe->m_neigh; + struct mlx5e_encap_entry *e = NULL; struct mlx5e_tc_flow *flow; - struct mlx5e_encap_entry *e; struct mlx5_fc *counter; struct neigh_table *tbl; bool neigh_used = false; @@ -1432,37 +1499,45 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) else return; - list_for_each_entry(e, &nhe->encap_list, encap_list) { + /* mlx5e_get_next_valid_encap() releases previous encap before returning + * next one. 
+ */ + while ((e = mlx5e_get_next_valid_encap(nhe, e)) != NULL) { + struct mlx5e_priv *priv = netdev_priv(e->out_dev); struct encap_flow_item *efi, *tmp; + struct mlx5_eswitch *esw; + LIST_HEAD(flow_list); - if (!(e->flags & MLX5_ENCAP_ENTRY_VALID) || - !mlx5e_encap_take(e)) - continue; - + esw = priv->mdev->priv.eswitch; + mutex_lock(&esw->offloads.encap_tbl_lock); list_for_each_entry_safe(efi, tmp, &e->flows, list) { flow = container_of(efi, struct mlx5e_tc_flow, encaps[efi->index]); if (IS_ERR(mlx5e_flow_get(flow))) continue; + list_add(&flow->tmp_list, &flow_list); if (mlx5e_is_offloaded_flow(flow)) { counter = mlx5e_tc_get_counter(flow); lastuse = mlx5_fc_query_lastuse(counter); if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) { - mlx5e_flow_put(netdev_priv(e->out_dev), flow); neigh_used = true; break; } } - - mlx5e_flow_put(netdev_priv(e->out_dev), flow); } + mutex_unlock(&esw->offloads.encap_tbl_lock); - mlx5e_encap_put(netdev_priv(e->out_dev), e); - if (neigh_used) + mlx5e_put_encap_flow_list(priv, &flow_list); + if (neigh_used) { + /* release current encap before breaking the loop */ + mlx5e_encap_put(priv, e); break; + } } + trace_mlx5e_tc_update_neigh_used_value(nhe, neigh_used); + if (neigh_used) { nhe->reported_lastuse = jiffies; @@ -1490,7 +1565,7 @@ static void mlx5e_encap_dealloc(struct mlx5e_priv *priv, struct mlx5e_encap_entr } kfree(e->encap_header); - kfree(e); + kfree_rcu(e, rcu); } void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e) @@ -3426,6 +3501,7 @@ mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size, INIT_LIST_HEAD(&flow->mod_hdr); INIT_LIST_HEAD(&flow->hairpin); refcount_set(&flow->refcnt, 1); + init_completion(&flow->init_done); *__flow = flow; *__parse_attr = parse_attr; @@ -3498,6 +3574,7 @@ __mlx5e_add_fdb_flow(struct mlx5e_priv *priv, goto err_free; err = mlx5e_tc_add_fdb_flow(priv, flow, extack); + complete_all(&flow->init_done); if (err) { if (!(err == -ENETUNREACH && 
mlx5_lag_is_multipath(in_mdev))) goto err_free; @@ -3695,6 +3772,7 @@ int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv, goto out; } + trace_mlx5e_configure_flower(f); err = mlx5e_tc_add_flow(priv, f, flags, dev, &flow); if (err) goto out; @@ -3744,6 +3822,7 @@ int mlx5e_delete_flower(struct net_device *dev, struct mlx5e_priv *priv, rhashtable_remove_fast(tc_ht, &flow->node, tc_ht_params); rcu_read_unlock(); + trace_mlx5e_delete_flower(f); mlx5e_flow_put(priv, flow); return 0; @@ -3813,6 +3892,7 @@ no_peer_counter: mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS); out: flow_stats_update(&f->stats, bytes, packets, lastuse); + trace_mlx5e_stats_flower(f); errout: mlx5e_flow_put(priv, flow); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index ea2072e2fe84..924c6ef86a14 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -72,12 +72,17 @@ void mlx5e_tc_stats_matchall(struct mlx5e_priv *priv, struct mlx5e_encap_entry; void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e); + struct mlx5e_encap_entry *e, + struct list_head *flow_list); void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, - struct mlx5e_encap_entry *e); + struct mlx5e_encap_entry *e, + struct list_head *flow_list); bool mlx5e_encap_take(struct mlx5e_encap_entry *e); void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e); +void mlx5e_take_all_encap_flows(struct mlx5e_encap_entry *e, struct list_head *flow_list); +void mlx5e_put_encap_flow_list(struct mlx5e_priv *priv, struct list_head *flow_list); + struct mlx5e_neigh_hash_entry; void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe); diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index e8069b6c474c..757fa84de654 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h 
@@ -138,6 +138,7 @@ enum flow_action_id { FLOW_ACTION_MPLS_PUSH, FLOW_ACTION_MPLS_POP, FLOW_ACTION_MPLS_MANGLE, + NUM_FLOW_ACTIONS, }; /* This is mirroring enum pedit_header_type definition for easy mapping between |