-rw-r--r--Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst23
-rw-r--r--Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst14
-rw-r--r--Documentation/networking/mptcp-sysctl.rst8
-rw-r--r--Documentation/networking/xfrm_device.rst1
-rw-r--r--drivers/net/bonding/bond_main.c1
-rw-r--r--drivers/net/dsa/microchip/ksz8.h2
-rw-r--r--drivers/net/dsa/microchip/ksz9477.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c21
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_stats.c22
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_stats.h13
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c16
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c20
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/health.c36
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c3
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c14
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c115
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c2
-rw-r--r--drivers/net/ethernet/microchip/sparx5/sparx5_main.h1
-rw-r--r--drivers/net/ethernet/microchip/vcap/vcap_api.h3
-rw-r--r--drivers/net/ethernet/microchip/vcap/vcap_api_client.h3
-rw-r--r--drivers/net/ethernet/mscc/ocelot.h2
-rw-r--r--drivers/net/ethernet/mscc/ocelot_vcap.h1
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic.h1
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_dev.h1
-rw-r--r--drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h1
-rw-r--r--drivers/net/ethernet/ti/am65-cpsw-nuss.h1
-rw-r--r--drivers/net/ethernet/ti/netcp.h2
-rw-r--r--drivers/net/vrf.c2
-rw-r--r--drivers/net/vxlan/vxlan_vnifilter.c2
-rw-r--r--include/net/mptcp.h21
-rw-r--r--include/soc/mscc/ocelot.h1
-rw-r--r--net/mptcp/Makefile2
-rw-r--r--net/mptcp/ctrl.c14
-rw-r--r--net/mptcp/pm.c9
-rw-r--r--net/mptcp/pm_netlink.c3
-rw-r--r--net/mptcp/protocol.c277
-rw-r--r--net/mptcp/protocol.h18
-rw-r--r--net/mptcp/sched.c173
40 files changed, 603 insertions, 251 deletions
diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst
index 008e560e12b5..f69ee1ebee01 100644
--- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst
+++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst
@@ -346,6 +346,24 @@ the software port.
- The number of receive packets with CQE compression on ring i [#accel]_.
- Acceleration
+ * - `rx[i]_arfs_add`
+ - The number of aRFS flow rules added to the device for direct RQ steering
+ on ring i [#accel]_.
+ - Acceleration
+
+ * - `rx[i]_arfs_request_in`
+ - Number of flow rules that have been requested to move into ring i for
+ direct RQ steering [#accel]_.
+ - Acceleration
+
+ * - `rx[i]_arfs_request_out`
+ - Number of flow rules that have been requested to move out of ring i [#accel]_.
+ - Acceleration
+
+ * - `rx[i]_arfs_expired`
+ - Number of flow rules that have been expired and removed [#accel]_.
+ - Acceleration
+
* - `rx[i]_arfs_err`
- Number of flow rules that failed to be added to the flow table.
- Error
@@ -445,11 +463,6 @@ the software port.
context.
- Error
- * - `rx[i]_xsk_arfs_err`
- - aRFS (accelerated Receive Flow Steering) does not occur in the XSK RQ
- context, so this counter should never increment.
- - Error
-
* - `rx[i]_xdp_tx_xmit`
- The number of packets forwarded back to the port due to XDP program
`XDP_TX` action (bouncing). These packets are not counted by other
diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst
index 43b1f7e87ec4..0a42c3395ffa 100644
--- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst
+++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst
@@ -36,7 +36,7 @@ Enabling the driver and kconfig options
**CONFIG_MLX5_CORE_EN_DCB=(y/n)**:
-| Enables `Data Center Bridging (DCB) Support <https://community.mellanox.com/s/article/howto-auto-config-pfc-and-ets-on-connectx-4-via-lldp-dcbx>`_.
+| Enables `Data Center Bridging (DCB) Support <https://enterprise-support.nvidia.com/s/article/howto-auto-config-pfc-and-ets-on-connectx-4-via-lldp-dcbx>`_.
**CONFIG_MLX5_CORE_IPOIB=(y/n)**
@@ -59,12 +59,12 @@ Enabling the driver and kconfig options
**CONFIG_MLX5_EN_ARFS=(y/n)**
| Enables Hardware-accelerated receive flow steering (arfs) support, and ntuple filtering.
-| https://community.mellanox.com/s/article/howto-configure-arfs-on-connectx-4
+| https://enterprise-support.nvidia.com/s/article/howto-configure-arfs-on-connectx-4
**CONFIG_MLX5_EN_IPSEC=(y/n)**
-| Enables `IPSec XFRM cryptography-offload acceleration <https://support.mellanox.com/s/article/ConnectX-6DX-Bluefield-2-IPsec-HW-Full-Offload-Configuration-Guide>`_.
+| Enables :ref:`IPSec XFRM cryptography-offload acceleration <xfrm_device>`.
**CONFIG_MLX5_EN_MACSEC=(y/n)**
@@ -87,8 +87,8 @@ Enabling the driver and kconfig options
| Ethernet SRIOV E-Switch support in ConnectX NIC. E-Switch provides internal SRIOV packet steering
| and switching for the enabled VFs and PF in two available modes:
-| 1) `Legacy SRIOV mode (L2 mac vlan steering based) <https://community.mellanox.com/s/article/howto-configure-sr-iov-for-connectx-4-connectx-5-with-kvm--ethernet-x>`_.
-| 2) `Switchdev mode (eswitch offloads) <https://www.mellanox.com/related-docs/prod_software/ASAP2_Hardware_Offloading_for_vSwitches_User_Manual_v4.4.pdf>`_.
+| 1) `Legacy SRIOV mode (L2 mac vlan steering based) <https://enterprise-support.nvidia.com/s/article/HowTo-Configure-SR-IOV-for-ConnectX-4-ConnectX-5-ConnectX-6-with-KVM-Ethernet>`_.
+| 2) :ref:`Switchdev mode (eswitch offloads) <switchdev>`.
**CONFIG_MLX5_FPGA=(y/n)**
@@ -101,13 +101,13 @@ Enabling the driver and kconfig options
**CONFIG_MLX5_INFINIBAND=(y/n/m)** (module mlx5_ib.ko)
-| Provides low-level InfiniBand/RDMA and `RoCE <https://community.mellanox.com/s/article/recommended-network-configuration-examples-for-roce-deployment>`_ support.
+| Provides low-level InfiniBand/RDMA and `RoCE <https://enterprise-support.nvidia.com/s/article/recommended-network-configuration-examples-for-roce-deployment>`_ support.
**CONFIG_MLX5_MPFS=(y/n)**
| Ethernet Multi-Physical Function Switch (MPFS) support in ConnectX NIC.
-| MPFs is required for when `Multi-Host <http://www.mellanox.com/page/multihost>`_ configuration is enabled to allow passing
+| MPFS is required when `Multi-Host <https://www.nvidia.com/en-us/networking/multi-host/>`_ configuration is enabled to allow passing
| user configured unicast MAC addresses to the requesting PF.
diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst
index 213510698014..15f1919d640c 100644
--- a/Documentation/networking/mptcp-sysctl.rst
+++ b/Documentation/networking/mptcp-sysctl.rst
@@ -74,3 +74,11 @@ stale_loss_cnt - INTEGER
This is a per-namespace sysctl.
Default: 4
+
+scheduler - STRING
+ Select the scheduler of your choice.
+
+ Allows selecting among the available packet schedulers. This is a
+ per-namespace sysctl.
+
+ Default: "default"
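
A minimal user-space sketch (not part of this patch) of driving the new knob;
it assumes only the sysctl file registered by net/mptcp/ctrl.c later in this
patch, exposed as /proc/sys/net/mptcp/scheduler, plus POSIX I/O:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Any name known to the MPTCP scheduler registry works here;
	 * "default" is the built-in scheduler added by this series.
	 */
	const char *name = "default";
	ssize_t want = (ssize_t)strlen(name);
	int fd = open("/proc/sys/net/mptcp/scheduler", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, name, want) != want) {
		perror("write");
		close(fd);
		return 1;
	}
	return close(fd) ? 1 : 0;
}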
diff --git a/Documentation/networking/xfrm_device.rst b/Documentation/networking/xfrm_device.rst
index 83abdfef4ec3..535077cbeb07 100644
--- a/Documentation/networking/xfrm_device.rst
+++ b/Documentation/networking/xfrm_device.rst
@@ -1,4 +1,5 @@
.. SPDX-License-Identifier: GPL-2.0
+.. _xfrm_device:
===============================================
XFRM device - offloading the IPsec computations
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f398bec78457..ed7212e61c54 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -5718,6 +5718,7 @@ static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev,
*/
bond_for_each_slave(bond, slave, iter) {
if (bond_slave_can_tx(slave)) {
+ bond_update_speed_duplex(slave);
if (slave->speed != SPEED_UNKNOWN) {
if (BOND_MODE(bond) == BOND_MODE_BROADCAST)
speed = bond_mode_bcast_speed(slave,
diff --git a/drivers/net/dsa/microchip/ksz8.h b/drivers/net/dsa/microchip/ksz8.h
index e68465fdf6b9..4cea811e73ac 100644
--- a/drivers/net/dsa/microchip/ksz8.h
+++ b/drivers/net/dsa/microchip/ksz8.h
@@ -48,13 +48,11 @@ int ksz8_port_mirror_add(struct ksz_device *dev, int port,
bool ingress, struct netlink_ext_ack *extack);
void ksz8_port_mirror_del(struct ksz_device *dev, int port,
struct dsa_mall_mirror_tc_entry *mirror);
-int ksz8_get_stp_reg(void);
void ksz8_get_caps(struct ksz_device *dev, int port,
struct phylink_config *config);
void ksz8_config_cpu_port(struct dsa_switch *ds);
int ksz8_enable_stp_addr(struct ksz_device *dev);
int ksz8_reset_switch(struct ksz_device *dev);
-int ksz8_switch_detect(struct ksz_device *dev);
int ksz8_switch_init(struct ksz_device *dev);
void ksz8_switch_exit(struct ksz_device *dev);
int ksz8_change_mtu(struct ksz_device *dev, int port, int mtu);
diff --git a/drivers/net/dsa/microchip/ksz9477.h b/drivers/net/dsa/microchip/ksz9477.h
index b6f7e3c46e3f..a6f425866a29 100644
--- a/drivers/net/dsa/microchip/ksz9477.h
+++ b/drivers/net/dsa/microchip/ksz9477.h
@@ -36,7 +36,6 @@ int ksz9477_port_mirror_add(struct ksz_device *dev, int port,
bool ingress, struct netlink_ext_ack *extack);
void ksz9477_port_mirror_del(struct ksz_device *dev, int port,
struct dsa_mall_mirror_tc_entry *mirror);
-int ksz9477_get_stp_reg(void);
void ksz9477_get_caps(struct ksz_device *dev, int port,
struct phylink_config *config);
int ksz9477_fdb_dump(struct ksz_device *dev, int port,
@@ -54,7 +53,6 @@ void ksz9477_config_cpu_port(struct dsa_switch *ds);
int ksz9477_tc_cbs_set_cinc(struct ksz_device *dev, int port, u32 val);
int ksz9477_enable_stp_addr(struct ksz_device *dev);
int ksz9477_reset_switch(struct ksz_device *dev);
-int ksz9477_dsa_init(struct ksz_device *dev);
int ksz9477_switch_init(struct ksz_device *dev);
void ksz9477_switch_exit(struct ksz_device *dev);
void ksz9477_port_queue_split(struct ksz_device *dev, int port);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index 5aa51d74f8b4..bb7f86c993e5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -432,8 +432,10 @@ static void arfs_may_expire_flow(struct mlx5e_priv *priv)
}
spin_unlock_bh(&arfs->arfs_lock);
hlist_for_each_entry_safe(arfs_rule, htmp, &del_list, hlist) {
- if (arfs_rule->rule)
+ if (arfs_rule->rule) {
mlx5_del_flow_rules(arfs_rule->rule);
+ priv->channel_stats[arfs_rule->rxq]->rq.arfs_expired++;
+ }
hlist_del(&arfs_rule->hlist);
kfree(arfs_rule);
}
@@ -509,6 +511,7 @@ static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv *priv,
spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
if (!spec) {
+ priv->channel_stats[arfs_rule->rxq]->rq.arfs_err++;
err = -ENOMEM;
goto out;
}
@@ -519,6 +522,8 @@ static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv *priv,
ntohs(tuple->etype));
arfs_table = arfs_get_table(arfs, tuple->ip_proto, tuple->etype);
if (!arfs_table) {
+ WARN_ONCE(1, "arfs table does not exist for etype %u and ip_proto %u\n",
+ tuple->etype, tuple->ip_proto);
err = -EINVAL;
goto out;
}
@@ -600,9 +605,11 @@ static void arfs_modify_rule_rq(struct mlx5e_priv *priv,
dst.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
dst.tir_num = mlx5e_rx_res_get_tirn_direct(priv->rx_res, rxq);
err = mlx5_modify_rule_destination(rule, &dst, NULL);
- if (err)
+ if (err) {
+ priv->channel_stats[rxq]->rq.arfs_err++;
netdev_warn(priv->netdev,
"Failed to modify aRFS rule destination to rq=%d\n", rxq);
+ }
}
static void arfs_handle_work(struct work_struct *work)
@@ -632,6 +639,7 @@ static void arfs_handle_work(struct work_struct *work)
if (IS_ERR(rule))
goto out;
arfs_rule->rule = rule;
+ priv->channel_stats[arfs_rule->rxq]->rq.arfs_add++;
} else {
arfs_modify_rule_rq(priv, arfs_rule->rule,
arfs_rule->rxq);
@@ -650,8 +658,10 @@ static struct arfs_rule *arfs_alloc_rule(struct mlx5e_priv *priv,
struct arfs_tuple *tuple;
rule = kzalloc(sizeof(*rule), GFP_ATOMIC);
- if (!rule)
+ if (!rule) {
+ priv->channel_stats[rxq]->rq.arfs_err++;
return NULL;
+ }
rule->priv = priv;
rule->rxq = rxq;
@@ -740,10 +750,13 @@ int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
spin_lock_bh(&arfs->arfs_lock);
arfs_rule = arfs_find_rule(arfs_t, &fk);
if (arfs_rule) {
- if (arfs_rule->rxq == rxq_index) {
+ if (arfs_rule->rxq == rxq_index || work_busy(&arfs_rule->arfs_work)) {
spin_unlock_bh(&arfs->arfs_lock);
return arfs_rule->filter_id;
}
+
+ priv->channel_stats[rxq_index]->rq.arfs_request_in++;
+ priv->channel_stats[arfs_rule->rxq]->rq.arfs_request_out++;
arfs_rule->rxq = rxq_index;
} else {
arfs_rule = arfs_alloc_rule(priv, arfs_t, &fk, rxq_index, flow_id);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 52141729444e..348fb140858c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -180,7 +180,13 @@ static const struct counter_desc sw_stats_desc[] = {
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_blks) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_pkts) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_congst_umr) },
+#ifdef CONFIG_MLX5_EN_ARFS
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_add) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_request_in) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_request_out) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_expired) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_err) },
+#endif
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_recover) },
#ifdef CONFIG_PAGE_POOL_STATS
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_pp_alloc_fast) },
@@ -231,7 +237,6 @@ static const struct counter_desc sw_stats_desc[] = {
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_cqe_compress_blks) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_cqe_compress_pkts) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_congst_umr) },
- { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_arfs_err) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_xmit) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_mpwqe) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_inlnw) },
@@ -321,7 +326,6 @@ static void mlx5e_stats_grp_sw_update_stats_xskrq(struct mlx5e_sw_stats *s,
s->rx_xsk_cqe_compress_blks += xskrq_stats->cqe_compress_blks;
s->rx_xsk_cqe_compress_pkts += xskrq_stats->cqe_compress_pkts;
s->rx_xsk_congst_umr += xskrq_stats->congst_umr;
- s->rx_xsk_arfs_err += xskrq_stats->arfs_err;
}
static void mlx5e_stats_grp_sw_update_stats_rq_stats(struct mlx5e_sw_stats *s,
@@ -354,7 +358,13 @@ static void mlx5e_stats_grp_sw_update_stats_rq_stats(struct mlx5e_sw_stats *s,
s->rx_cqe_compress_blks += rq_stats->cqe_compress_blks;
s->rx_cqe_compress_pkts += rq_stats->cqe_compress_pkts;
s->rx_congst_umr += rq_stats->congst_umr;
+#ifdef CONFIG_MLX5_EN_ARFS
+ s->rx_arfs_add += rq_stats->arfs_add;
+ s->rx_arfs_request_in += rq_stats->arfs_request_in;
+ s->rx_arfs_request_out += rq_stats->arfs_request_out;
+ s->rx_arfs_expired += rq_stats->arfs_expired;
s->rx_arfs_err += rq_stats->arfs_err;
+#endif
s->rx_recover += rq_stats->recover;
#ifdef CONFIG_PAGE_POOL_STATS
s->rx_pp_alloc_fast += rq_stats->pp_alloc_fast;
@@ -1990,7 +2000,13 @@ static const struct counter_desc rq_stats_desc[] = {
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_blks) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, congst_umr) },
+#ifdef CONFIG_MLX5_EN_ARFS
+ { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_add) },
+ { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_request_in) },
+ { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_request_out) },
+ { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_expired) },
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_err) },
+#endif
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, recover) },
#ifdef CONFIG_PAGE_POOL_STATS
{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, pp_alloc_fast) },
@@ -2092,7 +2108,6 @@ static const struct counter_desc xskrq_stats_desc[] = {
{ MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, cqe_compress_blks) },
{ MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) },
{ MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, congst_umr) },
- { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, arfs_err) },
};
static const struct counter_desc xsksq_stats_desc[] = {
@@ -2168,7 +2183,6 @@ static const struct counter_desc ptp_rq_stats_desc[] = {
{ MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, cqe_compress_blks) },
{ MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) },
{ MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, congst_umr) },
- { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, arfs_err) },
{ MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, recover) },
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 409e9a47e433..176fa5976259 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -194,7 +194,13 @@ struct mlx5e_sw_stats {
u64 rx_cqe_compress_blks;
u64 rx_cqe_compress_pkts;
u64 rx_congst_umr;
+#ifdef CONFIG_MLX5_EN_ARFS
+ u64 rx_arfs_add;
+ u64 rx_arfs_request_in;
+ u64 rx_arfs_request_out;
+ u64 rx_arfs_expired;
u64 rx_arfs_err;
+#endif
u64 rx_recover;
u64 ch_events;
u64 ch_poll;
@@ -256,7 +262,6 @@ struct mlx5e_sw_stats {
u64 rx_xsk_cqe_compress_blks;
u64 rx_xsk_cqe_compress_pkts;
u64 rx_xsk_congst_umr;
- u64 rx_xsk_arfs_err;
u64 tx_xsk_xmit;
u64 tx_xsk_mpwqe;
u64 tx_xsk_inlnw;
@@ -358,7 +363,13 @@ struct mlx5e_rq_stats {
u64 cqe_compress_blks;
u64 cqe_compress_pkts;
u64 congst_umr;
+#ifdef CONFIG_MLX5_EN_ARFS
+ u64 arfs_add;
+ u64 arfs_request_in;
+ u64 arfs_request_out;
+ u64 arfs_expired;
u64 arfs_err;
+#endif
u64 recover;
#ifdef CONFIG_PAGE_POOL_STATS
u64 pp_alloc_fast;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
index 0313432d50a1..234fd4c28e79 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c
@@ -16,8 +16,7 @@ mlx5_esw_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_i
static bool mlx5_esw_devlink_port_supported(struct mlx5_eswitch *esw, u16 vport_num)
{
- return vport_num == MLX5_VPORT_UPLINK ||
- (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF) ||
+ return (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF) ||
mlx5_eswitch_is_vf_vport(esw, vport_num) ||
mlx5_core_is_ec_vf_vport(esw->dev, vport_num);
}
@@ -25,7 +24,6 @@ static bool mlx5_esw_devlink_port_supported(struct mlx5_eswitch *esw, u16 vport_
static struct devlink_port *mlx5_esw_dl_port_alloc(struct mlx5_eswitch *esw, u16 vport_num)
{
struct mlx5_core_dev *dev = esw->dev;
- struct devlink_port_attrs attrs = {};
struct netdev_phys_item_id ppid = {};
struct devlink_port *dl_port;
u32 controller_num = 0;
@@ -42,13 +40,7 @@ static struct devlink_port *mlx5_esw_dl_port_alloc(struct mlx5_eswitch *esw, u16
if (external)
controller_num = dev->priv.eswitch->offloads.host_number + 1;
- if (vport_num == MLX5_VPORT_UPLINK) {
- attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
- attrs.phys.port_number = pfnum;
- memcpy(attrs.switch_id.id, ppid.id, ppid.id_len);
- attrs.switch_id.id_len = ppid.id_len;
- devlink_port_attrs_set(dl_port, &attrs);
- } else if (vport_num == MLX5_VPORT_PF) {
+ if (vport_num == MLX5_VPORT_PF) {
memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len);
dl_port->attrs.switch_id.id_len = ppid.id_len;
devlink_port_attrs_pci_pf_set(dl_port, controller_num, pfnum, external);
@@ -71,7 +63,7 @@ static void mlx5_esw_dl_port_free(struct devlink_port *dl_port)
kfree(dl_port);
}
-static const struct devlink_port_ops mlx5_esw_dl_port_ops = {
+static const struct devlink_port_ops mlx5_esw_pf_vf_dl_port_ops = {
.port_fn_hw_addr_get = mlx5_devlink_port_fn_hw_addr_get,
.port_fn_hw_addr_set = mlx5_devlink_port_fn_hw_addr_set,
.port_fn_roce_get = mlx5_devlink_port_fn_roce_get,
@@ -103,7 +95,7 @@ int mlx5_esw_offloads_devlink_port_register(struct mlx5_eswitch *esw, u16 vport_
devlink = priv_to_devlink(dev);
dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, vport_num);
err = devl_port_register_with_ops(devlink, dl_port, dl_port_index,
- &mlx5_esw_dl_port_ops);
+ &mlx5_esw_pf_vf_dl_port_ops);
if (err)
goto reg_err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c
index 455746952260..095f31f380fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c
@@ -316,7 +316,7 @@ void mlx5_esw_ipsec_restore_dest_uplink(struct mlx5_core_dev *mdev)
err = mlx5_esw_ipsec_modify_flow_dests(esw, flow);
if (err)
mlx5_core_warn_once(mdev,
- "Faided to modify flow dests for IPsec");
+ "Failed to modify flow dests for IPsec");
}
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 46b8c60ac39a..c0b1b7b66cff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2542,11 +2542,9 @@ int mlx5_esw_offloads_load_rep(struct mlx5_eswitch *esw, u16 vport_num)
if (esw->mode != MLX5_ESWITCH_OFFLOADS)
return 0;
- if (vport_num != MLX5_VPORT_UPLINK) {
- err = mlx5_esw_offloads_devlink_port_register(esw, vport_num);
- if (err)
- return err;
- }
+ err = mlx5_esw_offloads_devlink_port_register(esw, vport_num);
+ if (err)
+ return err;
err = mlx5_esw_offloads_rep_load(esw, vport_num);
if (err)
@@ -2554,8 +2552,7 @@ int mlx5_esw_offloads_load_rep(struct mlx5_eswitch *esw, u16 vport_num)
return err;
load_err:
- if (vport_num != MLX5_VPORT_UPLINK)
- mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
+ mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
return err;
}
@@ -2566,8 +2563,7 @@ void mlx5_esw_offloads_unload_rep(struct mlx5_eswitch *esw, u16 vport_num)
mlx5_esw_offloads_rep_unload(esw, vport_num);
- if (vport_num != MLX5_VPORT_UPLINK)
- mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
+ mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
}
static int esw_set_slave_root_fdb(struct mlx5_core_dev *master,
@@ -3471,7 +3467,7 @@ int esw_offloads_enable(struct mlx5_eswitch *esw)
vport->info.link_state = MLX5_VPORT_ADMIN_STATE_DOWN;
/* Uplink vport rep must load first. */
- err = mlx5_esw_offloads_load_rep(esw, MLX5_VPORT_UPLINK);
+ err = mlx5_esw_offloads_rep_load(esw, MLX5_VPORT_UPLINK);
if (err)
goto err_uplink;
@@ -3482,7 +3478,7 @@ int esw_offloads_enable(struct mlx5_eswitch *esw)
return 0;
err_vports:
- mlx5_esw_offloads_unload_rep(esw, MLX5_VPORT_UPLINK);
+ mlx5_esw_offloads_rep_unload(esw, MLX5_VPORT_UPLINK);
err_uplink:
esw_offloads_steering_cleanup(esw);
err_steering_init:
@@ -3520,7 +3516,7 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw,
void esw_offloads_disable(struct mlx5_eswitch *esw)
{
mlx5_eswitch_disable_pf_vf_vports(esw);
- mlx5_esw_offloads_unload_rep(esw, MLX5_VPORT_UPLINK);
+ mlx5_esw_offloads_rep_unload(esw, MLX5_VPORT_UPLINK);
esw_set_passing_vport_metadata(esw, false);
esw_offloads_steering_cleanup(esw);
mapping_destroy(esw->offloads.reg_c0_obj_pool);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 187cb2c464f8..2fb2598b775e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -50,20 +50,6 @@ enum {
};
enum {
- MLX5_HEALTH_SYNDR_FW_ERR = 0x1,
- MLX5_HEALTH_SYNDR_IRISC_ERR = 0x7,
- MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR = 0x8,
- MLX5_HEALTH_SYNDR_CRC_ERR = 0x9,
- MLX5_HEALTH_SYNDR_FETCH_PCI_ERR = 0xa,
- MLX5_HEALTH_SYNDR_HW_FTL_ERR = 0xb,
- MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR = 0xc,
- MLX5_HEALTH_SYNDR_EQ_ERR = 0xd,
- MLX5_HEALTH_SYNDR_EQ_INV = 0xe,
- MLX5_HEALTH_SYNDR_FFSER_ERR = 0xf,
- MLX5_HEALTH_SYNDR_HIGH_TEMP = 0x10
-};
-
-enum {
MLX5_DROP_HEALTH_WORK,
};
@@ -357,27 +343,27 @@ static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
static const char *hsynd_str(u8 synd)
{
switch (synd) {
- case MLX5_HEALTH_SYNDR_FW_ERR:
+ case MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_INTERNAL_ERR:
return "firmware internal error";
- case MLX5_HEALTH_SYNDR_IRISC_ERR:
+ case MLX5_INITIAL_SEG_HEALTH_SYNDROME_DEAD_IRISC:
return "irisc not responding";
- case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
+ case MLX5_INITIAL_SEG_HEALTH_SYNDROME_HW_FATAL_ERR:
return "unrecoverable hardware error";
- case MLX5_HEALTH_SYNDR_CRC_ERR:
+ case MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_CRC_ERR:
return "firmware CRC error";
- case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
+ case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_FETCH_PCI_ERR:
return "ICM fetch PCI error";
- case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
+ case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PAGE_ERR:
return "HW fatal error\n";
- case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
+ case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ASYNCHRONOUS_EQ_BUF_OVERRUN:
return "async EQ buffer overrun";
- case MLX5_HEALTH_SYNDR_EQ_ERR:
+ case MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_IN_ERR:
return "EQ error";
- case MLX5_HEALTH_SYNDR_EQ_INV:
+ case MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_INV:
return "Invalid EQ referenced";
- case MLX5_HEALTH_SYNDR_FFSER_ERR:
+ case MLX5_INITIAL_SEG_HEALTH_SYNDROME_FFSER_ERR:
return "FFSER error";
- case MLX5_HEALTH_SYNDR_HIGH_TEMP:
+ case MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR:
return "High temperature";
default:
return "unrecognized error";
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
index feb62d952643..00e67910e3ee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
@@ -256,14 +256,15 @@ int mlx5_devcom_send_event(struct mlx5_devcom_comp_dev *devcom,
int event, int rollback_event,
void *event_data)
{
- struct mlx5_devcom_comp *comp = devcom->comp;
struct mlx5_devcom_comp_dev *pos;
+ struct mlx5_devcom_comp *comp;
int err = 0;
void *data;
if (IS_ERR_OR_NULL(devcom))
return -ENODEV;
+ comp = devcom->comp;
down_write(&comp->sem);
list_for_each_entry(pos, &comp->comp_dev_list_head, list) {
data = rcu_dereference_protected(pos->data, lockdep_is_held(&comp->sem));
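
The devcom fix above is an ordering bug in miniature: the old code loaded
devcom->comp in the declaration, before IS_ERR_OR_NULL(devcom) had a chance
to reject a bad handle. A reduced stand-alone illustration with hypothetical
types (not mlx5 code):

#include <stddef.h>

struct comp { int refcount; };
struct handle { struct comp *comp; };

/* Buggy shape: the field is loaded before the guard runs, so a NULL
 * handle is dereferenced even though the check exists.
 */
static int use_buggy(struct handle *h)
{
	struct comp *comp = h->comp;	/* dereferences h before validation */

	if (h == NULL)
		return -1;		/* too late */
	return comp->refcount;
}

/* Fixed shape, mirroring the patch: validate first, then load the field. */
static int use_fixed(struct handle *h)
{
	struct comp *comp;

	if (h == NULL)
		return -1;
	comp = h->comp;
	return comp->refcount;
}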
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 33a133c9918c..653648216730 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -259,8 +259,11 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
int err;
irq = kzalloc(sizeof(*irq), GFP_KERNEL);
- if (!irq)
+ if (!irq || !zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) {
+ kfree(irq);
return ERR_PTR(-ENOMEM);
+ }
+
if (!i || !pci_msix_can_alloc_dyn(dev->pdev)) {
/* The vector at index 0 is always statically allocated. If
* dynamic irq is not supported all vectors are statically
@@ -297,11 +300,7 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
mlx5_core_err(dev, "Failed to request irq. err = %d\n", err);
goto err_req_irq;
}
- if (!zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) {
- mlx5_core_warn(dev, "zalloc_cpumask_var failed\n");
- err = -ENOMEM;
- goto err_cpumask;
- }
+
if (af_desc) {
cpumask_copy(irq->mask, &af_desc->mask);
irq_set_affinity_and_hint(irq->map.virq, irq->mask);
@@ -319,8 +318,6 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
err_xa:
if (af_desc)
irq_update_affinity_hint(irq->map.virq, NULL);
- free_cpumask_var(irq->mask);
-err_cpumask:
free_irq(irq->map.virq, &irq->nh);
err_req_irq:
#ifdef CONFIG_RFS_ACCEL
@@ -333,6 +330,7 @@ err_irq_rmap:
if (i && pci_msix_can_alloc_dyn(dev->pdev))
pci_msix_free_irq(dev->pdev, irq->map);
err_alloc_irq:
+ free_cpumask_var(irq->mask);
kfree(irq);
return ERR_PTR(err);
}
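
The reworked error path above folds two failures into one branch: if
kzalloc() fails, `!irq` short-circuits before zalloc_cpumask_var() can touch
irq->mask, and kfree(NULL) is a no-op, so a single kfree(irq) covers both
cases; the cpumask now also lives long enough to be freed at err_alloc_irq.
A user-space analogue of the idiom, using only the C standard library:

#include <stdlib.h>

struct fake_irq {
	unsigned long *mask;
};

static struct fake_irq *fake_irq_alloc(size_t mask_words)
{
	struct fake_irq *irq = calloc(1, sizeof(*irq));

	/* Short-circuit: when the first calloc() fails, irq is NULL and the
	 * second operand never runs; free(NULL) is a no-op, so one cleanup
	 * statement handles both failure cases.
	 */
	if (!irq || !(irq->mask = calloc(mask_words, sizeof(*irq->mask)))) {
		free(irq);
		return NULL;
	}
	return irq;
}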
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
index 54bb0866ed72..5b83da08692d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
@@ -1422,7 +1422,6 @@ dr_action_create_reformat_action(struct mlx5dr_domain *dmn,
case DR_ACTION_TYP_TNL_L3_TO_L2:
{
u8 *hw_actions;
- int ret;
hw_actions = kzalloc(DR_ACTION_CACHE_LINE_SIZE, GFP_KERNEL);
if (!hw_actions)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
index 6fa06ba2d346..4e8527a724f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
@@ -52,6 +52,7 @@ struct dr_qp_init_attr {
u32 cqn;
u32 pdn;
u32 max_send_wr;
+ u32 max_send_sge;
struct mlx5_uars_page *uar;
u8 isolate_vl_tc:1;
};
@@ -246,6 +247,37 @@ static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
return err == CQ_POLL_ERR ? err : npolled;
}
+static int dr_qp_get_args_update_send_wqe_size(struct dr_qp_init_attr *attr)
+{
+ return roundup_pow_of_two(sizeof(struct mlx5_wqe_ctrl_seg) +
+ sizeof(struct mlx5_wqe_flow_update_ctrl_seg) +
+ sizeof(struct mlx5_wqe_header_modify_argument_update_seg));
+}
+
+/* Calculate the send WQE size for the specific RC QP functionality we need */
+static int dr_qp_calc_rc_send_wqe(struct dr_qp_init_attr *attr)
+{
+ int update_arg_size;
+ int inl_size = 0;
+ int tot_size;
+ int size;
+
+ update_arg_size = dr_qp_get_args_update_send_wqe_size(attr);
+
+ size = sizeof(struct mlx5_wqe_ctrl_seg) +
+ sizeof(struct mlx5_wqe_raddr_seg);
+ inl_size = size + ALIGN(sizeof(struct mlx5_wqe_inline_seg) +
+ DR_STE_SIZE, 16);
+
+ size += attr->max_send_sge * sizeof(struct mlx5_wqe_data_seg);
+
+ size = max(size, update_arg_size);
+
+ tot_size = max(size, inl_size);
+
+ return ALIGN(tot_size, MLX5_SEND_WQE_BB);
+}
+
static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
struct dr_qp_init_attr *attr)
{
@@ -253,6 +285,7 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
struct mlx5_wq_param wqp;
struct mlx5dr_qp *dr_qp;
+ int wqe_size;
int inlen;
void *qpc;
void *in;
@@ -332,6 +365,15 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
if (err)
goto err_in;
dr_qp->uar = attr->uar;
+ wqe_size = dr_qp_calc_rc_send_wqe(attr);
+ dr_qp->max_inline_data = min(wqe_size -
+ (sizeof(struct mlx5_wqe_ctrl_seg) +
+ sizeof(struct mlx5_wqe_raddr_seg) +
+ sizeof(struct mlx5_wqe_inline_seg)),
+ (2 * MLX5_SEND_WQE_BB -
+ (sizeof(struct mlx5_wqe_ctrl_seg) +
+ sizeof(struct mlx5_wqe_raddr_seg) +
+ sizeof(struct mlx5_wqe_inline_seg))));
return dr_qp;
@@ -395,8 +437,48 @@ dr_rdma_handle_flow_access_arg_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl,
MLX5_SEND_WQE_DS;
}
+static int dr_set_data_inl_seg(struct mlx5dr_qp *dr_qp,
+ struct dr_data_seg *data_seg, void *wqe)
+{
+ int inline_header_size = sizeof(struct mlx5_wqe_ctrl_seg) +
+ sizeof(struct mlx5_wqe_raddr_seg) +
+ sizeof(struct mlx5_wqe_inline_seg);
+ struct mlx5_wqe_inline_seg *seg;
+ int left_space;
+ int inl = 0;
+ void *addr;
+ int len;
+ int idx;
+
+ seg = wqe;
+ wqe += sizeof(*seg);
+ addr = (void *)(unsigned long)(data_seg->addr);
+ len = data_seg->length;
+ inl += len;
+ left_space = MLX5_SEND_WQE_BB - inline_header_size;
+
+ if (likely(len > left_space)) {
+ memcpy(wqe, addr, left_space);
+ len -= left_space;
+ addr += left_space;
+ idx = (dr_qp->sq.pc + 1) & (dr_qp->sq.wqe_cnt - 1);
+ wqe = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
+ }
+
+ memcpy(wqe, addr, len);
+
+ if (likely(inl)) {
+ seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);
+ return DIV_ROUND_UP(inl + sizeof(seg->byte_count),
+ MLX5_SEND_WQE_DS);
+ } else {
+ return 0;
+ }
+}
+
static void
-dr_rdma_handle_icm_write_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl,
+dr_rdma_handle_icm_write_segments(struct mlx5dr_qp *dr_qp,
+ struct mlx5_wqe_ctrl_seg *wq_ctrl,
u64 remote_addr,
u32 rkey,
struct dr_data_seg *data_seg,
@@ -412,15 +494,17 @@ dr_rdma_handle_icm_write_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl,
wq_raddr->reserved = 0;
wq_dseg = (void *)(wq_raddr + 1);
+ /* WQE ctrl segment + WQE remote addr segment */
+ *size = (sizeof(*wq_ctrl) + sizeof(*wq_raddr)) / MLX5_SEND_WQE_DS;
- wq_dseg->byte_count = cpu_to_be32(data_seg->length);
- wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
- wq_dseg->addr = cpu_to_be64(data_seg->addr);
-
- *size = (sizeof(*wq_ctrl) + /* WQE ctrl segment */
- sizeof(*wq_dseg) + /* WQE data segment */
- sizeof(*wq_raddr)) / /* WQE remote addr segment */
- MLX5_SEND_WQE_DS;
+ if (data_seg->send_flags & IB_SEND_INLINE) {
+ *size += dr_set_data_inl_seg(dr_qp, data_seg, wq_dseg);
+ } else {
+ wq_dseg->byte_count = cpu_to_be32(data_seg->length);
+ wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
+ wq_dseg->addr = cpu_to_be64(data_seg->addr);
+ *size += sizeof(*wq_dseg) / MLX5_SEND_WQE_DS; /* WQE data segment */
+ }
}
static void dr_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *wq_ctrl,
@@ -451,7 +535,7 @@ static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
switch (opcode) {
case MLX5_OPCODE_RDMA_READ:
case MLX5_OPCODE_RDMA_WRITE:
- dr_rdma_handle_icm_write_segments(wq_ctrl, remote_addr,
+ dr_rdma_handle_icm_write_segments(dr_qp, wq_ctrl, remote_addr,
rkey, data_seg, &size);
break;
case MLX5_OPCODE_FLOW_TBL_ACCESS:
@@ -572,7 +656,7 @@ static void dr_fill_write_args_segs(struct mlx5dr_send_ring *send_ring,
if (send_ring->pending_wqe % send_ring->signal_th == 0)
send_info->write.send_flags |= IB_SEND_SIGNALED;
else
- send_info->write.send_flags = 0;
+ send_info->write.send_flags &= ~IB_SEND_SIGNALED;
}
static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn,
@@ -596,9 +680,13 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn,
}
send_ring->pending_wqe++;
+ if (!send_info->write.lkey)
+ send_info->write.send_flags |= IB_SEND_INLINE;
if (send_ring->pending_wqe % send_ring->signal_th == 0)
send_info->write.send_flags |= IB_SEND_SIGNALED;
+ else
+ send_info->write.send_flags &= ~IB_SEND_SIGNALED;
send_ring->pending_wqe++;
send_info->read.length = send_info->write.length;
@@ -608,9 +696,9 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn,
send_info->read.lkey = send_ring->sync_mr->mkey;
if (send_ring->pending_wqe % send_ring->signal_th == 0)
- send_info->read.send_flags = IB_SEND_SIGNALED;
+ send_info->read.send_flags |= IB_SEND_SIGNALED;
else
- send_info->read.send_flags = 0;
+ send_info->read.send_flags &= ~IB_SEND_SIGNALED;
}
static void dr_fill_data_segs(struct mlx5dr_domain *dmn,
@@ -1257,6 +1345,7 @@ int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
dmn->send_ring->cq->qp = dmn->send_ring->qp;
dmn->info.max_send_wr = QUEUE_SIZE;
+ init_attr.max_send_sge = 1;
dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
DR_STE_SIZE);
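
Rough arithmetic behind dr_qp_calc_rc_send_wqe() and the max_inline_data
clamp above, under assumed segment sizes (16-byte ctrl and raddr segments,
4-byte inline segment header, 16-byte data segment, MLX5_SEND_WQE_BB = 64,
DR_STE_SIZE = 64; the argument-update term is left out). A stand-alone
sketch of that computation:

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/* Assumed sizes; the real values come from the mlx5 headers. */
enum {
	CTRL_SEG	= 16,	/* struct mlx5_wqe_ctrl_seg */
	RADDR_SEG	= 16,	/* struct mlx5_wqe_raddr_seg */
	INL_SEG		= 4,	/* struct mlx5_wqe_inline_seg */
	DATA_SEG	= 16,	/* struct mlx5_wqe_data_seg */
	SEND_WQE_BB	= 64,	/* MLX5_SEND_WQE_BB */
	STE_SIZE	= 64,	/* DR_STE_SIZE */
};

int main(void)
{
	int max_send_sge = 1;	/* as set in mlx5dr_send_ring_alloc() */
	int size = CTRL_SEG + RADDR_SEG;
	int inl_size = size + ALIGN(INL_SEG + STE_SIZE, 16);
	int inline_header = CTRL_SEG + RADDR_SEG + INL_SEG;
	int wqe_size, max_inline_data;

	size += max_send_sge * DATA_SEG;
	wqe_size = ALIGN(inl_size > size ? inl_size : size, SEND_WQE_BB);

	max_inline_data = wqe_size - inline_header;
	if (max_inline_data > 2 * SEND_WQE_BB - inline_header)
		max_inline_data = 2 * SEND_WQE_BB - inline_header;

	/* Prints wqe_size=128 max_inline_data=92 max_inline_size=64:
	 * a full 64-byte STE fits inline in a two-block WQE.
	 */
	printf("wqe_size=%d max_inline_data=%d max_inline_size=%d\n",
	       wqe_size, max_inline_data,
	       max_inline_data < STE_SIZE ? max_inline_data : STE_SIZE);
	return 0;
}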
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
index feb307fb3440..14f6df88b1f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c
@@ -336,7 +336,7 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns,
if (fte->action.pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_FW) {
err = -EINVAL;
mlx5dr_err(domain, "FW-owned reformat can't be used in SW rule\n");
- goto free_actions;
+ goto free_actions;
}
is_decap = fte->action.pkt_reformat->reformat_type ==
diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
index 89a9a7afa32c..6f565c0c0c3d 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
@@ -414,7 +414,6 @@ enum sparx5_pgid_type {
};
void sparx5_pgid_init(struct sparx5 *spx5);
-int sparx5_pgid_alloc_glag(struct sparx5 *spx5, u16 *idx);
int sparx5_pgid_alloc_mcast(struct sparx5 *spx5, u16 *idx);
int sparx5_pgid_free(struct sparx5 *spx5, u16 idx);
diff --git a/drivers/net/ethernet/microchip/vcap/vcap_api.h b/drivers/net/ethernet/microchip/vcap/vcap_api.h
index 62db270f65af..9eccfa633c1a 100644
--- a/drivers/net/ethernet/microchip/vcap/vcap_api.h
+++ b/drivers/net/ethernet/microchip/vcap/vcap_api.h
@@ -277,7 +277,4 @@ struct vcap_control {
struct list_head list; /* list of vcap instances */
};
-/* Set client control interface on the API */
-int vcap_api_set_client(struct vcap_control *vctrl);
-
#endif /* __VCAP_API__ */
diff --git a/drivers/net/ethernet/microchip/vcap/vcap_api_client.h b/drivers/net/ethernet/microchip/vcap/vcap_api_client.h
index d9d1f7c9d762..88641508f885 100644
--- a/drivers/net/ethernet/microchip/vcap/vcap_api_client.h
+++ b/drivers/net/ethernet/microchip/vcap/vcap_api_client.h
@@ -226,9 +226,6 @@ int vcap_chain_offset(struct vcap_control *vctrl, int from_cid, int to_cid);
bool vcap_is_next_lookup(struct vcap_control *vctrl, int cur_cid, int next_cid);
/* Is this chain id the last lookup of all VCAPs */
bool vcap_is_last_chain(struct vcap_control *vctrl, int cid, bool ingress);
-/* Provide all rules via a callback interface */
-int vcap_rule_iter(struct vcap_control *vctrl,
- int (*callback)(void *, struct vcap_rule *), void *arg);
/* Match a list of keys against the keysets available in a vcap type */
bool vcap_rule_find_keysets(struct vcap_rule *rule,
struct vcap_keyset_list *matches);
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index 87f2055c242c..e50be508c166 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -97,8 +97,6 @@ int ocelot_netdev_to_port(struct net_device *dev);
int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target,
struct device_node *portnp);
void ocelot_release_port(struct ocelot_port *ocelot_port);
-int ocelot_devlink_init(struct ocelot *ocelot);
-void ocelot_devlink_teardown(struct ocelot *ocelot);
int ocelot_port_devlink_init(struct ocelot *ocelot, int port,
enum devlink_port_flavour flavour);
void ocelot_port_devlink_teardown(struct ocelot *ocelot, int port);
diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.h b/drivers/net/ethernet/mscc/ocelot_vcap.h
index 523611ccc48f..6f546695faa5 100644
--- a/drivers/net/ethernet/mscc/ocelot_vcap.h
+++ b/drivers/net/ethernet/mscc/ocelot_vcap.h
@@ -15,7 +15,6 @@
int ocelot_vcap_filter_stats_update(struct ocelot *ocelot,
struct ocelot_vcap_filter *rule);
-void ocelot_detect_vcap_constants(struct ocelot *ocelot);
int ocelot_vcap_init(struct ocelot *ocelot);
int ocelot_setup_tc_cls_flower(struct ocelot_port_private *priv,
diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h
index 602f4d45d529..2453a40f6ee8 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic.h
+++ b/drivers/net/ethernet/pensando/ionic/ionic.h
@@ -81,7 +81,6 @@ int ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_wait);
int ionic_dev_cmd_wait_nomsg(struct ionic *ionic, unsigned long max_wait);
void ionic_dev_cmd_dev_err_print(struct ionic *ionic, u8 opcode, u8 status,
int err);
-int ionic_set_dma_mask(struct ionic *ionic);
int ionic_setup(struct ionic *ionic);
int ionic_identify(struct ionic *ionic);
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h
index 0bea208bfba2..6aac98bcb9f4 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_dev.h
+++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h
@@ -376,7 +376,6 @@ void ionic_q_cmb_map(struct ionic_queue *q, void __iomem *base, dma_addr_t base_
void ionic_q_sg_map(struct ionic_queue *q, void *base, dma_addr_t base_pa);
void ionic_q_post(struct ionic_queue *q, bool ring_doorbell, ionic_desc_cb cb,
void *cb_arg);
-void ionic_q_rewind(struct ionic_queue *q, struct ionic_desc_info *start);
void ionic_q_service(struct ionic_queue *q, struct ionic_cq_info *cq_info,
unsigned int stop_index);
int ionic_heartbeat_check(struct ionic *ionic);
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h
index 87b2666f248b..ee9e99cd1b5e 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h
+++ b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h
@@ -43,7 +43,6 @@ struct ionic_rx_filter *ionic_rx_filter_by_addr(struct ionic_lif *lif, const u8
struct ionic_rx_filter *ionic_rx_filter_rxsteer(struct ionic_lif *lif);
void ionic_rx_filter_sync(struct ionic_lif *lif);
int ionic_lif_list_addr(struct ionic_lif *lif, const u8 *addr, bool mode);
-int ionic_rx_filters_need_sync(struct ionic_lif *lif);
int ionic_lif_vlan_add(struct ionic_lif *lif, const u16 vid);
int ionic_lif_vlan_del(struct ionic_lif *lif, const u16 vid);
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.h b/drivers/net/ethernet/ti/am65-cpsw-nuss.h
index bf40c88fbd9b..f3dad2ab9828 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.h
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.h
@@ -192,7 +192,6 @@ struct am65_cpsw_ndev_priv {
extern const struct ethtool_ops am65_cpsw_ethtool_ops_slave;
-void am65_cpsw_nuss_adjust_link(struct net_device *ndev);
void am65_cpsw_nuss_set_p0_ptype(struct am65_cpsw_common *common);
void am65_cpsw_nuss_remove_tx_chns(struct am65_cpsw_common *common);
int am65_cpsw_nuss_update_tx_chns(struct am65_cpsw_common *common, int num_tx);
diff --git a/drivers/net/ethernet/ti/netcp.h b/drivers/net/ethernet/ti/netcp.h
index 43d5cd59b56b..7007eb8bed36 100644
--- a/drivers/net/ethernet/ti/netcp.h
+++ b/drivers/net/ethernet/ti/netcp.h
@@ -233,8 +233,6 @@ int netcp_register_rxhook(struct netcp_intf *netcp_priv, int order,
netcp_hook_rtn *hook_rtn, void *hook_data);
int netcp_unregister_rxhook(struct netcp_intf *netcp_priv, int order,
netcp_hook_rtn *hook_rtn, void *hook_data);
-void *netcp_device_find_module(struct netcp_device *netcp_device,
- const char *name);
/* SGMII functions */
int netcp_sgmii_reset(void __iomem *sgmii_ofs, int port);
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 6043e63b42f9..43f374444684 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -638,9 +638,7 @@ static void vrf_finish_direct(struct sk_buff *skb)
eth_zero_addr(eth->h_dest);
eth->h_proto = skb->protocol;
- rcu_read_lock_bh();
dev_queue_xmit_nit(skb, vrf_dev);
- rcu_read_unlock_bh();
skb_pull(skb, ETH_HLEN);
}
diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c
index c3ff30ab782e..9c59d0bf8c3d 100644
--- a/drivers/net/vxlan/vxlan_vnifilter.c
+++ b/drivers/net/vxlan/vxlan_vnifilter.c
@@ -696,7 +696,7 @@ static struct vxlan_vni_node *vxlan_vni_alloc(struct vxlan_dev *vxlan,
{
struct vxlan_vni_node *vninode;
- vninode = kzalloc(sizeof(*vninode), GFP_ATOMIC);
+ vninode = kzalloc(sizeof(*vninode), GFP_KERNEL);
if (!vninode)
return NULL;
vninode->stats = netdev_alloc_pcpu_stats(struct vxlan_vni_stats_pcpu);
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 3c5c68618fcc..fb996124b3d5 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -96,6 +96,27 @@ struct mptcp_out_options {
#endif
};
+#define MPTCP_SCHED_NAME_MAX 16
+#define MPTCP_SUBFLOWS_MAX 8
+
+struct mptcp_sched_data {
+ bool reinject;
+ u8 subflows;
+ struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX];
+};
+
+struct mptcp_sched_ops {
+ int (*get_subflow)(struct mptcp_sock *msk,
+ struct mptcp_sched_data *data);
+
+ char name[MPTCP_SCHED_NAME_MAX];
+ struct module *owner;
+ struct list_head list;
+
+ void (*init)(struct mptcp_sock *msk);
+ void (*release)(struct mptcp_sock *msk);
+} ____cacheline_aligned_in_smp;
+
#ifdef CONFIG_MPTCP
void mptcp_init(void);
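
These ops are consumed by net/mptcp/sched.c (added by this patch but not
shown in this excerpt). A sketch of a minimal scheduler against this
interface, modeled on the series' built-in "default" scheduler:
mptcp_subflow_get_send()/mptcp_subflow_get_retrans() are the helpers made
non-static in protocol.c below, while mptcp_register_scheduler() and
mptcp_subflow_set_scheduled() are assumed to come from sched.c:

static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk,
					   struct mptcp_sched_data *data)
{
	struct sock *ssk;

	/* Reinjections pick a retransmit candidate; regular sends keep the
	 * existing lowest wmem/wspace-ratio heuristic.
	 */
	ssk = data->reinject ? mptcp_subflow_get_retrans(msk) :
			       mptcp_subflow_get_send(msk);
	if (!ssk)
		return -EINVAL;

	mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
	return 0;
}

static struct mptcp_sched_ops mptcp_sched_default = {
	.get_subflow	= mptcp_sched_default_get_subflow,
	.name		= "default",
	.owner		= THIS_MODULE,
};

/* Registered once at boot, e.g. from mptcp_sched_init():
 *	mptcp_register_scheduler(&mptcp_sched_default);
 */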
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index a8c2817335b9..1e1b40f4e664 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -1165,7 +1165,6 @@ int ocelot_port_get_mm(struct ocelot *ocelot, int port,
struct ethtool_mm_state *state);
int ocelot_port_mqprio(struct ocelot *ocelot, int port,
struct tc_mqprio_qopt_offload *mqprio);
-void ocelot_port_update_preemptible_tcs(struct ocelot *ocelot, int port);
#if IS_ENABLED(CONFIG_BRIDGE_MRP)
int ocelot_mrp_add(struct ocelot *ocelot, int port,
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index a3829ce548f9..84e531f86b82 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -2,7 +2,7 @@
obj-$(CONFIG_MPTCP) += mptcp.o
mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
- mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o
+ mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index ae20b7d92e28..c46c22a84d23 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -32,6 +32,7 @@ struct mptcp_pernet {
u8 checksum_enabled;
u8 allow_join_initial_addr_port;
u8 pm_type;
+ char scheduler[MPTCP_SCHED_NAME_MAX];
};
static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
@@ -69,6 +70,11 @@ int mptcp_get_pm_type(const struct net *net)
return mptcp_get_pernet(net)->pm_type;
}
+const char *mptcp_get_scheduler(const struct net *net)
+{
+ return mptcp_get_pernet(net)->scheduler;
+}
+
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
@@ -77,6 +83,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4;
pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
+ strcpy(pernet->scheduler, "default");
}
#ifdef CONFIG_SYSCTL
@@ -128,6 +135,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &mptcp_pm_type_max
},
+ {
+ .procname = "scheduler",
+ .maxlen = MPTCP_SCHED_NAME_MAX,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
{}
};
@@ -149,6 +162,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[3].data = &pernet->allow_join_initial_addr_port;
table[4].data = &pernet->stale_loss_cnt;
table[5].data = &pernet->pm_type;
+ table[6].data = &pernet->scheduler;
hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 7dbbad1e4f55..d8da5374d9e1 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -299,15 +299,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
msk = mptcp_sk(sk);
- if (subflow->backup != bkup) {
+ if (subflow->backup != bkup)
subflow->backup = bkup;
- mptcp_data_lock(sk);
- if (!sock_owned_by_user(sk))
- msk->last_snd = NULL;
- else
- __set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags);
- mptcp_data_unlock(sk);
- }
mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index c75d9d88a053..9661f3812682 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -472,9 +472,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con
slow = lock_sock_fast(ssk);
if (prio) {
- if (subflow->backup != backup)
- msk->last_snd = NULL;
-
subflow->send_mp_prio = 1;
subflow->backup = backup;
subflow->request_bkup = backup;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 6019a3cf1625..933b257eee02 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1366,7 +1366,7 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
* returns the subflow that will transmit the next DSS
* additionally updates the rtx timeout
*/
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct subflow_send_info send_info[SSK_MODE_MAX];
struct mptcp_subflow_context *subflow;
@@ -1377,23 +1377,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
u64 linger_time;
long tout = 0;
- msk_owned_by_me(msk);
-
- if (__mptcp_check_fallback(msk)) {
- if (!msk->first)
- return NULL;
- return __tcp_can_send(msk->first) &&
- sk_stream_memory_free(msk->first) ? msk->first : NULL;
- }
-
- /* re-use last subflow, if the burst allow that */
- if (msk->last_snd && msk->snd_burst > 0 &&
- sk_stream_memory_free(msk->last_snd) &&
- mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
- mptcp_set_timeout(sk);
- return msk->last_snd;
- }
-
/* pick the subflow with the lower wmem/wspace ratio */
for (i = 0; i < SSK_MODE_MAX; ++i) {
send_info[i].ssk = NULL;
@@ -1446,16 +1429,13 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
wmem = READ_ONCE(ssk->sk_wmem_queued);
- if (!burst) {
- msk->last_snd = NULL;
+ if (!burst)
return ssk;
- }
subflow = mptcp_subflow_ctx(ssk);
subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
READ_ONCE(ssk->sk_pacing_rate) * burst,
burst + wmem);
- msk->last_snd = ssk;
msk->snd_burst = burst;
return ssk;
}
@@ -1499,64 +1479,106 @@ void mptcp_check_and_set_pending(struct sock *sk)
mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
}
-void __mptcp_push_pending(struct sock *sk, unsigned int flags)
+static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
+ struct mptcp_sendmsg_info *info)
{
- struct sock *prev_ssk = NULL, *ssk = NULL;
struct mptcp_sock *msk = mptcp_sk(sk);
- struct mptcp_sendmsg_info info = {
- .flags = flags,
- };
- bool do_check_data_fin = false;
struct mptcp_data_frag *dfrag;
- int len;
+ int len, copied = 0, err = 0;
while ((dfrag = mptcp_send_head(sk))) {
- info.sent = dfrag->already_sent;
- info.limit = dfrag->data_len;
+ info->sent = dfrag->already_sent;
+ info->limit = dfrag->data_len;
len = dfrag->data_len - dfrag->already_sent;
while (len > 0) {
int ret = 0;
- prev_ssk = ssk;
- ssk = mptcp_subflow_get_send(msk);
-
- /* First check. If the ssk has changed since
- * the last round, release prev_ssk
- */
- if (ssk != prev_ssk && prev_ssk)
- mptcp_push_release(prev_ssk, &info);
- if (!ssk)
- goto out;
-
- /* Need to lock the new subflow only if different
- * from the previous one, otherwise we are still
- * helding the relevant lock
- */
- if (ssk != prev_ssk)
- lock_sock(ssk);
-
- ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info);
if (ret <= 0) {
- if (ret == -EAGAIN)
- continue;
- mptcp_push_release(ssk, &info);
+ err = copied ? : ret;
goto out;
}
- do_check_data_fin = true;
- info.sent += ret;
+ info->sent += ret;
+ copied += ret;
len -= ret;
mptcp_update_post_push(msk, dfrag, ret);
}
WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+
+ if (msk->snd_burst <= 0 ||
+ !sk_stream_memory_free(ssk) ||
+ !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
+ err = copied;
+ goto out;
+ }
+ mptcp_set_timeout(sk);
+ }
+ err = copied;
+
+out:
+ return err;
+}
+
+void __mptcp_push_pending(struct sock *sk, unsigned int flags)
+{
+ struct sock *prev_ssk = NULL, *ssk = NULL;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_sendmsg_info info = {
+ .flags = flags,
+ };
+ bool do_check_data_fin = false;
+ int push_count = 1;
+
+ while (mptcp_send_head(sk) && (push_count > 0)) {
+ struct mptcp_subflow_context *subflow;
+ int ret = 0;
+
+ if (mptcp_sched_get_send(msk))
+ break;
+
+ push_count = 0;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled)) {
+ mptcp_subflow_set_scheduled(subflow, false);
+
+ prev_ssk = ssk;
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ if (ssk != prev_ssk) {
+ /* First check. If the ssk has changed since
+ * the last round, release prev_ssk
+ */
+ if (prev_ssk)
+ mptcp_push_release(prev_ssk, &info);
+
+ /* Need to lock the new subflow only if different
+ * from the previous one, otherwise we are still
+ * holding the relevant lock
+ */
+ lock_sock(ssk);
+ }
+
+ push_count++;
+
+ ret = __subflow_push_pending(sk, ssk, &info);
+ if (ret <= 0) {
+ if (ret != -EAGAIN ||
+ (1 << ssk->sk_state) &
+ (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE))
+ push_count--;
+ continue;
+ }
+ do_check_data_fin = true;
+ }
+ }
}
/* at this point we held the socket lock for the last subflow we used */
if (ssk)
mptcp_push_release(ssk, &info);
-out:
/* ensure the rtx timer is running */
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
@@ -1570,42 +1592,49 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool
struct mptcp_sendmsg_info info = {
.data_lock_held = true,
};
- struct mptcp_data_frag *dfrag;
+ bool keep_pushing = true;
struct sock *xmit_ssk;
- int len, copied = 0;
+ int copied = 0;
info.flags = 0;
- while ((dfrag = mptcp_send_head(sk))) {
- info.sent = dfrag->already_sent;
- info.limit = dfrag->data_len;
- len = dfrag->data_len - dfrag->already_sent;
- while (len > 0) {
- int ret = 0;
-
- /* check for a different subflow usage only after
- * spooling the first chunk of data
- */
- xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
- if (!xmit_ssk)
- goto out;
- if (xmit_ssk != ssk) {
- mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
- MPTCP_DELEGATE_SEND);
- goto out;
- }
+ while (mptcp_send_head(sk) && keep_pushing) {
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ int ret = 0;
- ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+ /* check for a different subflow usage only after
+ * spooling the first chunk of data
+ */
+ if (first) {
+ mptcp_subflow_set_scheduled(subflow, false);
+ ret = __subflow_push_pending(sk, ssk, &info);
+ first = false;
if (ret <= 0)
- goto out;
+ break;
+ copied += ret;
+ continue;
+ }
+
+ if (mptcp_sched_get_send(msk))
+ goto out;
- info.sent += ret;
+ if (READ_ONCE(subflow->scheduled)) {
+ mptcp_subflow_set_scheduled(subflow, false);
+ ret = __subflow_push_pending(sk, ssk, &info);
+ if (ret <= 0)
+ keep_pushing = false;
copied += ret;
- len -= ret;
- first = false;
+ }
- mptcp_update_post_push(msk, dfrag, ret);
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled)) {
+ xmit_ssk = mptcp_subflow_tcp_sock(subflow);
+ if (xmit_ssk != ssk) {
+ mptcp_subflow_delegate(subflow,
+ MPTCP_DELEGATE_SEND);
+ keep_pushing = false;
+ }
+ }
}
- WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
}
out:
@@ -2198,17 +2227,12 @@ static void mptcp_timeout_timer(struct timer_list *t)
*
* A backup subflow is returned only if that is the only kind available.
*/
-static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
{
struct sock *backup = NULL, *pick = NULL;
struct mptcp_subflow_context *subflow;
int min_stale_count = INT_MAX;
- msk_owned_by_me(msk);
-
- if (__mptcp_check_fallback(msk))
- return NULL;
-
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -2370,9 +2394,6 @@ out_release:
WRITE_ONCE(msk->first, NULL);
out:
- if (ssk == msk->last_snd)
- msk->last_snd = NULL;
-
if (need_push)
__mptcp_push_pending(sk, 0);
}
@@ -2489,16 +2510,17 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
static void __mptcp_retrans(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *subflow;
struct mptcp_sendmsg_info info = {};
struct mptcp_data_frag *dfrag;
- size_t copied = 0;
struct sock *ssk;
- int ret;
+ int ret, err;
+ u16 len = 0;
mptcp_clean_una_wakeup(sk);
/* first check ssk: need to kick "stale" logic */
- ssk = mptcp_subflow_get_retrans(msk);
+ err = mptcp_sched_get_retrans(msk);
dfrag = mptcp_rtx_head(sk);
if (!dfrag) {
if (mptcp_data_fin_enabled(msk)) {
@@ -2517,32 +2539,45 @@ static void __mptcp_retrans(struct sock *sk)
goto reset_timer;
}
- if (!ssk)
+ if (err)
goto reset_timer;
-	lock_sock(ssk);
-
-	/* limit retransmission to the bytes already sent on some subflows */
-	info.sent = 0;
-	info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent;
-	while (info.sent < info.limit) {
-		ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-		if (ret <= 0)
-			break;
-
-		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
-		copied += ret;
-		info.sent += ret;
-	}
-	if (copied) {
-		dfrag->already_sent = max(dfrag->already_sent, info.sent);
-		msk->bytes_retrans += copied;
-		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
-			 info.size_goal);
-		WRITE_ONCE(msk->allow_infinite_fallback, false);
-	}
-
-	release_sock(ssk);
+	mptcp_for_each_subflow(msk, subflow) {
+		if (READ_ONCE(subflow->scheduled)) {
+			u16 copied = 0;
+
+			mptcp_subflow_set_scheduled(subflow, false);
+
+			ssk = mptcp_subflow_tcp_sock(subflow);
+
+			lock_sock(ssk);
+
+			/* limit retransmission to the bytes already sent on some subflows */
+			info.sent = 0;
+			info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
+								    dfrag->already_sent;
+			while (info.sent < info.limit) {
+				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+				if (ret <= 0)
+					break;
+
+				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+				copied += ret;
+				info.sent += ret;
+			}
+			if (copied) {
+				len = max(copied, len);
+				tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+					 info.size_goal);
+				WRITE_ONCE(msk->allow_infinite_fallback, false);
+			}
+
+			release_sock(ssk);
+		}
+	}
+
+	msk->bytes_retrans += len;
+	dfrag->already_sent = max(dfrag->already_sent, len);
reset_timer:
mptcp_check_and_set_pending(sk);
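With retransmission now fanned out to every subflow the scheduler marked, the per-msk accounting has to be collapsed back to a single value: len keeps the largest amount replayed on any one subflow, so dfrag->already_sent and msk->bytes_retrans advance by the best coverage rather than the sum (e.g. 1000 bytes replayed on one subflow and 1460 on another advance them by 1460, not 2460). A small sketch of that reduction, with a hypothetical signature:

/* Hypothetical distillation of the accounting above: the same dfrag may be
 * replayed on several subflows; only the best per-subflow coverage counts.
 */
static u16 mptcp_retrans_coverage(const u16 *copied_per_subflow, int nr)
{
	u16 len = 0;
	int i;

	for (i = 0; i < nr; i++)
		len = max(copied_per_subflow[i], len);

	return len;	/* added to bytes_retrans, folded into already_sent */
}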
@@ -2694,6 +2729,7 @@ static void mptcp_ca_reset(struct sock *sk)
static int mptcp_init_sock(struct sock *sk)
{
struct net *net = sock_net(sk);
+ int ret;
__mptcp_init_sock(sk);
@@ -2703,6 +2739,11 @@ static int mptcp_init_sock(struct sock *sk)
if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
return -ENOMEM;
+ ret = mptcp_init_sched(mptcp_sk(sk),
+ mptcp_sched_find(mptcp_get_scheduler(net)));
+ if (ret)
+ return ret;
+
set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
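Socket initialisation now resolves the per-namespace scheduler name (read via mptcp_get_scheduler()) into ops at socket creation time; an unknown name simply makes mptcp_sched_find() return NULL, which mptcp_init_sched() maps to the built-in "default" scheduler (see sched.c below) instead of failing the socket. A compact sketch of that resolution path, with a hypothetical wrapper name:

/* Hypothetical wrapper making the fallback explicit: an unregistered name
 * yields NULL from mptcp_sched_find(), which mptcp_init_sched() maps to
 * the built-in "default" scheduler rather than failing the socket.
 */
static int mptcp_sock_pick_scheduler(struct mptcp_sock *msk, struct net *net)
{
	struct mptcp_sched_ops *sched;

	sched = mptcp_sched_find(mptcp_get_scheduler(net));
	return mptcp_init_sched(msk, sched);	/* NULL means "default" */
}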
@@ -2848,6 +2889,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
mptcp_stop_timer(sk);
sk_stop_timer(sk, &sk->sk_timer);
msk->pm.status = 0;
+ mptcp_release_sched(msk);
sk->sk_prot->destroy(sk);
@@ -3037,7 +3079,6 @@ static int mptcp_disconnect(struct sock *sk, int flags)
* subflow
*/
mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
- msk->last_snd = NULL;
WRITE_ONCE(msk->flags, 0);
msk->cb_flags = 0;
msk->push_pending = 0;
@@ -3103,6 +3144,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
msk->snd_una = msk->write_seq;
msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
+ mptcp_init_sched(msk, mptcp_sk(sk)->sched);
/* passive msk is created after the first/MPC subflow */
msk->subflow_id = 2;
@@ -3307,8 +3349,6 @@ static void mptcp_release_cb(struct sock *sk)
__mptcp_set_connected(sk);
if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
__mptcp_error_report(sk);
- if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags))
- msk->last_snd = NULL;
}
__mptcp_update_rmem(sk);
@@ -3925,6 +3965,7 @@ void __init mptcp_proto_init(void)
mptcp_subflow_init();
mptcp_pm_init();
+ mptcp_sched_init();
mptcp_token_init();
if (proto_register(&mptcp_prot, 1) != 0)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 38c7ea013361..7254b3562575 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -123,7 +123,6 @@
#define MPTCP_RETRANSMIT 4
#define MPTCP_FLUSH_JOIN_LIST 5
#define MPTCP_CONNECTED 6
-#define MPTCP_RESET_SCHEDULER 7
struct mptcp_skb_cb {
u64 map_seq;
@@ -269,7 +268,6 @@ struct mptcp_sock {
u64 rcv_data_fin_seq;
u64 bytes_retrans;
int rmem_fwd_alloc;
- struct sock *last_snd;
int snd_burst;
int old_wspace;
u64 recovery_snd_nxt; /* in recovery mode accept up to this seq;
@@ -314,6 +312,7 @@ struct mptcp_sock {
* lock as such sock is freed after close().
*/
struct mptcp_pm_data pm;
+ struct mptcp_sched_ops *sched;
struct {
u32 space; /* bytes copied in last measurement window */
u32 copied; /* bytes copied in this measurement window */
@@ -492,6 +491,7 @@ struct mptcp_subflow_context {
is_mptfo : 1, /* subflow is doing TFO */
__unused : 9;
enum mptcp_data_avail data_avail;
+ bool scheduled;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
@@ -625,6 +625,7 @@ int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
int mptcp_get_pm_type(const struct net *net);
+const char *mptcp_get_scheduler(const struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
const struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
@@ -657,6 +658,19 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
struct sockaddr_storage *addr,
unsigned short family);
+struct mptcp_sched_ops *mptcp_sched_find(const char *name);
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_sched_init(void);
+int mptcp_init_sched(struct mptcp_sock *msk,
+ struct mptcp_sched_ops *sched);
+void mptcp_release_sched(struct mptcp_sock *msk);
+void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
+ bool scheduled);
+struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk);
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
+int mptcp_sched_get_send(struct mptcp_sock *msk);
+int mptcp_sched_get_retrans(struct mptcp_sock *msk);
static inline bool __tcp_can_send(const struct sock *ssk)
{
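Taken together, the declarations added above (mptcp_register_scheduler(), mptcp_unregister_scheduler(), mptcp_subflow_set_scheduled() and the get_subflow hook) are the whole surface an alternative scheduler needs. As a hedged sketch only, assuming the mptcp_sched_ops layout used by the default scheduler in sched.c below, an out-of-tree "first subflow only" policy could be registered like this; the module, its name and its policy are hypothetical:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical "first subflow only" scheduler built on the API declared
 * above. Only the get_subflow/name/owner hooks and the register/unregister
 * calls come from this series; the policy itself is illustrative.
 */
#include <linux/module.h>
#include "protocol.h"

static int mptcp_sched_first_get_subflow(struct mptcp_sock *msk,
					 struct mptcp_sched_data *data)
{
	struct sock *ssk = msk->first;

	/* refuse the pick so the caller falls back to its error path */
	if (!ssk || !__tcp_can_send(ssk))
		return -EINVAL;

	mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
	return 0;
}

static struct mptcp_sched_ops mptcp_sched_first = {
	.get_subflow	= mptcp_sched_first_get_subflow,
	.name		= "first",
	.owner		= THIS_MODULE,
};

static int __init mptcp_sched_first_init(void)
{
	return mptcp_register_scheduler(&mptcp_sched_first);
}

static void __exit mptcp_sched_first_exit(void)
{
	mptcp_unregister_scheduler(&mptcp_sched_first);
}

module_init(mptcp_sched_first_init);
module_exit(mptcp_sched_first_exit);
MODULE_LICENSE("GPL");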
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
new file mode 100644
index 000000000000..4ab0693c069c
--- /dev/null
+++ b/net/mptcp/sched.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2022, SUSE.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include "protocol.h"
+
+static DEFINE_SPINLOCK(mptcp_sched_list_lock);
+static LIST_HEAD(mptcp_sched_list);
+
+static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk,
+ struct mptcp_sched_data *data)
+{
+ struct sock *ssk;
+
+ ssk = data->reinject ? mptcp_subflow_get_retrans(msk) :
+ mptcp_subflow_get_send(msk);
+ if (!ssk)
+ return -EINVAL;
+
+ mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
+ return 0;
+}
+
+static struct mptcp_sched_ops mptcp_sched_default = {
+ .get_subflow = mptcp_sched_default_get_subflow,
+ .name = "default",
+ .owner = THIS_MODULE,
+};
+
+/* Must be called with rcu read lock held */
+struct mptcp_sched_ops *mptcp_sched_find(const char *name)
+{
+ struct mptcp_sched_ops *sched, *ret = NULL;
+
+ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
+ if (!strcmp(sched->name, name)) {
+ ret = sched;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
+{
+ if (!sched->get_subflow)
+ return -EINVAL;
+
+ spin_lock(&mptcp_sched_list_lock);
+ if (mptcp_sched_find(sched->name)) {
+ spin_unlock(&mptcp_sched_list_lock);
+ return -EEXIST;
+ }
+ list_add_tail_rcu(&sched->list, &mptcp_sched_list);
+ spin_unlock(&mptcp_sched_list_lock);
+
+ pr_debug("%s registered", sched->name);
+ return 0;
+}
+
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
+{
+ if (sched == &mptcp_sched_default)
+ return;
+
+ spin_lock(&mptcp_sched_list_lock);
+ list_del_rcu(&sched->list);
+ spin_unlock(&mptcp_sched_list_lock);
+}
+
+void mptcp_sched_init(void)
+{
+ mptcp_register_scheduler(&mptcp_sched_default);
+}
+
+int mptcp_init_sched(struct mptcp_sock *msk,
+ struct mptcp_sched_ops *sched)
+{
+ if (!sched)
+ sched = &mptcp_sched_default;
+
+ if (!bpf_try_module_get(sched, sched->owner))
+ return -EBUSY;
+
+ msk->sched = sched;
+ if (msk->sched->init)
+ msk->sched->init(msk);
+
+ pr_debug("sched=%s", msk->sched->name);
+
+ return 0;
+}
+
+void mptcp_release_sched(struct mptcp_sock *msk)
+{
+ struct mptcp_sched_ops *sched = msk->sched;
+
+ if (!sched)
+ return;
+
+ msk->sched = NULL;
+ if (sched->release)
+ sched->release(msk);
+
+ bpf_module_put(sched, sched->owner);
+}
+
+void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
+ bool scheduled)
+{
+ WRITE_ONCE(subflow->scheduled, scheduled);
+}
+
+int mptcp_sched_get_send(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sched_data data;
+
+ msk_owned_by_me(msk);
+
+ /* the following check is moved out of mptcp_subflow_get_send */
+ if (__mptcp_check_fallback(msk)) {
+ if (msk->first &&
+ __tcp_can_send(msk->first) &&
+ sk_stream_memory_free(msk->first)) {
+ mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true);
+ return 0;
+ }
+ return -EINVAL;
+ }
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled))
+ return 0;
+ }
+
+ data.reinject = false;
+ if (msk->sched == &mptcp_sched_default || !msk->sched)
+ return mptcp_sched_default_get_subflow(msk, &data);
+ return msk->sched->get_subflow(msk, &data);
+}
+
+int mptcp_sched_get_retrans(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sched_data data;
+
+ msk_owned_by_me(msk);
+
+ /* the following check is moved out of mptcp_subflow_get_retrans */
+ if (__mptcp_check_fallback(msk))
+ return -EINVAL;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (READ_ONCE(subflow->scheduled))
+ return 0;
+ }
+
+ data.reinject = true;
+ if (msk->sched == &mptcp_sched_default || !msk->sched)
+ return mptcp_sched_default_get_subflow(msk, &data);
+ return msk->sched->get_subflow(msk, &data);
+}