40 files changed, 603 insertions, 251 deletions
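As a reading aid before the per-file hunks: the mlx5 changes below add four per-ring aRFS counters alongside the existing rx[i]_arfs_err. The following is a hedged sketch of the rule lifecycle they trace — the enum and its names are invented for illustration; only the counter semantics come from the counters.rst and en_arfs.c hunks:

enum arfs_counter {
	ARFS_ADD,		/* rx[i]_arfs_add: rule created and installed for ring i */
	ARFS_REQUEST_IN,	/* rx[i]_arfs_request_in: existing rule retargeted to ring i */
	ARFS_REQUEST_OUT,	/* rx[i]_arfs_request_out: rule moved away from ring i */
	ARFS_EXPIRED,		/* rx[i]_arfs_expired: idle rule aged out and removed */
	ARFS_ERR,		/* rx[i]_arfs_err: rule could not be allocated, added or re-steered */
};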
diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst index 008e560e12b5..f69ee1ebee01 100644 --- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst @@ -346,6 +346,24 @@ the software port. - The number of receive packets with CQE compression on ring i [#accel]_. - Acceleration + * - `rx[i]_arfs_add` + - The number of aRFS flow rules added to the device for direct RQ steering + on ring i [#accel]_. + - Acceleration + + * - `rx[i]_arfs_request_in` + - Number of flow rules that have been requested to move into ring i for + direct RQ steering [#accel]_. + - Acceleration + + * - `rx[i]_arfs_request_out` + - Number of flow rules that have been requested to move out of ring i [#accel]_. + - Acceleration + + * - `rx[i]_arfs_expired` + - Number of flow rules that have been expired and removed [#accel]_. + - Acceleration + * - `rx[i]_arfs_err` - Number of flow rules that failed to be added to the flow table. - Error @@ -445,11 +463,6 @@ the software port. context. - Error - * - `rx[i]_xsk_arfs_err` - - aRFS (accelerated Receive Flow Steering) does not occur in the XSK RQ - context, so this counter should never increment. - - Error - * - `rx[i]_xdp_tx_xmit` - The number of packets forwarded back to the port due to XDP program `XDP_TX` action (bouncing). these packets are not counted by other diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst index 43b1f7e87ec4..0a42c3395ffa 100644 --- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst +++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst @@ -36,7 +36,7 @@ Enabling the driver and kconfig options **CONFIG_MLX5_CORE_EN_DCB=(y/n)**: -| Enables `Data Center Bridging (DCB) Support <https://community.mellanox.com/s/article/howto-auto-config-pfc-and-ets-on-connectx-4-via-lldp-dcbx>`_. +| Enables `Data Center Bridging (DCB) Support <https://enterprise-support.nvidia.com/s/article/howto-auto-config-pfc-and-ets-on-connectx-4-via-lldp-dcbx>`_. **CONFIG_MLX5_CORE_IPOIB=(y/n)** @@ -59,12 +59,12 @@ Enabling the driver and kconfig options **CONFIG_MLX5_EN_ARFS=(y/n)** | Enables Hardware-accelerated receive flow steering (arfs) support, and ntuple filtering. -| https://community.mellanox.com/s/article/howto-configure-arfs-on-connectx-4 +| https://enterprise-support.nvidia.com/s/article/howto-configure-arfs-on-connectx-4 **CONFIG_MLX5_EN_IPSEC=(y/n)** -| Enables `IPSec XFRM cryptography-offload acceleration <https://support.mellanox.com/s/article/ConnectX-6DX-Bluefield-2-IPsec-HW-Full-Offload-Configuration-Guide>`_. +| Enables :ref:`IPSec XFRM cryptography-offload acceleration <xfrm_device>`. **CONFIG_MLX5_EN_MACSEC=(y/n)** @@ -87,8 +87,8 @@ Enabling the driver and kconfig options | Ethernet SRIOV E-Switch support in ConnectX NIC. E-Switch provides internal SRIOV packet steering | and switching for the enabled VFs and PF in two available modes: -| 1) `Legacy SRIOV mode (L2 mac vlan steering based) <https://community.mellanox.com/s/article/howto-configure-sr-iov-for-connectx-4-connectx-5-with-kvm--ethernet-x>`_. -| 2) `Switchdev mode (eswitch offloads) <https://www.mellanox.com/related-docs/prod_software/ASAP2_Hardware_Offloading_for_vSwitches_User_Manual_v4.4.pdf>`_. 
+| 1) `Legacy SRIOV mode (L2 mac vlan steering based) <https://enterprise-support.nvidia.com/s/article/HowTo-Configure-SR-IOV-for-ConnectX-4-ConnectX-5-ConnectX-6-with-KVM-Ethernet>`_. +| 2) :ref:`Switchdev mode (eswitch offloads) <switchdev>`. **CONFIG_MLX5_FPGA=(y/n)** @@ -101,13 +101,13 @@ Enabling the driver and kconfig options **CONFIG_MLX5_INFINIBAND=(y/n/m)** (module mlx5_ib.ko) -| Provides low-level InfiniBand/RDMA and `RoCE <https://community.mellanox.com/s/article/recommended-network-configuration-examples-for-roce-deployment>`_ support. +| Provides low-level InfiniBand/RDMA and `RoCE <https://enterprise-support.nvidia.com/s/article/recommended-network-configuration-examples-for-roce-deployment>`_ support. **CONFIG_MLX5_MPFS=(y/n)** | Ethernet Multi-Physical Function Switch (MPFS) support in ConnectX NIC. -| MPFs is required for when `Multi-Host <http://www.mellanox.com/page/multihost>`_ configuration is enabled to allow passing +| MPFs is required for when `Multi-Host <https://www.nvidia.com/en-us/networking/multi-host/>`_ configuration is enabled to allow passing | user configured unicast MAC addresses to the requesting PF. diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 213510698014..15f1919d640c 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -74,3 +74,11 @@ stale_loss_cnt - INTEGER This is a per-namespace sysctl. Default: 4 + +scheduler - STRING + Select the scheduler of your choice. + + Support for selection of different schedulers. This is a per-namespace + sysctl. + + Default: "default" diff --git a/Documentation/networking/xfrm_device.rst b/Documentation/networking/xfrm_device.rst index 83abdfef4ec3..535077cbeb07 100644 --- a/Documentation/networking/xfrm_device.rst +++ b/Documentation/networking/xfrm_device.rst @@ -1,4 +1,5 @@ .. SPDX-License-Identifier: GPL-2.0 +.. 
_xfrm_device: =============================================== XFRM device - offloading the IPsec computations diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index f398bec78457..ed7212e61c54 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5718,6 +5718,7 @@ static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev, */ bond_for_each_slave(bond, slave, iter) { if (bond_slave_can_tx(slave)) { + bond_update_speed_duplex(slave); if (slave->speed != SPEED_UNKNOWN) { if (BOND_MODE(bond) == BOND_MODE_BROADCAST) speed = bond_mode_bcast_speed(slave, diff --git a/drivers/net/dsa/microchip/ksz8.h b/drivers/net/dsa/microchip/ksz8.h index e68465fdf6b9..4cea811e73ac 100644 --- a/drivers/net/dsa/microchip/ksz8.h +++ b/drivers/net/dsa/microchip/ksz8.h @@ -48,13 +48,11 @@ int ksz8_port_mirror_add(struct ksz_device *dev, int port, bool ingress, struct netlink_ext_ack *extack); void ksz8_port_mirror_del(struct ksz_device *dev, int port, struct dsa_mall_mirror_tc_entry *mirror); -int ksz8_get_stp_reg(void); void ksz8_get_caps(struct ksz_device *dev, int port, struct phylink_config *config); void ksz8_config_cpu_port(struct dsa_switch *ds); int ksz8_enable_stp_addr(struct ksz_device *dev); int ksz8_reset_switch(struct ksz_device *dev); -int ksz8_switch_detect(struct ksz_device *dev); int ksz8_switch_init(struct ksz_device *dev); void ksz8_switch_exit(struct ksz_device *dev); int ksz8_change_mtu(struct ksz_device *dev, int port, int mtu); diff --git a/drivers/net/dsa/microchip/ksz9477.h b/drivers/net/dsa/microchip/ksz9477.h index b6f7e3c46e3f..a6f425866a29 100644 --- a/drivers/net/dsa/microchip/ksz9477.h +++ b/drivers/net/dsa/microchip/ksz9477.h @@ -36,7 +36,6 @@ int ksz9477_port_mirror_add(struct ksz_device *dev, int port, bool ingress, struct netlink_ext_ack *extack); void ksz9477_port_mirror_del(struct ksz_device *dev, int port, struct dsa_mall_mirror_tc_entry *mirror); -int ksz9477_get_stp_reg(void); void ksz9477_get_caps(struct ksz_device *dev, int port, struct phylink_config *config); int ksz9477_fdb_dump(struct ksz_device *dev, int port, @@ -54,7 +53,6 @@ void ksz9477_config_cpu_port(struct dsa_switch *ds); int ksz9477_tc_cbs_set_cinc(struct ksz_device *dev, int port, u32 val); int ksz9477_enable_stp_addr(struct ksz_device *dev); int ksz9477_reset_switch(struct ksz_device *dev); -int ksz9477_dsa_init(struct ksz_device *dev); int ksz9477_switch_init(struct ksz_device *dev); void ksz9477_switch_exit(struct ksz_device *dev); void ksz9477_port_queue_split(struct ksz_device *dev, int port); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index 5aa51d74f8b4..bb7f86c993e5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -432,8 +432,10 @@ static void arfs_may_expire_flow(struct mlx5e_priv *priv) } spin_unlock_bh(&arfs->arfs_lock); hlist_for_each_entry_safe(arfs_rule, htmp, &del_list, hlist) { - if (arfs_rule->rule) + if (arfs_rule->rule) { mlx5_del_flow_rules(arfs_rule->rule); + priv->channel_stats[arfs_rule->rxq]->rq.arfs_expired++; + } hlist_del(&arfs_rule->hlist); kfree(arfs_rule); } @@ -509,6 +511,7 @@ static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv *priv, spec = kvzalloc(sizeof(*spec), GFP_KERNEL); if (!spec) { + priv->channel_stats[arfs_rule->rxq]->rq.arfs_err++; err = -ENOMEM; goto out; } @@ -519,6 +522,8 @@ static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv 
*priv, ntohs(tuple->etype)); arfs_table = arfs_get_table(arfs, tuple->ip_proto, tuple->etype); if (!arfs_table) { + WARN_ONCE(1, "arfs table does not exist for etype %u and ip_proto %u\n", + tuple->etype, tuple->ip_proto); err = -EINVAL; goto out; } @@ -600,9 +605,11 @@ static void arfs_modify_rule_rq(struct mlx5e_priv *priv, dst.type = MLX5_FLOW_DESTINATION_TYPE_TIR; dst.tir_num = mlx5e_rx_res_get_tirn_direct(priv->rx_res, rxq); err = mlx5_modify_rule_destination(rule, &dst, NULL); - if (err) + if (err) { + priv->channel_stats[rxq]->rq.arfs_err++; netdev_warn(priv->netdev, "Failed to modify aRFS rule destination to rq=%d\n", rxq); + } } static void arfs_handle_work(struct work_struct *work) @@ -632,6 +639,7 @@ static void arfs_handle_work(struct work_struct *work) if (IS_ERR(rule)) goto out; arfs_rule->rule = rule; + priv->channel_stats[arfs_rule->rxq]->rq.arfs_add++; } else { arfs_modify_rule_rq(priv, arfs_rule->rule, arfs_rule->rxq); @@ -650,8 +658,10 @@ static struct arfs_rule *arfs_alloc_rule(struct mlx5e_priv *priv, struct arfs_tuple *tuple; rule = kzalloc(sizeof(*rule), GFP_ATOMIC); - if (!rule) + if (!rule) { + priv->channel_stats[rxq]->rq.arfs_err++; return NULL; + } rule->priv = priv; rule->rxq = rxq; @@ -740,10 +750,13 @@ int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, spin_lock_bh(&arfs->arfs_lock); arfs_rule = arfs_find_rule(arfs_t, &fk); if (arfs_rule) { - if (arfs_rule->rxq == rxq_index) { + if (arfs_rule->rxq == rxq_index || work_busy(&arfs_rule->arfs_work)) { spin_unlock_bh(&arfs->arfs_lock); return arfs_rule->filter_id; } + + priv->channel_stats[rxq_index]->rq.arfs_request_in++; + priv->channel_stats[arfs_rule->rxq]->rq.arfs_request_out++; arfs_rule->rxq = rxq_index; } else { arfs_rule = arfs_alloc_rule(priv, arfs_t, &fk, rxq_index, flow_id); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 52141729444e..348fb140858c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -180,7 +180,13 @@ static const struct counter_desc sw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_blks) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_pkts) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_congst_umr) }, +#ifdef CONFIG_MLX5_EN_ARFS + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_add) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_request_in) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_request_out) }, + { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_expired) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_arfs_err) }, +#endif { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_recover) }, #ifdef CONFIG_PAGE_POOL_STATS { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_pp_alloc_fast) }, @@ -231,7 +237,6 @@ static const struct counter_desc sw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_cqe_compress_blks) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_cqe_compress_pkts) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_congst_umr) }, - { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_arfs_err) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_xmit) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_mpwqe) }, { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_inlnw) }, @@ -321,7 +326,6 @@ static void mlx5e_stats_grp_sw_update_stats_xskrq(struct mlx5e_sw_stats *s, s->rx_xsk_cqe_compress_blks += 
xskrq_stats->cqe_compress_blks; s->rx_xsk_cqe_compress_pkts += xskrq_stats->cqe_compress_pkts; s->rx_xsk_congst_umr += xskrq_stats->congst_umr; - s->rx_xsk_arfs_err += xskrq_stats->arfs_err; } static void mlx5e_stats_grp_sw_update_stats_rq_stats(struct mlx5e_sw_stats *s, @@ -354,7 +358,13 @@ static void mlx5e_stats_grp_sw_update_stats_rq_stats(struct mlx5e_sw_stats *s, s->rx_cqe_compress_blks += rq_stats->cqe_compress_blks; s->rx_cqe_compress_pkts += rq_stats->cqe_compress_pkts; s->rx_congst_umr += rq_stats->congst_umr; +#ifdef CONFIG_MLX5_EN_ARFS + s->rx_arfs_add += rq_stats->arfs_add; + s->rx_arfs_request_in += rq_stats->arfs_request_in; + s->rx_arfs_request_out += rq_stats->arfs_request_out; + s->rx_arfs_expired += rq_stats->arfs_expired; s->rx_arfs_err += rq_stats->arfs_err; +#endif s->rx_recover += rq_stats->recover; #ifdef CONFIG_PAGE_POOL_STATS s->rx_pp_alloc_fast += rq_stats->pp_alloc_fast; @@ -1990,7 +2000,13 @@ static const struct counter_desc rq_stats_desc[] = { { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_blks) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, congst_umr) }, +#ifdef CONFIG_MLX5_EN_ARFS + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_add) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_request_in) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_request_out) }, + { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_expired) }, { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, arfs_err) }, +#endif { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, recover) }, #ifdef CONFIG_PAGE_POOL_STATS { MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, pp_alloc_fast) }, @@ -2092,7 +2108,6 @@ static const struct counter_desc xskrq_stats_desc[] = { { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, cqe_compress_blks) }, { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) }, { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, congst_umr) }, - { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, arfs_err) }, }; static const struct counter_desc xsksq_stats_desc[] = { @@ -2168,7 +2183,6 @@ static const struct counter_desc ptp_rq_stats_desc[] = { { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, cqe_compress_blks) }, { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) }, { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, congst_umr) }, - { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, arfs_err) }, { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, recover) }, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 409e9a47e433..176fa5976259 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -194,7 +194,13 @@ struct mlx5e_sw_stats { u64 rx_cqe_compress_blks; u64 rx_cqe_compress_pkts; u64 rx_congst_umr; +#ifdef CONFIG_MLX5_EN_ARFS + u64 rx_arfs_add; + u64 rx_arfs_request_in; + u64 rx_arfs_request_out; + u64 rx_arfs_expired; u64 rx_arfs_err; +#endif u64 rx_recover; u64 ch_events; u64 ch_poll; @@ -256,7 +262,6 @@ struct mlx5e_sw_stats { u64 rx_xsk_cqe_compress_blks; u64 rx_xsk_cqe_compress_pkts; u64 rx_xsk_congst_umr; - u64 rx_xsk_arfs_err; u64 tx_xsk_xmit; u64 tx_xsk_mpwqe; u64 tx_xsk_inlnw; @@ -358,7 +363,13 @@ struct mlx5e_rq_stats { u64 cqe_compress_blks; u64 cqe_compress_pkts; u64 congst_umr; +#ifdef CONFIG_MLX5_EN_ARFS + u64 arfs_add; + u64 arfs_request_in; + u64 arfs_request_out; + u64 arfs_expired; u64 arfs_err; 
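/* Review annotation (not part of the patch): the CONFIG_MLX5_EN_ARFS guard
 * that closes just below compiles the five aRFS fields out of
 * struct mlx5e_rq_stats entirely when aRFS support is disabled, rather than
 * carrying always-zero counters; the matching #ifdef/#endif pairs in
 * en_stats.c above guard both the descriptor-table entries and the
 * aggregation into struct mlx5e_sw_stats, so ethtool -S only reports these
 * counters on CONFIG_MLX5_EN_ARFS builds. */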
+#endif u64 recover; #ifdef CONFIG_PAGE_POOL_STATS u64 pp_alloc_fast; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index 0313432d50a1..234fd4c28e79 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -16,8 +16,7 @@ mlx5_esw_get_port_parent_id(struct mlx5_core_dev *dev, struct netdev_phys_item_i static bool mlx5_esw_devlink_port_supported(struct mlx5_eswitch *esw, u16 vport_num) { - return vport_num == MLX5_VPORT_UPLINK || - (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF) || + return (mlx5_core_is_ecpf(esw->dev) && vport_num == MLX5_VPORT_PF) || mlx5_eswitch_is_vf_vport(esw, vport_num) || mlx5_core_is_ec_vf_vport(esw->dev, vport_num); } @@ -25,7 +24,6 @@ static bool mlx5_esw_devlink_port_supported(struct mlx5_eswitch *esw, u16 vport_ static struct devlink_port *mlx5_esw_dl_port_alloc(struct mlx5_eswitch *esw, u16 vport_num) { struct mlx5_core_dev *dev = esw->dev; - struct devlink_port_attrs attrs = {}; struct netdev_phys_item_id ppid = {}; struct devlink_port *dl_port; u32 controller_num = 0; @@ -42,13 +40,7 @@ static struct devlink_port *mlx5_esw_dl_port_alloc(struct mlx5_eswitch *esw, u16 if (external) controller_num = dev->priv.eswitch->offloads.host_number + 1; - if (vport_num == MLX5_VPORT_UPLINK) { - attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; - attrs.phys.port_number = pfnum; - memcpy(attrs.switch_id.id, ppid.id, ppid.id_len); - attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_set(dl_port, &attrs); - } else if (vport_num == MLX5_VPORT_PF) { + if (vport_num == MLX5_VPORT_PF) { memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len); dl_port->attrs.switch_id.id_len = ppid.id_len; devlink_port_attrs_pci_pf_set(dl_port, controller_num, pfnum, external); @@ -71,7 +63,7 @@ static void mlx5_esw_dl_port_free(struct devlink_port *dl_port) kfree(dl_port); } -static const struct devlink_port_ops mlx5_esw_dl_port_ops = { +static const struct devlink_port_ops mlx5_esw_pf_vf_dl_port_ops = { .port_fn_hw_addr_get = mlx5_devlink_port_fn_hw_addr_get, .port_fn_hw_addr_set = mlx5_devlink_port_fn_hw_addr_set, .port_fn_roce_get = mlx5_devlink_port_fn_roce_get, @@ -103,7 +95,7 @@ int mlx5_esw_offloads_devlink_port_register(struct mlx5_eswitch *esw, u16 vport_ devlink = priv_to_devlink(dev); dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, vport_num); err = devl_port_register_with_ops(devlink, dl_port, dl_port_index, - &mlx5_esw_dl_port_ops); + &mlx5_esw_pf_vf_dl_port_ops); if (err) goto reg_err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c index 455746952260..095f31f380fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/ipsec_fs.c @@ -316,7 +316,7 @@ void mlx5_esw_ipsec_restore_dest_uplink(struct mlx5_core_dev *mdev) err = mlx5_esw_ipsec_modify_flow_dests(esw, flow); if (err) mlx5_core_warn_once(mdev, - "Faided to modify flow dests for IPsec"); + "Failed to modify flow dests for IPsec"); } rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 46b8c60ac39a..c0b1b7b66cff 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ 
-2542,11 +2542,9 @@ int mlx5_esw_offloads_load_rep(struct mlx5_eswitch *esw, u16 vport_num) if (esw->mode != MLX5_ESWITCH_OFFLOADS) return 0; - if (vport_num != MLX5_VPORT_UPLINK) { - err = mlx5_esw_offloads_devlink_port_register(esw, vport_num); - if (err) - return err; - } + err = mlx5_esw_offloads_devlink_port_register(esw, vport_num); + if (err) + return err; err = mlx5_esw_offloads_rep_load(esw, vport_num); if (err) @@ -2554,8 +2552,7 @@ int mlx5_esw_offloads_load_rep(struct mlx5_eswitch *esw, u16 vport_num) return err; load_err: - if (vport_num != MLX5_VPORT_UPLINK) - mlx5_esw_offloads_devlink_port_unregister(esw, vport_num); + mlx5_esw_offloads_devlink_port_unregister(esw, vport_num); return err; } @@ -2566,8 +2563,7 @@ void mlx5_esw_offloads_unload_rep(struct mlx5_eswitch *esw, u16 vport_num) mlx5_esw_offloads_rep_unload(esw, vport_num); - if (vport_num != MLX5_VPORT_UPLINK) - mlx5_esw_offloads_devlink_port_unregister(esw, vport_num); + mlx5_esw_offloads_devlink_port_unregister(esw, vport_num); } static int esw_set_slave_root_fdb(struct mlx5_core_dev *master, @@ -3471,7 +3467,7 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) vport->info.link_state = MLX5_VPORT_ADMIN_STATE_DOWN; /* Uplink vport rep must load first. */ - err = mlx5_esw_offloads_load_rep(esw, MLX5_VPORT_UPLINK); + err = mlx5_esw_offloads_rep_load(esw, MLX5_VPORT_UPLINK); if (err) goto err_uplink; @@ -3482,7 +3478,7 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) return 0; err_vports: - mlx5_esw_offloads_unload_rep(esw, MLX5_VPORT_UPLINK); + mlx5_esw_offloads_rep_unload(esw, MLX5_VPORT_UPLINK); err_uplink: esw_offloads_steering_cleanup(esw); err_steering_init: @@ -3520,7 +3516,7 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw, void esw_offloads_disable(struct mlx5_eswitch *esw) { mlx5_eswitch_disable_pf_vf_vports(esw); - mlx5_esw_offloads_unload_rep(esw, MLX5_VPORT_UPLINK); + mlx5_esw_offloads_rep_unload(esw, MLX5_VPORT_UPLINK); esw_set_passing_vport_metadata(esw, false); esw_offloads_steering_cleanup(esw); mapping_destroy(esw->offloads.reg_c0_obj_pool); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 187cb2c464f8..2fb2598b775e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -50,20 +50,6 @@ enum { }; enum { - MLX5_HEALTH_SYNDR_FW_ERR = 0x1, - MLX5_HEALTH_SYNDR_IRISC_ERR = 0x7, - MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR = 0x8, - MLX5_HEALTH_SYNDR_CRC_ERR = 0x9, - MLX5_HEALTH_SYNDR_FETCH_PCI_ERR = 0xa, - MLX5_HEALTH_SYNDR_HW_FTL_ERR = 0xb, - MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR = 0xc, - MLX5_HEALTH_SYNDR_EQ_ERR = 0xd, - MLX5_HEALTH_SYNDR_EQ_INV = 0xe, - MLX5_HEALTH_SYNDR_FFSER_ERR = 0xf, - MLX5_HEALTH_SYNDR_HIGH_TEMP = 0x10 -}; - -enum { MLX5_DROP_HEALTH_WORK, }; @@ -357,27 +343,27 @@ static int mlx5_health_try_recover(struct mlx5_core_dev *dev) static const char *hsynd_str(u8 synd) { switch (synd) { - case MLX5_HEALTH_SYNDR_FW_ERR: + case MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_INTERNAL_ERR: return "firmware internal error"; - case MLX5_HEALTH_SYNDR_IRISC_ERR: + case MLX5_INITIAL_SEG_HEALTH_SYNDROME_DEAD_IRISC: return "irisc not responding"; - case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR: + case MLX5_INITIAL_SEG_HEALTH_SYNDROME_HW_FATAL_ERR: return "unrecoverable hardware error"; - case MLX5_HEALTH_SYNDR_CRC_ERR: + case MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_CRC_ERR: return "firmware CRC error"; - case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: + case 
MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_FETCH_PCI_ERR: return "ICM fetch PCI error"; - case MLX5_HEALTH_SYNDR_HW_FTL_ERR: + case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PAGE_ERR: return "HW fatal error\n"; - case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: + case MLX5_INITIAL_SEG_HEALTH_SYNDROME_ASYNCHRONOUS_EQ_BUF_OVERRUN: return "async EQ buffer overrun"; - case MLX5_HEALTH_SYNDR_EQ_ERR: + case MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_IN_ERR: return "EQ error"; - case MLX5_HEALTH_SYNDR_EQ_INV: + case MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_INV: return "Invalid EQ referenced"; - case MLX5_HEALTH_SYNDR_FFSER_ERR: + case MLX5_INITIAL_SEG_HEALTH_SYNDROME_FFSER_ERR: return "FFSER error"; - case MLX5_HEALTH_SYNDR_HIGH_TEMP: + case MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR: return "High temperature"; default: return "unrecognized error"; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c index feb62d952643..00e67910e3ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c @@ -256,14 +256,15 @@ int mlx5_devcom_send_event(struct mlx5_devcom_comp_dev *devcom, int event, int rollback_event, void *event_data) { - struct mlx5_devcom_comp *comp = devcom->comp; struct mlx5_devcom_comp_dev *pos; + struct mlx5_devcom_comp *comp; int err = 0; void *data; if (IS_ERR_OR_NULL(devcom)) return -ENODEV; + comp = devcom->comp; down_write(&comp->sem); list_for_each_entry(pos, &comp->comp_dev_list_head, list) { data = rcu_dereference_protected(pos->data, lockdep_is_held(&comp->sem)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 33a133c9918c..653648216730 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -259,8 +259,11 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, int err; irq = kzalloc(sizeof(*irq), GFP_KERNEL); - if (!irq) + if (!irq || !zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) { + kfree(irq); return ERR_PTR(-ENOMEM); + } + if (!i || !pci_msix_can_alloc_dyn(dev->pdev)) { /* The vector at index 0 is always statically allocated. If * dynamic irq is not supported all vectors are statically @@ -297,11 +300,7 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, mlx5_core_err(dev, "Failed to request irq. 
err = %d\n", err); goto err_req_irq; } - if (!zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) { - mlx5_core_warn(dev, "zalloc_cpumask_var failed\n"); - err = -ENOMEM; - goto err_cpumask; - } + if (af_desc) { cpumask_copy(irq->mask, &af_desc->mask); irq_set_affinity_and_hint(irq->map.virq, irq->mask); @@ -319,8 +318,6 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, err_xa: if (af_desc) irq_update_affinity_hint(irq->map.virq, NULL); - free_cpumask_var(irq->mask); -err_cpumask: free_irq(irq->map.virq, &irq->nh); err_req_irq: #ifdef CONFIG_RFS_ACCEL @@ -333,6 +330,7 @@ err_irq_rmap: if (i && pci_msix_can_alloc_dyn(dev->pdev)) pci_msix_free_irq(dev->pdev, irq->map); err_alloc_irq: + free_cpumask_var(irq->mask); kfree(irq); return ERR_PTR(err); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 54bb0866ed72..5b83da08692d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -1422,7 +1422,6 @@ dr_action_create_reformat_action(struct mlx5dr_domain *dmn, case DR_ACTION_TYP_TNL_L3_TO_L2: { u8 *hw_actions; - int ret; hw_actions = kzalloc(DR_ACTION_CACHE_LINE_SIZE, GFP_KERNEL); if (!hw_actions) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 6fa06ba2d346..4e8527a724f5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -52,6 +52,7 @@ struct dr_qp_init_attr { u32 cqn; u32 pdn; u32 max_send_wr; + u32 max_send_sge; struct mlx5_uars_page *uar; u8 isolate_vl_tc:1; }; @@ -246,6 +247,37 @@ static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne) return err == CQ_POLL_ERR ? 
err : npolled; } +static int dr_qp_get_args_update_send_wqe_size(struct dr_qp_init_attr *attr) +{ + return roundup_pow_of_two(sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_flow_update_ctrl_seg) + + sizeof(struct mlx5_wqe_header_modify_argument_update_seg)); +} + +/* We calculate for specific RC QP with the required functionality */ +static int dr_qp_calc_rc_send_wqe(struct dr_qp_init_attr *attr) +{ + int update_arg_size; + int inl_size = 0; + int tot_size; + int size; + + update_arg_size = dr_qp_get_args_update_send_wqe_size(attr); + + size = sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg); + inl_size = size + ALIGN(sizeof(struct mlx5_wqe_inline_seg) + + DR_STE_SIZE, 16); + + size += attr->max_send_sge * sizeof(struct mlx5_wqe_data_seg); + + size = max(size, update_arg_size); + + tot_size = max(size, inl_size); + + return ALIGN(tot_size, MLX5_SEND_WQE_BB); +} + static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, struct dr_qp_init_attr *attr) { @@ -253,6 +285,7 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {}; struct mlx5_wq_param wqp; struct mlx5dr_qp *dr_qp; + int wqe_size; int inlen; void *qpc; void *in; @@ -332,6 +365,15 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, if (err) goto err_in; dr_qp->uar = attr->uar; + wqe_size = dr_qp_calc_rc_send_wqe(attr); + dr_qp->max_inline_data = min(wqe_size - + (sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_inline_seg)), + (2 * MLX5_SEND_WQE_BB - + (sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_inline_seg)))); return dr_qp; @@ -395,8 +437,48 @@ dr_rdma_handle_flow_access_arg_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl, MLX5_SEND_WQE_DS; } +static int dr_set_data_inl_seg(struct mlx5dr_qp *dr_qp, + struct dr_data_seg *data_seg, void *wqe) +{ + int inline_header_size = sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_raddr_seg) + + sizeof(struct mlx5_wqe_inline_seg); + struct mlx5_wqe_inline_seg *seg; + int left_space; + int inl = 0; + void *addr; + int len; + int idx; + + seg = wqe; + wqe += sizeof(*seg); + addr = (void *)(unsigned long)(data_seg->addr); + len = data_seg->length; + inl += len; + left_space = MLX5_SEND_WQE_BB - inline_header_size; + + if (likely(len > left_space)) { + memcpy(wqe, addr, left_space); + len -= left_space; + addr += left_space; + idx = (dr_qp->sq.pc + 1) & (dr_qp->sq.wqe_cnt - 1); + wqe = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx); + } + + memcpy(wqe, addr, len); + + if (likely(inl)) { + seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); + return DIV_ROUND_UP(inl + sizeof(seg->byte_count), + MLX5_SEND_WQE_DS); + } else { + return 0; + } +} + static void -dr_rdma_handle_icm_write_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl, +dr_rdma_handle_icm_write_segments(struct mlx5dr_qp *dr_qp, + struct mlx5_wqe_ctrl_seg *wq_ctrl, u64 remote_addr, u32 rkey, struct dr_data_seg *data_seg, @@ -412,15 +494,17 @@ dr_rdma_handle_icm_write_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl, wq_raddr->reserved = 0; wq_dseg = (void *)(wq_raddr + 1); + /* WQE ctrl segment + WQE remote addr segment */ + *size = (sizeof(*wq_ctrl) + sizeof(*wq_raddr)) / MLX5_SEND_WQE_DS; - wq_dseg->byte_count = cpu_to_be32(data_seg->length); - wq_dseg->lkey = cpu_to_be32(data_seg->lkey); - wq_dseg->addr = cpu_to_be64(data_seg->addr); - - *size = (sizeof(*wq_ctrl) + /* WQE ctrl segment */ - sizeof(*wq_dseg) + /* WQE data 
segment */ - sizeof(*wq_raddr)) / /* WQE remote addr segment */ - MLX5_SEND_WQE_DS; + if (data_seg->send_flags & IB_SEND_INLINE) { + *size += dr_set_data_inl_seg(dr_qp, data_seg, wq_dseg); + } else { + wq_dseg->byte_count = cpu_to_be32(data_seg->length); + wq_dseg->lkey = cpu_to_be32(data_seg->lkey); + wq_dseg->addr = cpu_to_be64(data_seg->addr); + *size += sizeof(*wq_dseg) / MLX5_SEND_WQE_DS; /* WQE data segment */ + } } static void dr_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *wq_ctrl, @@ -451,7 +535,7 @@ static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr, switch (opcode) { case MLX5_OPCODE_RDMA_READ: case MLX5_OPCODE_RDMA_WRITE: - dr_rdma_handle_icm_write_segments(wq_ctrl, remote_addr, + dr_rdma_handle_icm_write_segments(dr_qp, wq_ctrl, remote_addr, rkey, data_seg, &size); break; case MLX5_OPCODE_FLOW_TBL_ACCESS: @@ -572,7 +656,7 @@ static void dr_fill_write_args_segs(struct mlx5dr_send_ring *send_ring, if (send_ring->pending_wqe % send_ring->signal_th == 0) send_info->write.send_flags |= IB_SEND_SIGNALED; else - send_info->write.send_flags = 0; + send_info->write.send_flags &= ~IB_SEND_SIGNALED; } static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, @@ -596,9 +680,13 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, } send_ring->pending_wqe++; + if (!send_info->write.lkey) + send_info->write.send_flags |= IB_SEND_INLINE; if (send_ring->pending_wqe % send_ring->signal_th == 0) send_info->write.send_flags |= IB_SEND_SIGNALED; + else + send_info->write.send_flags &= ~IB_SEND_SIGNALED; send_ring->pending_wqe++; send_info->read.length = send_info->write.length; @@ -608,9 +696,9 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, send_info->read.lkey = send_ring->sync_mr->mkey; if (send_ring->pending_wqe % send_ring->signal_th == 0) - send_info->read.send_flags = IB_SEND_SIGNALED; + send_info->read.send_flags |= IB_SEND_SIGNALED; else - send_info->read.send_flags = 0; + send_info->read.send_flags &= ~IB_SEND_SIGNALED; } static void dr_fill_data_segs(struct mlx5dr_domain *dmn, @@ -1257,6 +1345,7 @@ int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn) dmn->send_ring->cq->qp = dmn->send_ring->qp; dmn->info.max_send_wr = QUEUE_SIZE; + init_attr.max_send_sge = 1; dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data, DR_STE_SIZE); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index feb307fb3440..14f6df88b1f9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -336,7 +336,7 @@ static int mlx5_cmd_dr_create_fte(struct mlx5_flow_root_namespace *ns, if (fte->action.pkt_reformat->owner == MLX5_FLOW_RESOURCE_OWNER_FW) { err = -EINVAL; mlx5dr_err(domain, "FW-owned reformat can't be used in SW rule\n"); - goto free_actions; + goto free_actions; } is_decap = fte->action.pkt_reformat->reformat_type == diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h index 89a9a7afa32c..6f565c0c0c3d 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h @@ -414,7 +414,6 @@ enum sparx5_pgid_type { }; void sparx5_pgid_init(struct sparx5 *spx5); -int sparx5_pgid_alloc_glag(struct sparx5 *spx5, u16 *idx); int sparx5_pgid_alloc_mcast(struct sparx5 *spx5, u16 *idx); int sparx5_pgid_free(struct sparx5 *spx5, u16 idx); diff --git 
a/drivers/net/ethernet/microchip/vcap/vcap_api.h b/drivers/net/ethernet/microchip/vcap/vcap_api.h index 62db270f65af..9eccfa633c1a 100644 --- a/drivers/net/ethernet/microchip/vcap/vcap_api.h +++ b/drivers/net/ethernet/microchip/vcap/vcap_api.h @@ -277,7 +277,4 @@ struct vcap_control { struct list_head list; /* list of vcap instances */ }; -/* Set client control interface on the API */ -int vcap_api_set_client(struct vcap_control *vctrl); - #endif /* __VCAP_API__ */ diff --git a/drivers/net/ethernet/microchip/vcap/vcap_api_client.h b/drivers/net/ethernet/microchip/vcap/vcap_api_client.h index d9d1f7c9d762..88641508f885 100644 --- a/drivers/net/ethernet/microchip/vcap/vcap_api_client.h +++ b/drivers/net/ethernet/microchip/vcap/vcap_api_client.h @@ -226,9 +226,6 @@ int vcap_chain_offset(struct vcap_control *vctrl, int from_cid, int to_cid); bool vcap_is_next_lookup(struct vcap_control *vctrl, int cur_cid, int next_cid); /* Is this chain id the last lookup of all VCAPs */ bool vcap_is_last_chain(struct vcap_control *vctrl, int cid, bool ingress); -/* Provide all rules via a callback interface */ -int vcap_rule_iter(struct vcap_control *vctrl, - int (*callback)(void *, struct vcap_rule *), void *arg); /* Match a list of keys against the keysets available in a vcap type */ bool vcap_rule_find_keysets(struct vcap_rule *rule, struct vcap_keyset_list *matches); diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h index 87f2055c242c..e50be508c166 100644 --- a/drivers/net/ethernet/mscc/ocelot.h +++ b/drivers/net/ethernet/mscc/ocelot.h @@ -97,8 +97,6 @@ int ocelot_netdev_to_port(struct net_device *dev); int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target, struct device_node *portnp); void ocelot_release_port(struct ocelot_port *ocelot_port); -int ocelot_devlink_init(struct ocelot *ocelot); -void ocelot_devlink_teardown(struct ocelot *ocelot); int ocelot_port_devlink_init(struct ocelot *ocelot, int port, enum devlink_port_flavour flavour); void ocelot_port_devlink_teardown(struct ocelot *ocelot, int port); diff --git a/drivers/net/ethernet/mscc/ocelot_vcap.h b/drivers/net/ethernet/mscc/ocelot_vcap.h index 523611ccc48f..6f546695faa5 100644 --- a/drivers/net/ethernet/mscc/ocelot_vcap.h +++ b/drivers/net/ethernet/mscc/ocelot_vcap.h @@ -15,7 +15,6 @@ int ocelot_vcap_filter_stats_update(struct ocelot *ocelot, struct ocelot_vcap_filter *rule); -void ocelot_detect_vcap_constants(struct ocelot *ocelot); int ocelot_vcap_init(struct ocelot *ocelot); int ocelot_setup_tc_cls_flower(struct ocelot_port_private *priv, diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h index 602f4d45d529..2453a40f6ee8 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic.h +++ b/drivers/net/ethernet/pensando/ionic/ionic.h @@ -81,7 +81,6 @@ int ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_wait); int ionic_dev_cmd_wait_nomsg(struct ionic *ionic, unsigned long max_wait); void ionic_dev_cmd_dev_err_print(struct ionic *ionic, u8 opcode, u8 status, int err); -int ionic_set_dma_mask(struct ionic *ionic); int ionic_setup(struct ionic *ionic); int ionic_identify(struct ionic *ionic); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h index 0bea208bfba2..6aac98bcb9f4 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h @@ -376,7 +376,6 @@ void ionic_q_cmb_map(struct ionic_queue *q, void __iomem *base, 
dma_addr_t base_ void ionic_q_sg_map(struct ionic_queue *q, void *base, dma_addr_t base_pa); void ionic_q_post(struct ionic_queue *q, bool ring_doorbell, ionic_desc_cb cb, void *cb_arg); -void ionic_q_rewind(struct ionic_queue *q, struct ionic_desc_info *start); void ionic_q_service(struct ionic_queue *q, struct ionic_cq_info *cq_info, unsigned int stop_index); int ionic_heartbeat_check(struct ionic *ionic); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h index 87b2666f248b..ee9e99cd1b5e 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.h @@ -43,7 +43,6 @@ struct ionic_rx_filter *ionic_rx_filter_by_addr(struct ionic_lif *lif, const u8 struct ionic_rx_filter *ionic_rx_filter_rxsteer(struct ionic_lif *lif); void ionic_rx_filter_sync(struct ionic_lif *lif); int ionic_lif_list_addr(struct ionic_lif *lif, const u8 *addr, bool mode); -int ionic_rx_filters_need_sync(struct ionic_lif *lif); int ionic_lif_vlan_add(struct ionic_lif *lif, const u16 vid); int ionic_lif_vlan_del(struct ionic_lif *lif, const u16 vid); diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.h b/drivers/net/ethernet/ti/am65-cpsw-nuss.h index bf40c88fbd9b..f3dad2ab9828 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.h +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.h @@ -192,7 +192,6 @@ struct am65_cpsw_ndev_priv { extern const struct ethtool_ops am65_cpsw_ethtool_ops_slave; -void am65_cpsw_nuss_adjust_link(struct net_device *ndev); void am65_cpsw_nuss_set_p0_ptype(struct am65_cpsw_common *common); void am65_cpsw_nuss_remove_tx_chns(struct am65_cpsw_common *common); int am65_cpsw_nuss_update_tx_chns(struct am65_cpsw_common *common, int num_tx); diff --git a/drivers/net/ethernet/ti/netcp.h b/drivers/net/ethernet/ti/netcp.h index 43d5cd59b56b..7007eb8bed36 100644 --- a/drivers/net/ethernet/ti/netcp.h +++ b/drivers/net/ethernet/ti/netcp.h @@ -233,8 +233,6 @@ int netcp_register_rxhook(struct netcp_intf *netcp_priv, int order, netcp_hook_rtn *hook_rtn, void *hook_data); int netcp_unregister_rxhook(struct netcp_intf *netcp_priv, int order, netcp_hook_rtn *hook_rtn, void *hook_data); -void *netcp_device_find_module(struct netcp_device *netcp_device, - const char *name); /* SGMII functions */ int netcp_sgmii_reset(void __iomem *sgmii_ofs, int port); diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 6043e63b42f9..43f374444684 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -638,9 +638,7 @@ static void vrf_finish_direct(struct sk_buff *skb) eth_zero_addr(eth->h_dest); eth->h_proto = skb->protocol; - rcu_read_lock_bh(); dev_queue_xmit_nit(skb, vrf_dev); - rcu_read_unlock_bh(); skb_pull(skb, ETH_HLEN); } diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index c3ff30ab782e..9c59d0bf8c3d 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -696,7 +696,7 @@ static struct vxlan_vni_node *vxlan_vni_alloc(struct vxlan_dev *vxlan, { struct vxlan_vni_node *vninode; - vninode = kzalloc(sizeof(*vninode), GFP_ATOMIC); + vninode = kzalloc(sizeof(*vninode), GFP_KERNEL); if (!vninode) return NULL; vninode->stats = netdev_alloc_pcpu_stats(struct vxlan_vni_stats_pcpu); diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 3c5c68618fcc..fb996124b3d5 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -96,6 +96,27 @@ struct mptcp_out_options { #endif }; +#define MPTCP_SCHED_NAME_MAX 16 +#define 
MPTCP_SUBFLOWS_MAX 8 + +struct mptcp_sched_data { + bool reinject; + u8 subflows; + struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; +}; + +struct mptcp_sched_ops { + int (*get_subflow)(struct mptcp_sock *msk, + struct mptcp_sched_data *data); + + char name[MPTCP_SCHED_NAME_MAX]; + struct module *owner; + struct list_head list; + + void (*init)(struct mptcp_sock *msk); + void (*release)(struct mptcp_sock *msk); +} ____cacheline_aligned_in_smp; + #ifdef CONFIG_MPTCP void mptcp_init(void); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index a8c2817335b9..1e1b40f4e664 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -1165,7 +1165,6 @@ int ocelot_port_get_mm(struct ocelot *ocelot, int port, struct ethtool_mm_state *state); int ocelot_port_mqprio(struct ocelot *ocelot, int port, struct tc_mqprio_qopt_offload *mqprio); -void ocelot_port_update_preemptible_tcs(struct ocelot *ocelot, int port); #if IS_ENABLED(CONFIG_BRIDGE_MRP) int ocelot_mrp_add(struct ocelot *ocelot, int port, diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index a3829ce548f9..84e531f86b82 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o + mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index ae20b7d92e28..c46c22a84d23 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -32,6 +32,7 @@ struct mptcp_pernet { u8 checksum_enabled; u8 allow_join_initial_addr_port; u8 pm_type; + char scheduler[MPTCP_SCHED_NAME_MAX]; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) @@ -69,6 +70,11 @@ int mptcp_get_pm_type(const struct net *net) return mptcp_get_pernet(net)->pm_type; } +const char *mptcp_get_scheduler(const struct net *net) +{ + return mptcp_get_pernet(net)->scheduler; +} + static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; @@ -77,6 +83,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) pernet->allow_join_initial_addr_port = 1; pernet->stale_loss_cnt = 4; pernet->pm_type = MPTCP_PM_TYPE_KERNEL; + strcpy(pernet->scheduler, "default"); } #ifdef CONFIG_SYSCTL @@ -128,6 +135,12 @@ static struct ctl_table mptcp_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &mptcp_pm_type_max }, + { + .procname = "scheduler", + .maxlen = MPTCP_SCHED_NAME_MAX, + .mode = 0644, + .proc_handler = proc_dostring, + }, {} }; @@ -149,6 +162,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[3].data = &pernet->allow_join_initial_addr_port; table[4].data = &pernet->stale_loss_cnt; table[5].data = &pernet->pm_type; + table[6].data = &pernet->scheduler; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 7dbbad1e4f55..d8da5374d9e1 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -299,15 +299,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup) pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); msk = mptcp_sk(sk); - if (subflow->backup != bkup) { + if (subflow->backup != bkup) subflow->backup = bkup; - mptcp_data_lock(sk); - if (!sock_owned_by_user(sk)) - msk->last_snd = NULL; - else - __set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags); - 
mptcp_data_unlock(sk); - } mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC); } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index c75d9d88a053..9661f3812682 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -472,9 +472,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con slow = lock_sock_fast(ssk); if (prio) { - if (subflow->backup != backup) - msk->last_snd = NULL; - subflow->send_mp_prio = 1; subflow->backup = backup; subflow->request_bkup = backup; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 6019a3cf1625..933b257eee02 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1366,7 +1366,7 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ -static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) +struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; @@ -1377,23 +1377,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) u64 linger_time; long tout = 0; - msk_owned_by_me(msk); - - if (__mptcp_check_fallback(msk)) { - if (!msk->first) - return NULL; - return __tcp_can_send(msk->first) && - sk_stream_memory_free(msk->first) ? msk->first : NULL; - } - - /* re-use last subflow, if the burst allow that */ - if (msk->last_snd && msk->snd_burst > 0 && - sk_stream_memory_free(msk->last_snd) && - mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { - mptcp_set_timeout(sk); - return msk->last_snd; - } - /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; @@ -1446,16 +1429,13 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); - if (!burst) { - msk->last_snd = NULL; + if (!burst) return ssk; - } subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, burst + wmem); - msk->last_snd = ssk; msk->snd_burst = burst; return ssk; } @@ -1499,64 +1479,106 @@ void mptcp_check_and_set_pending(struct sock *sk) mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING); } -void __mptcp_push_pending(struct sock *sk, unsigned int flags) +static int __subflow_push_pending(struct sock *sk, struct sock *ssk, + struct mptcp_sendmsg_info *info) { - struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); - struct mptcp_sendmsg_info info = { - .flags = flags, - }; - bool do_check_data_fin = false; struct mptcp_data_frag *dfrag; - int len; + int len, copied = 0, err = 0; while ((dfrag = mptcp_send_head(sk))) { - info.sent = dfrag->already_sent; - info.limit = dfrag->data_len; + info->sent = dfrag->already_sent; + info->limit = dfrag->data_len; len = dfrag->data_len - dfrag->already_sent; while (len > 0) { int ret = 0; - prev_ssk = ssk; - ssk = mptcp_subflow_get_send(msk); - - /* First check. 
If the ssk has changed since - * the last round, release prev_ssk - */ - if (ssk != prev_ssk && prev_ssk) - mptcp_push_release(prev_ssk, &info); - if (!ssk) - goto out; - - /* Need to lock the new subflow only if different - * from the previous one, otherwise we are still - * helding the relevant lock - */ - if (ssk != prev_ssk) - lock_sock(ssk); - - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); if (ret <= 0) { - if (ret == -EAGAIN) - continue; - mptcp_push_release(ssk, &info); + err = copied ? : ret; goto out; } - do_check_data_fin = true; - info.sent += ret; + info->sent += ret; + copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + + if (msk->snd_burst <= 0 || + !sk_stream_memory_free(ssk) || + !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) { + err = copied; + goto out; + } + mptcp_set_timeout(sk); + } + err = copied; + +out: + return err; +} + +void __mptcp_push_pending(struct sock *sk, unsigned int flags) +{ + struct sock *prev_ssk = NULL, *ssk = NULL; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info = { + .flags = flags, + }; + bool do_check_data_fin = false; + int push_count = 1; + + while (mptcp_send_head(sk) && (push_count > 0)) { + struct mptcp_subflow_context *subflow; + int ret = 0; + + if (mptcp_sched_get_send(msk)) + break; + + push_count = 0; + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + mptcp_subflow_set_scheduled(subflow, false); + + prev_ssk = ssk; + ssk = mptcp_subflow_tcp_sock(subflow); + if (ssk != prev_ssk) { + /* First check. If the ssk has changed since + * the last round, release prev_ssk + */ + if (prev_ssk) + mptcp_push_release(prev_ssk, &info); + + /* Need to lock the new subflow only if different + * from the previous one, otherwise we are still + * helding the relevant lock + */ + lock_sock(ssk); + } + + push_count++; + + ret = __subflow_push_pending(sk, ssk, &info); + if (ret <= 0) { + if (ret != -EAGAIN || + (1 << ssk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE)) + push_count--; + continue; + } + do_check_data_fin = true; + } + } } /* at this point we held the socket lock for the last subflow we used */ if (ssk) mptcp_push_release(ssk, &info); -out: /* ensure the rtx timer is running */ if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); @@ -1570,42 +1592,49 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool struct mptcp_sendmsg_info info = { .data_lock_held = true, }; - struct mptcp_data_frag *dfrag; + bool keep_pushing = true; struct sock *xmit_ssk; - int len, copied = 0; + int copied = 0; info.flags = 0; - while ((dfrag = mptcp_send_head(sk))) { - info.sent = dfrag->already_sent; - info.limit = dfrag->data_len; - len = dfrag->data_len - dfrag->already_sent; - while (len > 0) { - int ret = 0; - - /* check for a different subflow usage only after - * spooling the first chunk of data - */ - xmit_ssk = first ? 
-			xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
-			if (!xmit_ssk)
-				goto out;
-			if (xmit_ssk != ssk) {
-				mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
-						       MPTCP_DELEGATE_SEND);
-				goto out;
-			}
+	while (mptcp_send_head(sk) && keep_pushing) {
+		struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+		int ret = 0;
 
-			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+		/* check for a different subflow usage only after
+		 * spooling the first chunk of data
+		 */
+		if (first) {
+			mptcp_subflow_set_scheduled(subflow, false);
+			ret = __subflow_push_pending(sk, ssk, &info);
+			first = false;
 			if (ret <= 0)
-				goto out;
+				break;
+			copied += ret;
+			continue;
+		}
+
+		if (mptcp_sched_get_send(msk))
+			goto out;
 
-			info.sent += ret;
+		if (READ_ONCE(subflow->scheduled)) {
+			mptcp_subflow_set_scheduled(subflow, false);
+			ret = __subflow_push_pending(sk, ssk, &info);
+			if (ret <= 0)
+				keep_pushing = false;
 			copied += ret;
-			len -= ret;
-			first = false;
+		}
 
-			mptcp_update_post_push(msk, dfrag, ret);
+		mptcp_for_each_subflow(msk, subflow) {
+			if (READ_ONCE(subflow->scheduled)) {
+				xmit_ssk = mptcp_subflow_tcp_sock(subflow);
+				if (xmit_ssk != ssk) {
+					mptcp_subflow_delegate(subflow,
+							       MPTCP_DELEGATE_SEND);
+					keep_pushing = false;
+				}
+			}
 		}
-		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
 	}
 
 out:
@@ -2198,17 +2227,12 @@ static void mptcp_timeout_timer(struct timer_list *t)
  *
  * A backup subflow is returned only if that is the only kind available.
  */
-static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
 {
 	struct sock *backup = NULL, *pick = NULL;
 	struct mptcp_subflow_context *subflow;
 	int min_stale_count = INT_MAX;
 
-	msk_owned_by_me(msk);
-
-	if (__mptcp_check_fallback(msk))
-		return NULL;
-
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
@@ -2370,9 +2394,6 @@ out_release:
 		WRITE_ONCE(msk->first, NULL);
 
 out:
-	if (ssk == msk->last_snd)
-		msk->last_snd = NULL;
-
 	if (need_push)
 		__mptcp_push_pending(sk, 0);
 }
@@ -2489,16 +2510,17 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 static void __mptcp_retrans(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_subflow_context *subflow;
 	struct mptcp_sendmsg_info info = {};
 	struct mptcp_data_frag *dfrag;
-	size_t copied = 0;
 	struct sock *ssk;
-	int ret;
+	int ret, err;
+	u16 len = 0;
 
 	mptcp_clean_una_wakeup(sk);
 
 	/* first check ssk: need to kick "stale" logic */
-	ssk = mptcp_subflow_get_retrans(msk);
+	err = mptcp_sched_get_retrans(msk);
 	dfrag = mptcp_rtx_head(sk);
 	if (!dfrag) {
 		if (mptcp_data_fin_enabled(msk)) {
@@ -2517,32 +2539,45 @@ static void __mptcp_retrans(struct sock *sk)
 		goto reset_timer;
 	}
 
-	if (!ssk)
+	if (err)
 		goto reset_timer;
 
-	lock_sock(ssk);
+	mptcp_for_each_subflow(msk, subflow) {
+		if (READ_ONCE(subflow->scheduled)) {
+			u16 copied = 0;
 
-	/* limit retransmission to the bytes already sent on some subflows */
-	info.sent = 0;
-	info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent;
-	while (info.sent < info.limit) {
-		ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-		if (ret <= 0)
-			break;
+			mptcp_subflow_set_scheduled(subflow, false);
 
-		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
-		copied += ret;
-		info.sent += ret;
-	}
-	if (copied) {
-		dfrag->already_sent = max(dfrag->already_sent, info.sent);
-		msk->bytes_retrans += copied;
-		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
-			 info.size_goal);
-		WRITE_ONCE(msk->allow_infinite_fallback, false);
+			ssk = mptcp_subflow_tcp_sock(subflow);
+
+			lock_sock(ssk);
+
+			/* limit retransmission to the bytes already sent on some subflows */
+			info.sent = 0;
+			info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
+								    dfrag->already_sent;
+			while (info.sent < info.limit) {
+				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+				if (ret <= 0)
+					break;
+
+				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+				copied += ret;
+				info.sent += ret;
+			}
+			if (copied) {
+				len = max(copied, len);
+				tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+					 info.size_goal);
+				WRITE_ONCE(msk->allow_infinite_fallback, false);
+			}
+
+			release_sock(ssk);
+		}
 	}
 
-	release_sock(ssk);
+	msk->bytes_retrans += len;
+	dfrag->already_sent = max(dfrag->already_sent, len);
 
 reset_timer:
 	mptcp_check_and_set_pending(sk);
@@ -2694,6 +2729,7 @@ static void mptcp_ca_reset(struct sock *sk)
 static int mptcp_init_sock(struct sock *sk)
 {
 	struct net *net = sock_net(sk);
+	int ret;
 
 	__mptcp_init_sock(sk);
 
@@ -2703,6 +2739,11 @@ static int mptcp_init_sock(struct sock *sk)
 	if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
 		return -ENOMEM;
 
+	ret = mptcp_init_sched(mptcp_sk(sk),
+			       mptcp_sched_find(mptcp_get_scheduler(net)));
+	if (ret)
+		return ret;
+
 	set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
 
 	/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
@@ -2848,6 +2889,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
 	mptcp_stop_timer(sk);
 	sk_stop_timer(sk, &sk->sk_timer);
 	msk->pm.status = 0;
+	mptcp_release_sched(msk);
 
 	sk->sk_prot->destroy(sk);
 
@@ -3037,7 +3079,6 @@ static int mptcp_disconnect(struct sock *sk, int flags)
 	 * subflow
 	 */
 	mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
-	msk->last_snd = NULL;
 	WRITE_ONCE(msk->flags, 0);
 	msk->cb_flags = 0;
 	msk->push_pending = 0;
@@ -3103,6 +3144,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
 	msk->snd_una = msk->write_seq;
 	msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
 	msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
+	mptcp_init_sched(msk, mptcp_sk(sk)->sched);
 
 	/* passive msk is created after the first/MPC subflow */
 	msk->subflow_id = 2;
@@ -3307,8 +3349,6 @@ static void mptcp_release_cb(struct sock *sk)
 			__mptcp_set_connected(sk);
 		if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
 			__mptcp_error_report(sk);
-		if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags))
-			msk->last_snd = NULL;
 	}
 
 	__mptcp_update_rmem(sk);
@@ -3925,6 +3965,7 @@ void __init mptcp_proto_init(void)
 	mptcp_subflow_init();
 	mptcp_pm_init();
+	mptcp_sched_init();
 	mptcp_token_init();
 
 	if (proto_register(&mptcp_prot, 1) != 0)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 38c7ea013361..7254b3562575 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -123,7 +123,6 @@
 #define MPTCP_RETRANSMIT	4
 #define MPTCP_FLUSH_JOIN_LIST	5
 #define MPTCP_CONNECTED		6
-#define MPTCP_RESET_SCHEDULER	7
 
 struct mptcp_skb_cb {
 	u64 map_seq;
@@ -269,7 +268,6 @@ struct mptcp_sock {
 	u64		rcv_data_fin_seq;
 	u64		bytes_retrans;
 	int		rmem_fwd_alloc;
-	struct sock	*last_snd;
 	int		snd_burst;
 	int		old_wspace;
 	u64		recovery_snd_nxt;	/* in recovery mode accept up to this seq;
@@ -314,6 +312,7 @@ struct mptcp_sock {
 					 * lock as such sock is freed after close().
 					 */
 	struct mptcp_pm_data	pm;
+	struct mptcp_sched_ops	*sched;
 	struct {
 		u32	space;	/* bytes copied in last measurement window */
 		u32	copied; /* bytes copied in this measurement window */
@@ -492,6 +491,7 @@ struct mptcp_subflow_context {
 		is_mptfo : 1,	/* subflow is doing TFO */
 		__unused : 9;
 	enum mptcp_data_avail data_avail;
+	bool	scheduled;
 	u32	remote_nonce;
 	u64	thmac;
 	u32	local_nonce;
@@ -625,6 +625,7 @@ int mptcp_is_checksum_enabled(const struct net *net);
 int mptcp_allow_join_id0(const struct net *net);
 unsigned int mptcp_stale_loss_cnt(const struct net *net);
 int mptcp_get_pm_type(const struct net *net);
+const char *mptcp_get_scheduler(const struct net *net);
 void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
 				     const struct mptcp_options_received *mp_opt);
 bool __mptcp_retransmit_pending_data(struct sock *sk);
@@ -657,6 +658,19 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
 void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
 			 struct sockaddr_storage *addr,
 			 unsigned short family);
+struct mptcp_sched_ops *mptcp_sched_find(const char *name);
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_sched_init(void);
+int mptcp_init_sched(struct mptcp_sock *msk,
+		     struct mptcp_sched_ops *sched);
+void mptcp_release_sched(struct mptcp_sock *msk);
+void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
+				 bool scheduled);
+struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk);
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
+int mptcp_sched_get_send(struct mptcp_sock *msk);
+int mptcp_sched_get_retrans(struct mptcp_sock *msk);
 
 static inline bool __tcp_can_send(const struct sock *ssk)
 {
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
new file mode 100644
index 000000000000..4ab0693c069c
--- /dev/null
+++ b/net/mptcp/sched.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2022, SUSE.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include "protocol.h"
+
+static DEFINE_SPINLOCK(mptcp_sched_list_lock);
+static LIST_HEAD(mptcp_sched_list);
+
+static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk,
+					   struct mptcp_sched_data *data)
+{
+	struct sock *ssk;
+
+	ssk = data->reinject ? mptcp_subflow_get_retrans(msk) :
+			       mptcp_subflow_get_send(msk);
+	if (!ssk)
+		return -EINVAL;
+
+	mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
+	return 0;
+}
+
+static struct mptcp_sched_ops mptcp_sched_default = {
+	.get_subflow	= mptcp_sched_default_get_subflow,
+	.name		= "default",
+	.owner		= THIS_MODULE,
+};
+
+/* Must be called with rcu read lock held */
+struct mptcp_sched_ops *mptcp_sched_find(const char *name)
+{
+	struct mptcp_sched_ops *sched, *ret = NULL;
+
+	list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
+		if (!strcmp(sched->name, name)) {
+			ret = sched;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
+{
+	if (!sched->get_subflow)
+		return -EINVAL;
+
+	spin_lock(&mptcp_sched_list_lock);
+	if (mptcp_sched_find(sched->name)) {
+		spin_unlock(&mptcp_sched_list_lock);
+		return -EEXIST;
+	}
+	list_add_tail_rcu(&sched->list, &mptcp_sched_list);
+	spin_unlock(&mptcp_sched_list_lock);
+
+	pr_debug("%s registered", sched->name);
+	return 0;
+}
+
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
+{
+	if (sched == &mptcp_sched_default)
+		return;
+
+	spin_lock(&mptcp_sched_list_lock);
+	list_del_rcu(&sched->list);
+	spin_unlock(&mptcp_sched_list_lock);
+}
+
+void mptcp_sched_init(void)
+{
+	mptcp_register_scheduler(&mptcp_sched_default);
+}
+
+int mptcp_init_sched(struct mptcp_sock *msk,
+		     struct mptcp_sched_ops *sched)
+{
+	if (!sched)
+		sched = &mptcp_sched_default;
+
+	if (!bpf_try_module_get(sched, sched->owner))
+		return -EBUSY;
+
+	msk->sched = sched;
+	if (msk->sched->init)
+		msk->sched->init(msk);
+
+	pr_debug("sched=%s", msk->sched->name);
+
+	return 0;
+}
+
+void mptcp_release_sched(struct mptcp_sock *msk)
+{
+	struct mptcp_sched_ops *sched = msk->sched;
+
+	if (!sched)
+		return;
+
+	msk->sched = NULL;
+	if (sched->release)
+		sched->release(msk);
+
+	bpf_module_put(sched, sched->owner);
+}
+
+void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
+				 bool scheduled)
+{
+	WRITE_ONCE(subflow->scheduled, scheduled);
+}
+
+int mptcp_sched_get_send(struct mptcp_sock *msk)
+{
+	struct mptcp_subflow_context *subflow;
+	struct mptcp_sched_data data;
+
+	msk_owned_by_me(msk);
+
+	/* the following check is moved out of mptcp_subflow_get_send */
+	if (__mptcp_check_fallback(msk)) {
+		if (msk->first &&
+		    __tcp_can_send(msk->first) &&
+		    sk_stream_memory_free(msk->first)) {
+			mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true);
+			return 0;
+		}
+		return -EINVAL;
+	}
+
+	mptcp_for_each_subflow(msk, subflow) {
+		if (READ_ONCE(subflow->scheduled))
+			return 0;
+	}
+
+	data.reinject = false;
+	if (msk->sched == &mptcp_sched_default || !msk->sched)
+		return mptcp_sched_default_get_subflow(msk, &data);
+	return msk->sched->get_subflow(msk, &data);
+}
+
+int mptcp_sched_get_retrans(struct mptcp_sock *msk)
+{
+	struct mptcp_subflow_context *subflow;
+	struct mptcp_sched_data data;
+
+	msk_owned_by_me(msk);
+
+	/* the following check is moved out of mptcp_subflow_get_retrans */
+	if (__mptcp_check_fallback(msk))
+		return -EINVAL;
+
+	mptcp_for_each_subflow(msk, subflow) {
+		if (READ_ONCE(subflow->scheduled))
+			return 0;
+	}
+
+	data.reinject = true;
+	if (msk->sched == &mptcp_sched_default || !msk->sched)
+		return mptcp_sched_default_get_subflow(msk, &data);
+	return msk->sched->get_subflow(msk, &data);
+}
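
A note for readers experimenting with the new hooks: the sketch below shows how an out-of-tree scheduler could plug into the mptcp_sched_ops interface added above. The "first" policy (always pick the first subflow that can send), the function names and the module boilerplate are illustrative assumptions, not part of this series; only mptcp_register_scheduler(), mptcp_unregister_scheduler(), mptcp_subflow_set_scheduled() and the get_subflow contract come from the code above.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical example, not part of this series: a trivial MPTCP
 * scheduler that marks the first sendable subflow as scheduled.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include "protocol.h"

static int mptcp_sched_first_get_subflow(struct mptcp_sock *msk,
					 struct mptcp_sched_data *data)
{
	struct mptcp_subflow_context *subflow;

	/* same contract as mptcp_sched_default_get_subflow(): flag the
	 * chosen subflow and return 0, or an error if none is usable
	 */
	mptcp_for_each_subflow(msk, subflow) {
		if (!__tcp_can_send(mptcp_subflow_tcp_sock(subflow)))
			continue;

		mptcp_subflow_set_scheduled(subflow, true);
		return 0;
	}

	return -EINVAL;
}

static struct mptcp_sched_ops mptcp_sched_first = {
	.get_subflow	= mptcp_sched_first_get_subflow,
	.name		= "first",
	.owner		= THIS_MODULE,
};

static int __init mptcp_sched_first_register(void)
{
	return mptcp_register_scheduler(&mptcp_sched_first);
}

static void __exit mptcp_sched_first_unregister(void)
{
	mptcp_unregister_scheduler(&mptcp_sched_first);
}

module_init(mptcp_sched_first_register);
module_exit(mptcp_sched_first_unregister);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Illustrative MPTCP scheduler, not part of this series");

Once registered, such a scheduler would be selected by name: mptcp_init_sock() above resolves the per-netns scheduler string via mptcp_sched_find(mptcp_get_scheduler(net)), so a connection created in a namespace whose scheduler knob is set to "first" would pick it up.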