-rw-r--r--  Documentation/networking/driver.rst           | 119
-rw-r--r--  drivers/net/ethernet/broadcom/bnxt/bnxt.c     |  42
-rw-r--r--  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  42
-rw-r--r--  include/linux/netdevice.h                     |   3
-rw-r--r--  include/net/netdev_queues.h                   | 163
5 files changed, 262 insertions, 107 deletions
diff --git a/Documentation/networking/driver.rst b/Documentation/networking/driver.rst
index 64f7236ff10b..4071f2c00f8b 100644
--- a/Documentation/networking/driver.rst
+++ b/Documentation/networking/driver.rst
@@ -4,15 +4,48 @@ Softnet Driver Issues
 =====================
 
-Transmit path guidelines:
+Probing guidelines
+==================
 
-1) The ndo_start_xmit method must not return NETDEV_TX_BUSY under
-   any normal circumstances. It is considered a hard error unless
-   there is no way your device can tell ahead of time when its
-   transmit function will become busy.
+Address validation
+------------------
 
-   Instead it must maintain the queue properly. For example,
-   for a driver implementing scatter-gather this means::
+Any hardware layer address you obtain for your device should
+be verified. For example, for ethernet check it with
+linux/etherdevice.h:is_valid_ether_addr()
+
+Close/stop guidelines
+=====================
+
+Quiescence
+----------
+
+After the ndo_stop routine has been called, the hardware must
+not receive or transmit any data. All in flight packets must
+be aborted. If necessary, poll or wait for completion of
+any reset commands.
+
+Auto-close
+----------
+
+The ndo_stop routine will be called by unregister_netdevice
+if device is still UP.
+
+Transmit path guidelines
+========================
+
+Stop queues in advance
+----------------------
+
+The ndo_start_xmit method must not return NETDEV_TX_BUSY under
+any normal circumstances. It is considered a hard error unless
+there is no way your device can tell ahead of time when its
+transmit function will become busy.
+
+Instead it must maintain the queue properly. For example,
+for a driver implementing scatter-gather this means:
+
+.. code-block:: c
 
     static netdev_tx_t drv_hard_start_xmit(struct sk_buff *skb,
                                            struct net_device *dev)
@@ -20,7 +53,7 @@ Transmit path guidelines:
         struct drv *dp = netdev_priv(dev);
 
         lock_tx(dp);
-        ...
+        //...
         /* This is a hard error log it. */
         if (TX_BUFFS_AVAIL(dp) <= (skb_shinfo(skb)->nr_frags + 1)) {
                 netif_stop_queue(dev);
@@ -30,68 +63,72 @@ Transmit path guidelines:
                 return NETDEV_TX_BUSY;
         }
 
-        ... queue packet to card ...
-        ... update tx consumer index ...
+        //... queue packet to card ...
+        //... update tx consumer index ...
 
         if (TX_BUFFS_AVAIL(dp) <= (MAX_SKB_FRAGS + 1))
                 netif_stop_queue(dev);
 
-        ...
+        //...
         unlock_tx(dp);
-        ...
+        //...
         return NETDEV_TX_OK;
     }
 
-   And then at the end of your TX reclamation event handling::
+And then at the end of your TX reclamation event handling:
+
+.. code-block:: c
 
         if (netif_queue_stopped(dp->dev) &&
             TX_BUFFS_AVAIL(dp) > (MAX_SKB_FRAGS + 1))
                 netif_wake_queue(dp->dev);
 
-   For a non-scatter-gather supporting card, the three tests simply become::
+For a non-scatter-gather supporting card, the three tests simply become:
+
+.. code-block:: c
 
         /* This is a hard error log it. */
         if (TX_BUFFS_AVAIL(dp) <= 0)
 
-   and::
+and:
+
+.. code-block:: c
 
         if (TX_BUFFS_AVAIL(dp) == 0)
 
-   and::
+and:
+
+.. code-block:: c
 
         if (netif_queue_stopped(dp->dev) &&
             TX_BUFFS_AVAIL(dp) > 0)
                 netif_wake_queue(dp->dev);
 
-2) An ndo_start_xmit method must not modify the shared parts of a
-   cloned SKB.
-
-3) Do not forget that once you return NETDEV_TX_OK from your
-   ndo_start_xmit method, it is your driver's responsibility to free
-   up the SKB and in some finite amount of time.
+Lockless queue stop / wake helper macros
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-   For example, this means that it is not allowed for your TX
-   mitigation scheme to let TX packets "hang out" in the TX
-   ring unreclaimed forever if no new TX packets are sent.
-   This error can deadlock sockets waiting for send buffer room
-   to be freed up.
+.. kernel-doc:: include/net/netdev_queues.h
+   :doc: Lockless queue stopping / waking helpers.
 
-   If you return NETDEV_TX_BUSY from the ndo_start_xmit method, you
-   must not keep any reference to that SKB and you must not attempt
-   to free it up.
+No exclusive ownership
+----------------------
 
-Probing guidelines:
+An ndo_start_xmit method must not modify the shared parts of a
+cloned SKB.
 
-1) Any hardware layer address you obtain for your device should
-   be verified. For example, for ethernet check it with
-   linux/etherdevice.h:is_valid_ether_addr()
+Timely completions
+------------------
 
-Close/stop guidelines:
+Do not forget that once you return NETDEV_TX_OK from your
+ndo_start_xmit method, it is your driver's responsibility to free
+up the SKB and in some finite amount of time.
 
-1) After the ndo_stop routine has been called, the hardware must
-   not receive or transmit any data. All in flight packets must
-   be aborted. If necessary, poll or wait for completion of
-   any reset commands.
+For example, this means that it is not allowed for your TX
+mitigation scheme to let TX packets "hang out" in the TX
+ring unreclaimed forever if no new TX packets are sent.
+This error can deadlock sockets waiting for send buffer room
+to be freed up.
 
-2) The ndo_stop routine will be called by unregister_netdevice
-   if device is still UP.
+If you return NETDEV_TX_BUSY from the ndo_start_xmit method, you
+must not keep any reference to that SKB and you must not attempt
+to free it up.
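To make the guidelines above concrete, here is a minimal sketch (not part of
this patch) of how the three open-coded tests map onto the new helpers for the
hypothetical single-queue driver used in driver.rst; struct drv, dp->dev,
lock_tx()/unlock_tx() and TX_BUFFS_AVAIL() are the documentation's placeholder
names, and the thresholds are illustrative only.

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/netdev_queues.h>

static netdev_tx_t drv_hard_start_xmit(struct sk_buff *skb,
                                       struct net_device *dev)
{
        struct drv *dp = netdev_priv(dev);
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        lock_tx(dp);

        /* Hard error: the ring must never be full while the queue is awake. */
        if (unlikely(TX_BUFFS_AVAIL(dp) <= skb_shinfo(skb)->nr_frags + 1)) {
                netif_tx_stop_queue(txq);
                unlock_tx(dp);
                netdev_err(dev, "BUG! Tx ring full when queue awake!\n");
                return NETDEV_TX_BUSY;
        }

        /* ... queue packet to card, update tx producer index ... */

        /* Stop if another worst-case packet may not fit; the helper
         * re-checks after a barrier so a racing completion can undo it.
         */
        netif_txq_maybe_stop(txq, TX_BUFFS_AVAIL(dp),
                             MAX_SKB_FRAGS + 1, MAX_SKB_FRAGS + 1);

        unlock_tx(dp);
        return NETDEV_TX_OK;
}

/* TX reclamation, run from NAPI poll. */
static void drv_tx_reclaim(struct drv *dp, unsigned int pkts,
                           unsigned int bytes)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dp->dev, 0);

        /* ... free completed buffers, update tx consumer index ... */

        /* Reports pkts/bytes to BQL and wakes the queue if enough
         * descriptors are free again.
         */
        netif_txq_completed_wake(txq, pkts, bytes,
                                 TX_BUFFS_AVAIL(dp), MAX_SKB_FRAGS + 1);
}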
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 8ff5a4f98d6f..f7602d8d79e3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -56,6 +56,7 @@
 #include <linux/hwmon-sysfs.h>
 #include <net/page_pool.h>
 #include <linux/align.h>
+#include <net/netdev_queues.h>
 
 #include "bnxt_hsi.h"
 #include "bnxt.h"
@@ -331,26 +332,6 @@ static void bnxt_txr_db_kick(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
         txr->kick_pending = 0;
 }
 
-static bool bnxt_txr_netif_try_stop_queue(struct bnxt *bp,
-                                          struct bnxt_tx_ring_info *txr,
-                                          struct netdev_queue *txq)
-{
-        netif_tx_stop_queue(txq);
-
-        /* netif_tx_stop_queue() must be done before checking
-         * tx index in bnxt_tx_avail() below, because in
-         * bnxt_tx_int(), we update tx index before checking for
-         * netif_tx_queue_stopped().
-         */
-        smp_mb();
-        if (bnxt_tx_avail(bp, txr) >= bp->tx_wake_thresh) {
-                netif_tx_wake_queue(txq);
-                return false;
-        }
-
-        return true;
-}
-
 static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
         struct bnxt *bp = netdev_priv(dev);
@@ -384,7 +365,8 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev)
                 if (net_ratelimit() && txr->kick_pending)
                         netif_warn(bp, tx_err, dev,
                                    "bnxt: ring busy w/ flush pending!\n");
-                if (bnxt_txr_netif_try_stop_queue(bp, txr, txq))
+                if (!netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
+                                        bp->tx_wake_thresh))
                         return NETDEV_TX_BUSY;
         }
 
@@ -614,7 +596,8 @@ tx_done:
                 if (netdev_xmit_more() && !tx_buf->is_push)
                         bnxt_txr_db_kick(bp, txr, prod);
 
-                bnxt_txr_netif_try_stop_queue(bp, txr, txq);
+                netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
+                                   bp->tx_wake_thresh);
         }
         return NETDEV_TX_OK;
@@ -705,20 +688,11 @@ next_tx_int:
                 dev_kfree_skb_any(skb);
         }
 
-        netdev_tx_completed_queue(txq, nr_pkts, tx_bytes);
         txr->tx_cons = cons;
 
-        /* Need to make the tx_cons update visible to bnxt_start_xmit()
-         * before checking for netif_tx_queue_stopped().  Without the
-         * memory barrier, there is a small possibility that bnxt_start_xmit()
-         * will miss it and cause the queue to be stopped forever.
-         */
-        smp_mb();
-
-        if (unlikely(netif_tx_queue_stopped(txq)) &&
-            bnxt_tx_avail(bp, txr) >= bp->tx_wake_thresh &&
-            READ_ONCE(txr->dev_state) != BNXT_DEV_STATE_CLOSING)
-                netif_tx_wake_queue(txq);
+        __netif_txq_completed_wake(txq, nr_pkts, tx_bytes,
+                                   bnxt_tx_avail(bp, txr), bp->tx_wake_thresh,
+                                   READ_ONCE(txr->dev_state) == BNXT_DEV_STATE_CLOSING);
 }
 
 static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
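Note that the last argument of __netif_txq_completed_wake() is a "do not wake"
predicate (down_cond), which is why bnxt passes the CLOSING check in its
positive form. A rough hand-expansion of the call above, for illustration only;
the function name is made up, the real helper reports completions through
netdev_txq_completed_mb() (which falls back to a bare smp_mb() without
CONFIG_BQL) and returns a tri-state result, while this sketch calls
netdev_tx_completed_queue() directly and drops the return value.

static void bnxt_tx_wake_expanded(struct bnxt *bp,
                                  struct bnxt_tx_ring_info *txr,
                                  struct netdev_queue *txq,
                                  unsigned int nr_pkts, unsigned int tx_bytes)
{
        /* BQL update; with CONFIG_BQL this implies the smp_mb() that the
         * old open-coded path issued by hand.
         */
        netdev_tx_completed_queue(txq, nr_pkts, tx_bytes);

        if (nr_pkts && likely(bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh) &&
            unlikely(netif_tx_queue_stopped(txq)) &&
            READ_ONCE(txr->dev_state) != BNXT_DEV_STATE_CLOSING)
                netif_tx_wake_queue(txq);
}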
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 773c35fecace..f2604fc05991 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -36,6 +36,7 @@
 #include <net/tc_act/tc_mirred.h>
 #include <net/vxlan.h>
 #include <net/mpls.h>
+#include <net/netdev_queues.h>
 #include <net/xdp_sock_drv.h>
 #include <net/xfrm.h>
 
@@ -1119,6 +1120,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
         unsigned int total_bytes = 0, total_packets = 0, total_ipsec = 0;
         unsigned int budget = q_vector->tx.work_limit;
         unsigned int i = tx_ring->next_to_clean;
+        struct netdev_queue *txq;
 
         if (test_bit(__IXGBE_DOWN, &adapter->state))
                 return true;
@@ -1249,24 +1251,14 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
         if (ring_is_xdp(tx_ring))
                 return !!budget;
 
-        netdev_tx_completed_queue(txring_txq(tx_ring),
-                                  total_packets, total_bytes);
-
 #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
-        if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
-                     (ixgbe_desc_unused(tx_ring) >= TX_WAKE_THRESHOLD))) {
-                /* Make sure that anybody stopping the queue after this
-                 * sees the new next_to_clean.
-                 */
-                smp_mb();
-                if (__netif_subqueue_stopped(tx_ring->netdev,
-                                             tx_ring->queue_index)
-                    && !test_bit(__IXGBE_DOWN, &adapter->state)) {
-                        netif_wake_subqueue(tx_ring->netdev,
-                                            tx_ring->queue_index);
-                        ++tx_ring->tx_stats.restart_queue;
-                }
-        }
+        txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index);
+        if (!__netif_txq_completed_wake(txq, total_packets, total_bytes,
+                                        ixgbe_desc_unused(tx_ring),
+                                        TX_WAKE_THRESHOLD,
+                                        !netif_carrier_ok(tx_ring->netdev) ||
+                                        test_bit(__IXGBE_DOWN, &adapter->state)))
+                ++tx_ring->tx_stats.restart_queue;
 
         return !!budget;
 }
@@ -8270,22 +8262,10 @@ static void ixgbe_tx_olinfo_status(union ixgbe_adv_tx_desc *tx_desc,
 
 static int __ixgbe_maybe_stop_tx(struct ixgbe_ring *tx_ring, u16 size)
 {
-        netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
-
-        /* Herbert's original patch had:
-         *  smp_mb__after_netif_stop_queue();
-         * but since that doesn't exist yet, just open code it.
-         */
-        smp_mb();
-
-        /* We need to check again in a case another CPU has just
-         * made room available.
-         */
-        if (likely(ixgbe_desc_unused(tx_ring) < size))
+        if (!netif_subqueue_try_stop(tx_ring->netdev, tx_ring->queue_index,
+                                     ixgbe_desc_unused(tx_ring), size))
                 return -EBUSY;
 
-        /* A reprieve! - use start_queue because it doesn't call schedule */
-        netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
         ++tx_ring->tx_stats.restart_queue;
         return 0;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1c25b39681b3..fe355592dfde 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3335,6 +3335,7 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev)
 
 static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
 {
+        /* Must be an atomic op see netif_txq_try_stop() */
         set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
 }
 
@@ -3531,7 +3532,7 @@ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
          * netdev_tx_sent_queue will miss the update and cause the queue to
          * be stopped forever
          */
-        smp_mb();
+        smp_mb(); /* NOTE: netdev_txq_completed_mb() assumes this exists */
 
         if (unlikely(dql_avail(&dev_queue->dql) < 0))
                 return;
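The two comments added to netdevice.h document the ordering contract the new
helpers depend on: the stop side sets the DRV_XOFF bit with an atomic op and
orders it with smp_mb__after_atomic(), while the wake side relies on the
smp_mb() implied by BQL completion (or issued explicitly by
netdev_txq_completed_mb() from the new header). A compressed sketch of that
pairing, for illustration only; struct drv_ring and free_descs are invented
names, free_descs standing in for whatever get_desc evaluates, and the
completion path is assumed to update it before calling in.

#include <linux/netdevice.h>
#include <net/netdev_queues.h>

struct drv_ring {
        struct netdev_queue *txq;
        unsigned int free_descs;        /* written by the completion path */
};

/* xmit-handler side: roughly what netif_txq_try_stop() does. */
static int drv_try_stop(struct drv_ring *r, unsigned int start_thrs)
{
        netif_tx_stop_queue(r->txq);    /* atomic set_bit(DRV_XOFF) */
        smp_mb__after_atomic();         /* stop bit visible before re-check */

        if (READ_ONCE(r->free_descs) >= start_thrs) {
                netif_tx_start_queue(r->txq);   /* raced with a completion */
                return -1;
        }
        return 0;
}

/* Completion side: roughly what __netif_txq_completed_wake() does. */
static void drv_completed_wake(struct drv_ring *r, unsigned int pkts,
                               unsigned int bytes, unsigned int start_thrs)
{
        /* BQL accounting plus the barrier that pairs with the one above. */
        netdev_txq_completed_mb(r->txq, pkts, bytes);

        if (pkts && READ_ONCE(r->free_descs) > start_thrs &&
            netif_tx_queue_stopped(r->txq))
                netif_tx_wake_queue(r->txq);
}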
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
new file mode 100644
index 000000000000..b26fdb441e39
--- /dev/null
+++ b/include/net/netdev_queues.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NET_QUEUES_H
+#define _LINUX_NET_QUEUES_H
+
+#include <linux/netdevice.h>
+
+/**
+ * DOC: Lockless queue stopping / waking helpers.
+ *
+ * The netif_txq_maybe_stop() and __netif_txq_completed_wake()
+ * macros are designed to safely implement stopping
+ * and waking netdev queues without full lock protection.
+ *
+ * We assume that there can be no concurrent stop attempts and no concurrent
+ * wake attempts. The try-stop should happen from the xmit handler,
+ * while wake up should be triggered from NAPI poll context.
+ * The two may run concurrently (single producer, single consumer).
+ *
+ * The try-stop side is expected to run from the xmit handler and therefore
+ * it does not reschedule Tx (netif_tx_start_queue() instead of
+ * netif_tx_wake_queue()). Uses of the ``stop`` macros outside of the xmit
+ * handler may lead to xmit queue being enabled but not run.
+ * The waking side does not have similar context restrictions.
+ *
+ * The macros guarantee that rings will not remain stopped if there's
+ * space available, but they do *not* prevent false wake ups when
+ * the ring is full! Drivers should check for ring full at the start
+ * of the xmit handler.
+ *
+ * All descriptor ring indexes (and other relevant shared state) must
+ * be updated before invoking the macros.
+ */
+
+#define netif_txq_try_stop(txq, get_desc, start_thrs)                  \
+       ({                                                              \
+               int _res;                                               \
+                                                                       \
+               netif_tx_stop_queue(txq);                               \
+               /* Producer index and stop bit must be visible          \
+                * to consumer before we recheck.                       \
+                * Pairs with a barrier in __netif_txq_completed_wake(). \
+                */                                                     \
+               smp_mb__after_atomic();                                 \
+                                                                       \
+               /* We need to check again in a case another             \
+                * CPU has just made room available.                    \
+                */                                                     \
+               _res = 0;                                               \
+               if (unlikely(get_desc >= start_thrs)) {                 \
+                       netif_tx_start_queue(txq);                      \
+                       _res = -1;                                      \
+               }                                                       \
+               _res;                                                   \
+       })                                                              \
+
+/**
+ * netif_txq_maybe_stop() - locklessly stop a Tx queue, if needed
+ * @txq:        struct netdev_queue to stop/start
+ * @get_desc:   get current number of free descriptors (see requirements below!)
+ * @stop_thrs:  minimal number of available descriptors for queue to be left
+ *              enabled
+ * @start_thrs: minimal number of descriptors to re-enable the queue, can be
+ *              equal to @stop_thrs or higher to avoid frequent waking
+ *
+ * All arguments may be evaluated multiple times, beware of side effects.
+ * @get_desc must be a formula or a function call, it must always
+ * return up-to-date information when evaluated!
+ * Expected to be used from ndo_start_xmit, see the comment on top of the file.
+ *
+ * Returns:
+ *       0 if the queue was stopped
+ *       1 if the queue was left enabled
+ *      -1 if the queue was re-enabled (raced with waking)
+ */
+#define netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs)     \
+       ({                                                              \
+               int _res;                                               \
+                                                                       \
+               _res = 1;                                               \
+               if (unlikely(get_desc < stop_thrs))                     \
+                       _res = netif_txq_try_stop(txq, get_desc, start_thrs); \
+               _res;                                                   \
+       })                                                              \
+
+/* Variant of netdev_tx_completed_queue() which guarantees smp_mb() if
+ * @bytes != 0, regardless of kernel config.
+ */
+static inline void
+netdev_txq_completed_mb(struct netdev_queue *dev_queue,
+                        unsigned int pkts, unsigned int bytes)
+{
+       if (IS_ENABLED(CONFIG_BQL))
+               netdev_tx_completed_queue(dev_queue, pkts, bytes);
+       else if (bytes)
+               smp_mb();
+}
+
+/**
+ * __netif_txq_completed_wake() - locklessly wake a Tx queue, if needed
+ * @txq:        struct netdev_queue to stop/start
+ * @pkts:       number of packets completed
+ * @bytes:      number of bytes completed
+ * @get_desc:   get current number of free descriptors (see requirements below!)
+ * @start_thrs: minimal number of descriptors to re-enable the queue
+ * @down_cond:  down condition, predicate indicating that the queue should
+ *              not be woken up even if descriptors are available
+ *
+ * All arguments may be evaluated multiple times.
+ * @get_desc must be a formula or a function call, it must always
+ * return up-to-date information when evaluated!
+ * Reports completed pkts/bytes to BQL.
+ *
+ * Returns:
+ *       0 if the queue was woken up
+ *       1 if the queue was already enabled (or disabled but @down_cond is true)
+ *      -1 if the queue was left unchanged (@start_thrs not reached)
+ */
+#define __netif_txq_completed_wake(txq, pkts, bytes,                   \
+                                   get_desc, start_thrs, down_cond)    \
+       ({                                                              \
+               int _res;                                               \
+                                                                       \
+               /* Report to BQL and piggy back on its barrier.         \
+                * Barrier makes sure that anybody stopping the queue   \
+                * after this point sees the new consumer index.        \
+                * Pairs with barrier in netif_txq_try_stop().          \
+                */                                                     \
+               netdev_txq_completed_mb(txq, pkts, bytes);              \
+                                                                       \
+               _res = -1;                                              \
+               if (pkts && likely(get_desc > start_thrs)) {            \
+                       _res = 1;                                       \
+                       if (unlikely(netif_tx_queue_stopped(txq)) &&    \
+                           !(down_cond)) {                             \
+                               netif_tx_wake_queue(txq);               \
+                               _res = 0;                               \
+                       }                                               \
+               }                                                       \
+               _res;                                                   \
+       })
+
+#define netif_txq_completed_wake(txq, pkts, bytes, get_desc, start_thrs) \
+       __netif_txq_completed_wake(txq, pkts, bytes, get_desc, start_thrs, false)
+
+/* subqueue variants follow */
+
+#define netif_subqueue_try_stop(dev, idx, get_desc, start_thrs)        \
+       ({                                                              \
+               struct netdev_queue *txq;                               \
+                                                                       \
+               txq = netdev_get_tx_queue(dev, idx);                    \
+               netif_txq_try_stop(txq, get_desc, start_thrs);          \
+       })
+
+#define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \
+       ({                                                              \
+               struct netdev_queue *txq;                               \
+                                                                       \
+               txq = netdev_get_tx_queue(dev, idx);                    \
+               netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \
+       })
+
+#endif
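For multi-queue drivers that track rings by index, the subqueue variants bundle
the netdev_get_tx_queue() lookup. A minimal sketch (not from this patch) of an
xmit handler using them; my_ring, my_get_ring(), my_post_skb() and the
thresholds are invented for the example.

#include <linux/netdevice.h>
#include <net/netdev_queues.h>

/* Invented per-queue state for the example. */
struct my_ring {
        u16 queue_index;
        u16 free_descs;                 /* updated by the completion path */
};

static struct my_ring *my_get_ring(struct net_device *dev, struct sk_buff *skb);
static void my_post_skb(struct my_ring *ring, struct sk_buff *skb);

static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct my_ring *ring = my_get_ring(dev, skb);

        /* Per the DOC comment above: check for ring full at the start. */
        if (unlikely(READ_ONCE(ring->free_descs) <= MAX_SKB_FRAGS + 2)) {
                netif_stop_subqueue(dev, ring->queue_index);
                return NETDEV_TX_BUSY;
        }

        my_post_skb(ring, skb);         /* consumes descriptors */

        /* Stop this subqueue if another worst-case packet may not fit;
         * the helper re-enables it if a completion raced with us.
         */
        netif_subqueue_maybe_stop(dev, ring->queue_index,
                                  READ_ONCE(ring->free_descs),
                                  MAX_SKB_FRAGS + 2, MAX_SKB_FRAGS + 2);

        return NETDEV_TX_OK;
}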