Diffstat (limited to 'drivers/net/ethernet/google')
-rw-r--r--  drivers/net/ethernet/google/Kconfig            |    2
-rw-r--r--  drivers/net/ethernet/google/gve/Makefile       |    2
-rw-r--r--  drivers/net/ethernet/google/gve/gve.h          |  332
-rw-r--r--  drivers/net/ethernet/google/gve/gve_adminq.c   |  334
-rw-r--r--  drivers/net/ethernet/google/gve/gve_adminq.h   |  112
-rw-r--r--  drivers/net/ethernet/google/gve/gve_desc_dqo.h |  256
-rw-r--r--  drivers/net/ethernet/google/gve/gve_dqo.h      |   81
-rw-r--r--  drivers/net/ethernet/google/gve/gve_ethtool.c  |   21
-rw-r--r--  drivers/net/ethernet/google/gve/gve_main.c     |  335
-rw-r--r--  drivers/net/ethernet/google/gve/gve_rx.c       |   54
-rw-r--r--  drivers/net/ethernet/google/gve/gve_rx_dqo.c   |  756
-rw-r--r--  drivers/net/ethernet/google/gve/gve_tx.c       |   35
-rw-r--r--  drivers/net/ethernet/google/gve/gve_tx_dqo.c   | 1030
-rw-r--r--  drivers/net/ethernet/google/gve/gve_utils.c    |   81
-rw-r--r--  drivers/net/ethernet/google/gve/gve_utils.h    |   28
15 files changed, 3223 insertions(+), 236 deletions(-)
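The bulk of the patch below introduces the DQO descriptor formats and datapath. One idea that recurs throughout the new structures (gve_rx_compl_queue_dqo, gve_tx_compl_desc, dqo_compl.cur_gen_bit) is the completion-ring "generation bit": hardware flips a bit in every descriptor it writes, and software treats a descriptor as ready only when that bit differs from the generation it currently expects, flipping its expectation each time it wraps the ring. A minimal standalone sketch of that polling loop, with hypothetical names and none of the driver's barriers or locking (not code from this patch), might look like this:

/* Illustrative sketch only -- simplified, hypothetical names.  It mirrors
 * the rule stated in the gve_rx_compl_queue_dqo comments: a descriptor is
 * ready once its generation bit differs from the generation SW currently
 * expects, and SW flips its expected value each time it wraps the ring.
 */
#include <stdint.h>

struct compl_desc {
	uint16_t data;			/* payload, e.g. a completion tag */
	uint16_t generation;		/* 0 or 1, written by HW */
};

struct compl_ring {
	struct compl_desc *desc;	/* ring of mask + 1 descriptors */
	uint32_t head;			/* next index SW will read */
	uint32_t mask;			/* ring size - 1 (power of two) */
	uint8_t cur_gen_bit;		/* generation SW expects this pass */
};

/* Consume up to @budget descriptors that HW has published. */
static int poll_completions(struct compl_ring *ring, int budget)
{
	int work_done = 0;

	while (work_done < budget) {
		struct compl_desc *d = &ring->desc[ring->head];

		/* Not yet written for this pass over the ring. */
		if (d->generation == ring->cur_gen_bit)
			break;

		/* A real driver would issue a read barrier (dma_rmb())
		 * here before using the rest of the descriptor.
		 */

		/* ...handle d->data (e.g. free a pending packet)... */
		work_done++;

		ring->head = (ring->head + 1) & ring->mask;
		if (ring->head == 0)	/* wrapped: expect the other bit */
			ring->cur_gen_bit ^= 1;
	}
	return work_done;
}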
diff --git a/drivers/net/ethernet/google/Kconfig b/drivers/net/ethernet/google/Kconfig index b8f04d052fda..8641a00f8e63 100644 --- a/drivers/net/ethernet/google/Kconfig +++ b/drivers/net/ethernet/google/Kconfig @@ -17,7 +17,7 @@ if NET_VENDOR_GOOGLE config GVE tristate "Google Virtual NIC (gVNIC) support" - depends on PCI_MSI + depends on (PCI_MSI && (X86 || CPU_LITTLE_ENDIAN)) help This driver supports Google Virtual NIC (gVNIC)" diff --git a/drivers/net/ethernet/google/gve/Makefile b/drivers/net/ethernet/google/gve/Makefile index 3354ce40eb97..b9a6be76531b 100644 --- a/drivers/net/ethernet/google/gve/Makefile +++ b/drivers/net/ethernet/google/gve/Makefile @@ -1,4 +1,4 @@ # Makefile for the Google virtual Ethernet (gve) driver obj-$(CONFIG_GVE) += gve.o -gve-objs := gve_main.o gve_tx.o gve_rx.o gve_ethtool.o gve_adminq.o +gve-objs := gve_main.o gve_tx.o gve_tx_dqo.o gve_rx.o gve_rx_dqo.o gve_ethtool.o gve_adminq.o gve_utils.o diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index daf07c0f790b..1d3188e8e3b3 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (GPL-2.0 OR MIT) * Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #ifndef _GVE_H_ @@ -11,7 +11,9 @@ #include <linux/netdevice.h> #include <linux/pci.h> #include <linux/u64_stats_sync.h> + #include "gve_desc.h" +#include "gve_desc_dqo.h" #ifndef PCI_VENDOR_ID_GOOGLE #define PCI_VENDOR_ID_GOOGLE 0x1ae0 @@ -40,6 +42,11 @@ #define GVE_DATA_SLOT_ADDR_PAGE_MASK (~(PAGE_SIZE - 1)) +/* PTYPEs are always 10 bits. */ +#define GVE_NUM_PTYPES 1024 + +#define GVE_RX_BUFFER_SIZE_DQO 2048 + /* Each slot in the desc ring has a 1:1 mapping to a slot in the data ring */ struct gve_rx_desc_queue { struct gve_rx_desc *desc_ring; /* the descriptor ring */ @@ -51,7 +58,8 @@ struct gve_rx_desc_queue { struct gve_rx_slot_page_info { struct page *page; void *page_address; - u8 page_offset; /* flipped to second half? */ + u32 page_offset; /* offset to write to in page */ + int pagecnt_bias; /* expected pagecnt if only the driver has a ref */ u8 can_flip; }; @@ -76,17 +84,117 @@ struct gve_rx_data_queue { struct gve_priv; -/* An RX ring that contains a power-of-two sized desc and data ring. */ +/* RX buffer queue for posting buffers to HW. + * Each RX (completion) queue has a corresponding buffer queue. + */ +struct gve_rx_buf_queue_dqo { + struct gve_rx_desc_dqo *desc_ring; + dma_addr_t bus; + u32 head; /* Pointer to start cleaning buffers at. */ + u32 tail; /* Last posted buffer index + 1 */ + u32 mask; /* Mask for indices to the size of the ring */ +}; + +/* RX completion queue to receive packets from HW. */ +struct gve_rx_compl_queue_dqo { + struct gve_rx_compl_desc_dqo *desc_ring; + dma_addr_t bus; + + /* Number of slots which did not have a buffer posted yet. We should not + * post more buffers than the queue size to avoid HW overrunning the + * queue. + */ + int num_free_slots; + + /* HW uses a "generation bit" to notify SW of new descriptors. When a + * descriptor's generation bit is different from the current generation, + * that descriptor is ready to be consumed by SW. + */ + u8 cur_gen_bit; + + /* Pointer into desc_ring where the next completion descriptor will be + * received. 
+ */ + u32 head; + u32 mask; /* Mask for indices to the size of the ring */ +}; + +/* Stores state for tracking buffers posted to HW */ +struct gve_rx_buf_state_dqo { + /* The page posted to HW. */ + struct gve_rx_slot_page_info page_info; + + /* The DMA address corresponding to `page_info`. */ + dma_addr_t addr; + + /* Last offset into the page when it only had a single reference, at + * which point every other offset is free to be reused. + */ + u32 last_single_ref_offset; + + /* Linked list index to next element in the list, or -1 if none */ + s16 next; +}; + +/* `head` and `tail` are indices into an array, or -1 if empty. */ +struct gve_index_list { + s16 head; + s16 tail; +}; + +/* Contains datapath state used to represent an RX queue. */ struct gve_rx_ring { struct gve_priv *gve; - struct gve_rx_desc_queue desc; - struct gve_rx_data_queue data; + union { + /* GQI fields */ + struct { + struct gve_rx_desc_queue desc; + struct gve_rx_data_queue data; + + /* threshold for posting new buffs and descs */ + u32 db_threshold; + }; + + /* DQO fields. */ + struct { + struct gve_rx_buf_queue_dqo bufq; + struct gve_rx_compl_queue_dqo complq; + + struct gve_rx_buf_state_dqo *buf_states; + u16 num_buf_states; + + /* Linked list of gve_rx_buf_state_dqo. Index into + * buf_states, or -1 if empty. + */ + s16 free_buf_states; + + /* Linked list of gve_rx_buf_state_dqo. Indexes into + * buf_states, or -1 if empty. + * + * This list contains buf_states which are pointing to + * valid buffers. + * + * We use a FIFO here in order to increase the + * probability that buffers can be reused by increasing + * the time between usages. + */ + struct gve_index_list recycled_buf_states; + + /* Linked list of gve_rx_buf_state_dqo. Indexes into + * buf_states, or -1 if empty. + * + * This list contains buf_states which have buffers + * which cannot be reused yet. + */ + struct gve_index_list used_buf_states; + } dqo; + }; + u64 rbytes; /* free-running bytes received */ u64 rpackets; /* free-running packets received */ u32 cnt; /* free-running total number of completed packets */ u32 fill_cnt; /* free-running total number of descs and buffs posted */ u32 mask; /* masks the cnt and fill_cnt to the size of the ring */ - u32 db_threshold; /* threshold for posting new buffs and descs */ u64 rx_copybreak_pkt; /* free-running count of copybreak packets */ u64 rx_copied_pkt; /* free-running total number of copied packets */ u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */ @@ -97,6 +205,10 @@ struct gve_rx_ring { struct gve_queue_resources *q_resources; /* head and tail pointer idx */ dma_addr_t q_resources_bus; /* dma address for the queue resources */ struct u64_stats_sync statss; /* sync stats for 32bit archs */ + + /* head and tail of skb chain for the current packet or NULL if none */ + struct sk_buff *skb_head; + struct sk_buff *skb_tail; }; /* A TX desc ring entry */ @@ -137,23 +249,161 @@ struct gve_tx_fifo { struct gve_queue_page_list *qpl; /* QPL mapped into this FIFO */ }; -/* A TX ring that contains a power-of-two sized desc ring and a FIFO buffer */ +/* TX descriptor for DQO format */ +union gve_tx_desc_dqo { + struct gve_tx_pkt_desc_dqo pkt; + struct gve_tx_tso_context_desc_dqo tso_ctx; + struct gve_tx_general_context_desc_dqo general_ctx; +}; + +enum gve_packet_state { + /* Packet is in free list, available to be allocated. + * This should always be zero since state is not explicitly initialized. 
+ */ + GVE_PACKET_STATE_UNALLOCATED, + /* Packet is expecting a regular data completion or miss completion */ + GVE_PACKET_STATE_PENDING_DATA_COMPL, + /* Packet has received a miss completion and is expecting a + * re-injection completion. + */ + GVE_PACKET_STATE_PENDING_REINJECT_COMPL, + /* No valid completion received within the specified timeout. */ + GVE_PACKET_STATE_TIMED_OUT_COMPL, +}; + +struct gve_tx_pending_packet_dqo { + struct sk_buff *skb; /* skb for this packet */ + + /* 0th element corresponds to the linear portion of `skb`, should be + * unmapped with `dma_unmap_single`. + * + * All others correspond to `skb`'s frags and should be unmapped with + * `dma_unmap_page`. + */ + struct gve_tx_dma_buf bufs[MAX_SKB_FRAGS + 1]; + u16 num_bufs; + + /* Linked list index to next element in the list, or -1 if none */ + s16 next; + + /* Linked list index to prev element in the list, or -1 if none. + * Used for tracking either outstanding miss completions or prematurely + * freed packets. + */ + s16 prev; + + /* Identifies the current state of the packet as defined in + * `enum gve_packet_state`. + */ + u8 state; + + /* If packet is an outstanding miss completion, then the packet is + * freed if the corresponding re-injection completion is not received + * before kernel jiffies exceeds timeout_jiffies. + */ + unsigned long timeout_jiffies; +}; + +/* Contains datapath state used to represent a TX queue. */ struct gve_tx_ring { /* Cacheline 0 -- Accessed & dirtied during transmit */ - struct gve_tx_fifo tx_fifo; - u32 req; /* driver tracked head pointer */ - u32 done; /* driver tracked tail pointer */ + union { + /* GQI fields */ + struct { + struct gve_tx_fifo tx_fifo; + u32 req; /* driver tracked head pointer */ + u32 done; /* driver tracked tail pointer */ + }; + + /* DQO fields. */ + struct { + /* Linked list of gve_tx_pending_packet_dqo. Index into + * pending_packets, or -1 if empty. + * + * This is a consumer list owned by the TX path. When it + * runs out, the producer list is stolen from the + * completion handling path + * (dqo_compl.free_pending_packets). + */ + s16 free_pending_packets; + + /* Cached value of `dqo_compl.hw_tx_head` */ + u32 head; + u32 tail; /* Last posted buffer index + 1 */ + + /* Index of the last descriptor with "report event" bit + * set. + */ + u32 last_re_idx; + } dqo_tx; + }; /* Cacheline 1 -- Accessed & dirtied during gve_clean_tx_done */ - __be32 last_nic_done ____cacheline_aligned; /* NIC tail pointer */ + union { + /* GQI fields */ + struct { + /* NIC tail pointer */ + __be32 last_nic_done; + }; + + /* DQO fields. */ + struct { + u32 head; /* Last read on compl_desc */ + + /* Tracks the current gen bit of compl_q */ + u8 cur_gen_bit; + + /* Linked list of gve_tx_pending_packet_dqo. Index into + * pending_packets, or -1 if empty. + * + * This is the producer list, owned by the completion + * handling path. When the consumer list + * (dqo_tx.free_pending_packets) is runs out, this list + * will be stolen. + */ + atomic_t free_pending_packets; + + /* Last TX ring index fetched by HW */ + atomic_t hw_tx_head; + + /* List to track pending packets which received a miss + * completion but not a corresponding reinjection. + */ + struct gve_index_list miss_completions; + + /* List to track pending packets that were completed + * before receiving a valid completion because they + * reached a specified timeout. 
+ */ + struct gve_index_list timed_out_completions; + } dqo_compl; + } ____cacheline_aligned; u64 pkt_done; /* free-running - total packets completed */ u64 bytes_done; /* free-running - total bytes completed */ u64 dropped_pkt; /* free-running - total packets dropped */ u64 dma_mapping_error; /* count of dma mapping errors */ /* Cacheline 2 -- Read-mostly fields */ - union gve_tx_desc *desc ____cacheline_aligned; - struct gve_tx_buffer_state *info; /* Maps 1:1 to a desc */ + union { + /* GQI fields */ + struct { + union gve_tx_desc *desc; + + /* Maps 1:1 to a desc */ + struct gve_tx_buffer_state *info; + }; + + /* DQO fields. */ + struct { + union gve_tx_desc_dqo *tx_ring; + struct gve_tx_compl_desc *compl_ring; + + struct gve_tx_pending_packet_dqo *pending_packets; + s16 num_pending_packets; + + u32 complq_mask; /* complq size is complq_mask + 1 */ + } dqo; + } ____cacheline_aligned; struct netdev_queue *netdev_txq; struct gve_queue_resources *q_resources; /* head and tail pointer idx */ struct device *dev; @@ -167,6 +417,7 @@ struct gve_tx_ring { u32 ntfy_id; /* notification block index */ dma_addr_t bus; /* dma address of the descr ring */ dma_addr_t q_resources_bus; /* dma address of the queue resources */ + dma_addr_t complq_bus_dqo; /* dma address of the dqo.compl_ring */ struct u64_stats_sync statss; /* sync stats for 32bit archs */ } ____cacheline_aligned; @@ -194,6 +445,31 @@ struct gve_qpl_config { unsigned long *qpl_id_map; /* bitmap of used qpl ids */ }; +struct gve_options_dqo_rda { + u16 tx_comp_ring_entries; /* number of tx_comp descriptors */ + u16 rx_buff_ring_entries; /* number of rx_buff descriptors */ +}; + +struct gve_ptype { + u8 l3_type; /* `gve_l3_type` in gve_adminq.h */ + u8 l4_type; /* `gve_l4_type` in gve_adminq.h */ +}; + +struct gve_ptype_lut { + struct gve_ptype ptypes[GVE_NUM_PTYPES]; +}; + +/* GVE_QUEUE_FORMAT_UNSPECIFIED must be zero since 0 is the default value + * when the entire configure_device_resources command is zeroed out and the + * queue_format is not specified. + */ +enum gve_queue_format { + GVE_QUEUE_FORMAT_UNSPECIFIED = 0x0, + GVE_GQI_RDA_FORMAT = 0x1, + GVE_GQI_QPL_FORMAT = 0x2, + GVE_DQO_RDA_FORMAT = 0x3, +}; + struct gve_priv { struct net_device *dev; struct gve_tx_ring *tx; /* array of tx_cfg.num_queues */ @@ -216,7 +492,6 @@ struct gve_priv { u64 num_registered_pages; /* num pages registered with NIC */ u32 rx_copybreak; /* copy packets smaller than this */ u16 default_num_queues; /* default num queues to set up */ - u8 raw_addressing; /* 1 if this dev supports raw addressing, 0 otherwise */ struct gve_queue_config tx_cfg; struct gve_queue_config rx_cfg; @@ -251,6 +526,7 @@ struct gve_priv { u32 adminq_set_driver_parameter_cnt; u32 adminq_report_stats_cnt; u32 adminq_report_link_speed_cnt; + u32 adminq_get_ptype_map_cnt; /* Global stats */ u32 interface_up_cnt; /* count of times interface turned up since last reset */ @@ -275,6 +551,14 @@ struct gve_priv { /* Gvnic device link speed from hypervisor. */ u64 link_speed; + + struct gve_options_dqo_rda options_dqo_rda; + struct gve_ptype_lut *ptype_lut_dqo; + + /* Must be a power of two. */ + int data_buffer_size_dqo; + + enum gve_queue_format queue_format; }; enum gve_service_task_flags_bit { @@ -454,14 +738,20 @@ static inline u32 gve_rx_idx_to_ntfy(struct gve_priv *priv, u32 queue_idx) */ static inline u32 gve_num_tx_qpls(struct gve_priv *priv) { - return priv->raw_addressing ? 
0 : priv->tx_cfg.num_queues; + if (priv->queue_format != GVE_GQI_QPL_FORMAT) + return 0; + + return priv->tx_cfg.num_queues; } /* Returns the number of rx queue page lists */ static inline u32 gve_num_rx_qpls(struct gve_priv *priv) { - return priv->raw_addressing ? 0 : priv->rx_cfg.num_queues; + if (priv->queue_format != GVE_GQI_QPL_FORMAT) + return 0; + + return priv->rx_cfg.num_queues; } /* Returns a pointer to the next available tx qpl in the list of qpls @@ -515,6 +805,12 @@ static inline enum dma_data_direction gve_qpl_dma_dir(struct gve_priv *priv, return DMA_FROM_DEVICE; } +static inline bool gve_is_gqi(struct gve_priv *priv) +{ + return priv->queue_format == GVE_GQI_RDA_FORMAT || + priv->queue_format == GVE_GQI_QPL_FORMAT; +} + /* buffers */ int gve_alloc_page(struct gve_priv *priv, struct device *dev, struct page **page, dma_addr_t *dma, @@ -525,14 +821,14 @@ void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma, netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev); bool gve_tx_poll(struct gve_notify_block *block, int budget); int gve_tx_alloc_rings(struct gve_priv *priv); -void gve_tx_free_rings(struct gve_priv *priv); +void gve_tx_free_rings_gqi(struct gve_priv *priv); __be32 gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx); /* rx handling */ void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx); bool gve_rx_poll(struct gve_notify_block *block, int budget); int gve_rx_alloc_rings(struct gve_priv *priv); -void gve_rx_free_rings(struct gve_priv *priv); +void gve_rx_free_rings_gqi(struct gve_priv *priv); bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget, netdev_features_t feat); /* Reset */ diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index 53864f200599..5bb56b454541 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #include <linux/etherdevice.h> @@ -18,6 +18,8 @@ "Expected: length=%d, feature_mask=%x.\n" \ "Actual: length=%d, feature_mask=%x.\n" +#define GVE_DEVICE_OPTION_TOO_BIG_FMT "Length of %s option larger than expected. Possible older version of guest driver.\n" + static struct gve_device_option *gve_get_next_option(struct gve_device_descriptor *descriptor, struct gve_device_option *option) @@ -33,28 +35,81 @@ struct gve_device_option *gve_get_next_option(struct gve_device_descriptor *desc static void gve_parse_device_option(struct gve_priv *priv, struct gve_device_descriptor *device_descriptor, - struct gve_device_option *option) + struct gve_device_option *option, + struct gve_device_option_gqi_rda **dev_op_gqi_rda, + struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_dqo_rda **dev_op_dqo_rda) { + u32 req_feat_mask = be32_to_cpu(option->required_features_mask); u16 option_length = be16_to_cpu(option->option_length); u16 option_id = be16_to_cpu(option->option_id); + /* If the length or feature mask doesn't match, continue without + * enabling the feature. + */ switch (option_id) { - case GVE_DEV_OPT_ID_RAW_ADDRESSING: - /* If the length or feature mask doesn't match, - * continue without enabling the feature. 
- */ - if (option_length != GVE_DEV_OPT_LEN_RAW_ADDRESSING || - option->feat_mask != cpu_to_be32(GVE_DEV_OPT_FEAT_MASK_RAW_ADDRESSING)) { - dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, "Raw Addressing", - GVE_DEV_OPT_LEN_RAW_ADDRESSING, - cpu_to_be32(GVE_DEV_OPT_FEAT_MASK_RAW_ADDRESSING), - option_length, option->feat_mask); - priv->raw_addressing = 0; - } else { - dev_info(&priv->pdev->dev, - "Raw addressing device option enabled.\n"); - priv->raw_addressing = 1; + case GVE_DEV_OPT_ID_GQI_RAW_ADDRESSING: + if (option_length != GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING) { + dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "Raw Addressing", + GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING, + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING, + option_length, req_feat_mask); + break; + } + + dev_info(&priv->pdev->dev, + "Gqi raw addressing device option enabled.\n"); + priv->queue_format = GVE_GQI_RDA_FORMAT; + break; + case GVE_DEV_OPT_ID_GQI_RDA: + if (option_length < sizeof(**dev_op_gqi_rda) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA) { + dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "GQI RDA", (int)sizeof(**dev_op_gqi_rda), + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_gqi_rda)) { + dev_warn(&priv->pdev->dev, + GVE_DEVICE_OPTION_TOO_BIG_FMT, "GQI RDA"); + } + *dev_op_gqi_rda = (void *)(option + 1); + break; + case GVE_DEV_OPT_ID_GQI_QPL: + if (option_length < sizeof(**dev_op_gqi_qpl) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL) { + dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "GQI QPL", (int)sizeof(**dev_op_gqi_qpl), + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_gqi_qpl)) { + dev_warn(&priv->pdev->dev, + GVE_DEVICE_OPTION_TOO_BIG_FMT, "GQI QPL"); + } + *dev_op_gqi_qpl = (void *)(option + 1); + break; + case GVE_DEV_OPT_ID_DQO_RDA: + if (option_length < sizeof(**dev_op_dqo_rda) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA) { + dev_warn(&priv->pdev->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "DQO RDA", (int)sizeof(**dev_op_dqo_rda), + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_dqo_rda)) { + dev_warn(&priv->pdev->dev, + GVE_DEVICE_OPTION_TOO_BIG_FMT, "DQO RDA"); } + *dev_op_dqo_rda = (void *)(option + 1); break; default: /* If we don't recognize the option just continue @@ -65,6 +120,39 @@ void gve_parse_device_option(struct gve_priv *priv, } } +/* Process all device options for a given describe device call. */ +static int +gve_process_device_options(struct gve_priv *priv, + struct gve_device_descriptor *descriptor, + struct gve_device_option_gqi_rda **dev_op_gqi_rda, + struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_dqo_rda **dev_op_dqo_rda) +{ + const int num_options = be16_to_cpu(descriptor->num_device_options); + struct gve_device_option *dev_opt; + int i; + + /* The options struct directly follows the device descriptor. 
*/ + dev_opt = (void *)(descriptor + 1); + for (i = 0; i < num_options; i++) { + struct gve_device_option *next_opt; + + next_opt = gve_get_next_option(descriptor, dev_opt); + if (!next_opt) { + dev_err(&priv->dev->dev, + "options exceed device_descriptor's total length.\n"); + return -EINVAL; + } + + gve_parse_device_option(priv, descriptor, dev_opt, + dev_op_gqi_rda, dev_op_gqi_qpl, + dev_op_dqo_rda); + dev_opt = next_opt; + } + + return 0; +} + int gve_adminq_alloc(struct device *dev, struct gve_priv *priv) { priv->adminq = dma_alloc_coherent(dev, PAGE_SIZE, @@ -88,6 +176,7 @@ int gve_adminq_alloc(struct device *dev, struct gve_priv *priv) priv->adminq_set_driver_parameter_cnt = 0; priv->adminq_report_stats_cnt = 0; priv->adminq_report_link_speed_cnt = 0; + priv->adminq_get_ptype_map_cnt = 0; /* Setup Admin queue with the device */ iowrite32be(priv->adminq_bus_addr / PAGE_SIZE, @@ -293,6 +382,9 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv, case GVE_ADMINQ_REPORT_LINK_SPEED: priv->adminq_report_link_speed_cnt++; break; + case GVE_ADMINQ_GET_PTYPE_MAP: + priv->adminq_get_ptype_map_cnt++; + break; default: dev_err(&priv->pdev->dev, "unknown AQ command opcode %d\n", opcode); } @@ -305,7 +397,8 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv, * The caller is also responsible for making sure there are no commands * waiting to be executed. */ -static int gve_adminq_execute_cmd(struct gve_priv *priv, union gve_adminq_command *cmd_orig) +static int gve_adminq_execute_cmd(struct gve_priv *priv, + union gve_adminq_command *cmd_orig) { u32 tail, head; int err; @@ -350,6 +443,7 @@ int gve_adminq_configure_device_resources(struct gve_priv *priv, .irq_db_stride = cpu_to_be32(sizeof(priv->ntfy_blocks[0])), .ntfy_blk_msix_base_idx = cpu_to_be32(GVE_NTFY_BLK_BASE_MSIX_IDX), + .queue_format = priv->queue_format, }; return gve_adminq_execute_cmd(priv, &cmd); @@ -369,27 +463,32 @@ static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index) { struct gve_tx_ring *tx = &priv->tx[queue_index]; union gve_adminq_command cmd; - u32 qpl_id; - int err; - qpl_id = priv->raw_addressing ? GVE_RAW_ADDRESSING_QPL_ID : tx->tx_fifo.qpl->id; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_TX_QUEUE); cmd.create_tx_queue = (struct gve_adminq_create_tx_queue) { .queue_id = cpu_to_be32(queue_index), - .reserved = 0, .queue_resources_addr = cpu_to_be64(tx->q_resources_bus), .tx_ring_addr = cpu_to_be64(tx->bus), - .queue_page_list_id = cpu_to_be32(qpl_id), .ntfy_id = cpu_to_be32(tx->ntfy_id), }; - err = gve_adminq_issue_cmd(priv, &cmd); - if (err) - return err; + if (gve_is_gqi(priv)) { + u32 qpl_id = priv->queue_format == GVE_GQI_RDA_FORMAT ? + GVE_RAW_ADDRESSING_QPL_ID : tx->tx_fifo.qpl->id; + + cmd.create_tx_queue.queue_page_list_id = cpu_to_be32(qpl_id); + } else { + cmd.create_tx_queue.tx_ring_size = + cpu_to_be16(priv->tx_desc_cnt); + cmd.create_tx_queue.tx_comp_ring_addr = + cpu_to_be64(tx->complq_bus_dqo); + cmd.create_tx_queue.tx_comp_ring_size = + cpu_to_be16(priv->options_dqo_rda.tx_comp_ring_entries); + } - return 0; + return gve_adminq_issue_cmd(priv, &cmd); } int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues) @@ -410,28 +509,41 @@ static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index) { struct gve_rx_ring *rx = &priv->rx[queue_index]; union gve_adminq_command cmd; - u32 qpl_id; - int err; - qpl_id = priv->raw_addressing ? 
GVE_RAW_ADDRESSING_QPL_ID : rx->data.qpl->id; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_RX_QUEUE); cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) { .queue_id = cpu_to_be32(queue_index), - .index = cpu_to_be32(queue_index), - .reserved = 0, .ntfy_id = cpu_to_be32(rx->ntfy_id), .queue_resources_addr = cpu_to_be64(rx->q_resources_bus), - .rx_desc_ring_addr = cpu_to_be64(rx->desc.bus), - .rx_data_ring_addr = cpu_to_be64(rx->data.data_bus), - .queue_page_list_id = cpu_to_be32(qpl_id), }; - err = gve_adminq_issue_cmd(priv, &cmd); - if (err) - return err; + if (gve_is_gqi(priv)) { + u32 qpl_id = priv->queue_format == GVE_GQI_RDA_FORMAT ? + GVE_RAW_ADDRESSING_QPL_ID : rx->data.qpl->id; + + cmd.create_rx_queue.rx_desc_ring_addr = + cpu_to_be64(rx->desc.bus), + cmd.create_rx_queue.rx_data_ring_addr = + cpu_to_be64(rx->data.data_bus), + cmd.create_rx_queue.index = cpu_to_be32(queue_index); + cmd.create_rx_queue.queue_page_list_id = cpu_to_be32(qpl_id); + } else { + cmd.create_rx_queue.rx_ring_size = + cpu_to_be16(priv->rx_desc_cnt); + cmd.create_rx_queue.rx_desc_ring_addr = + cpu_to_be64(rx->dqo.complq.bus); + cmd.create_rx_queue.rx_data_ring_addr = + cpu_to_be64(rx->dqo.bufq.bus); + cmd.create_rx_queue.packet_buffer_size = + cpu_to_be16(priv->data_buffer_size_dqo); + cmd.create_rx_queue.rx_buff_ring_size = + cpu_to_be16(priv->options_dqo_rda.rx_buff_ring_entries); + cmd.create_rx_queue.enable_rsc = + !!(priv->dev->features & NETIF_F_LRO); + } - return 0; + return gve_adminq_issue_cmd(priv, &cmd); } int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues) @@ -512,17 +624,51 @@ int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues) return gve_adminq_kick_and_wait(priv); } +static int gve_set_desc_cnt(struct gve_priv *priv, + struct gve_device_descriptor *descriptor) +{ + priv->tx_desc_cnt = be16_to_cpu(descriptor->tx_queue_entries); + if (priv->tx_desc_cnt * sizeof(priv->tx->desc[0]) < PAGE_SIZE) { + dev_err(&priv->pdev->dev, "Tx desc count %d too low\n", + priv->tx_desc_cnt); + return -EINVAL; + } + priv->rx_desc_cnt = be16_to_cpu(descriptor->rx_queue_entries); + if (priv->rx_desc_cnt * sizeof(priv->rx->desc.desc_ring[0]) + < PAGE_SIZE) { + dev_err(&priv->pdev->dev, "Rx desc count %d too low\n", + priv->rx_desc_cnt); + return -EINVAL; + } + return 0; +} + +static int +gve_set_desc_cnt_dqo(struct gve_priv *priv, + const struct gve_device_descriptor *descriptor, + const struct gve_device_option_dqo_rda *dev_op_dqo_rda) +{ + priv->tx_desc_cnt = be16_to_cpu(descriptor->tx_queue_entries); + priv->options_dqo_rda.tx_comp_ring_entries = + be16_to_cpu(dev_op_dqo_rda->tx_comp_ring_entries); + priv->rx_desc_cnt = be16_to_cpu(descriptor->rx_queue_entries); + priv->options_dqo_rda.rx_buff_ring_entries = + be16_to_cpu(dev_op_dqo_rda->rx_buff_ring_entries); + + return 0; +} + int gve_adminq_describe_device(struct gve_priv *priv) { + struct gve_device_option_gqi_rda *dev_op_gqi_rda = NULL; + struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL; + struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL; struct gve_device_descriptor *descriptor; - struct gve_device_option *dev_opt; union gve_adminq_command cmd; dma_addr_t descriptor_bus; - u16 num_options; int err = 0; u8 *mac; u16 mtu; - int i; memset(&cmd, 0, sizeof(cmd)); descriptor = dma_alloc_coherent(&priv->pdev->dev, PAGE_SIZE, @@ -540,21 +686,41 @@ int gve_adminq_describe_device(struct gve_priv *priv) if (err) goto free_device_descriptor; - priv->tx_desc_cnt = 
be16_to_cpu(descriptor->tx_queue_entries); - if (priv->tx_desc_cnt * sizeof(priv->tx->desc[0]) < PAGE_SIZE) { - dev_err(&priv->pdev->dev, "Tx desc count %d too low\n", priv->tx_desc_cnt); - err = -EINVAL; + err = gve_process_device_options(priv, descriptor, &dev_op_gqi_rda, + &dev_op_gqi_qpl, &dev_op_dqo_rda); + if (err) goto free_device_descriptor; + + /* If the GQI_RAW_ADDRESSING option is not enabled and the queue format + * is not set to GqiRda, choose the queue format in a priority order: + * DqoRda, GqiRda, GqiQpl. Use GqiQpl as default. + */ + if (priv->queue_format == GVE_GQI_RDA_FORMAT) { + dev_info(&priv->pdev->dev, + "Driver is running with GQI RDA queue format.\n"); + } else if (dev_op_dqo_rda) { + priv->queue_format = GVE_DQO_RDA_FORMAT; + dev_info(&priv->pdev->dev, + "Driver is running with DQO RDA queue format.\n"); + } else if (dev_op_gqi_rda) { + priv->queue_format = GVE_GQI_RDA_FORMAT; + dev_info(&priv->pdev->dev, + "Driver is running with GQI RDA queue format.\n"); + } else { + priv->queue_format = GVE_GQI_QPL_FORMAT; + dev_info(&priv->pdev->dev, + "Driver is running with GQI QPL queue format.\n"); } - priv->rx_desc_cnt = be16_to_cpu(descriptor->rx_queue_entries); - if (priv->rx_desc_cnt * sizeof(priv->rx->desc.desc_ring[0]) - < PAGE_SIZE || - priv->rx_desc_cnt * sizeof(priv->rx->data.data_ring[0]) - < PAGE_SIZE) { - dev_err(&priv->pdev->dev, "Rx desc count %d too low\n", priv->rx_desc_cnt); - err = -EINVAL; - goto free_device_descriptor; + if (gve_is_gqi(priv)) { + err = gve_set_desc_cnt(priv, descriptor); + } else { + /* DQO supports LRO. */ + priv->dev->hw_features |= NETIF_F_LRO; + err = gve_set_desc_cnt_dqo(priv, descriptor, dev_op_dqo_rda); } + if (err) + goto free_device_descriptor; + priv->max_registered_pages = be64_to_cpu(descriptor->max_registered_pages); mtu = be16_to_cpu(descriptor->mtu); @@ -570,32 +736,16 @@ int gve_adminq_describe_device(struct gve_priv *priv) dev_info(&priv->pdev->dev, "MAC addr: %pM\n", mac); priv->tx_pages_per_qpl = be16_to_cpu(descriptor->tx_pages_per_qpl); priv->rx_data_slot_cnt = be16_to_cpu(descriptor->rx_pages_per_qpl); - if (priv->rx_data_slot_cnt < priv->rx_desc_cnt) { + + if (gve_is_gqi(priv) && priv->rx_data_slot_cnt < priv->rx_desc_cnt) { dev_err(&priv->pdev->dev, "rx_data_slot_cnt cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n", priv->rx_data_slot_cnt); priv->rx_desc_cnt = priv->rx_data_slot_cnt; } priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues); - dev_opt = (void *)(descriptor + 1); - - num_options = be16_to_cpu(descriptor->num_device_options); - for (i = 0; i < num_options; i++) { - struct gve_device_option *next_opt; - - next_opt = gve_get_next_option(descriptor, dev_opt); - if (!next_opt) { - dev_err(&priv->dev->dev, - "options exceed device_descriptor's total length.\n"); - err = -EINVAL; - goto free_device_descriptor; - } - - gve_parse_device_option(priv, descriptor, dev_opt); - dev_opt = next_opt; - } free_device_descriptor: - dma_free_coherent(&priv->pdev->dev, sizeof(*descriptor), descriptor, + dma_free_coherent(&priv->pdev->dev, PAGE_SIZE, descriptor, descriptor_bus); return err; } @@ -701,3 +851,41 @@ int gve_adminq_report_link_speed(struct gve_priv *priv) link_speed_region_bus); return err; } + +int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv, + struct gve_ptype_lut *ptype_lut) +{ + struct gve_ptype_map *ptype_map; + union gve_adminq_command cmd; + dma_addr_t ptype_map_bus; + int err = 0; + int i; + + memset(&cmd, 0, sizeof(cmd)); + ptype_map = 
dma_alloc_coherent(&priv->pdev->dev, sizeof(*ptype_map), + &ptype_map_bus, GFP_KERNEL); + if (!ptype_map) + return -ENOMEM; + + cmd.opcode = cpu_to_be32(GVE_ADMINQ_GET_PTYPE_MAP); + cmd.get_ptype_map = (struct gve_adminq_get_ptype_map) { + .ptype_map_len = cpu_to_be64(sizeof(*ptype_map)), + .ptype_map_addr = cpu_to_be64(ptype_map_bus), + }; + + err = gve_adminq_execute_cmd(priv, &cmd); + if (err) + goto err; + + /* Populate ptype_lut. */ + for (i = 0; i < GVE_NUM_PTYPES; i++) { + ptype_lut->ptypes[i].l3_type = + ptype_map->ptypes[i].l3_type; + ptype_lut->ptypes[i].l4_type = + ptype_map->ptypes[i].l4_type; + } +err: + dma_free_coherent(&priv->pdev->dev, sizeof(*ptype_map), ptype_map, + ptype_map_bus); + return err; +} diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h index d320c2ffd87c..47c3d8f313fc 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.h +++ b/drivers/net/ethernet/google/gve/gve_adminq.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (GPL-2.0 OR MIT) * Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #ifndef _GVE_ADMINQ_H @@ -22,7 +22,8 @@ enum gve_adminq_opcodes { GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES = 0x9, GVE_ADMINQ_SET_DRIVER_PARAMETER = 0xB, GVE_ADMINQ_REPORT_STATS = 0xC, - GVE_ADMINQ_REPORT_LINK_SPEED = 0xD + GVE_ADMINQ_REPORT_LINK_SPEED = 0xD, + GVE_ADMINQ_GET_PTYPE_MAP = 0xE, }; /* Admin queue status codes */ @@ -82,14 +83,54 @@ static_assert(sizeof(struct gve_device_descriptor) == 40); struct gve_device_option { __be16 option_id; __be16 option_length; - __be32 feat_mask; + __be32 required_features_mask; }; static_assert(sizeof(struct gve_device_option) == 8); -#define GVE_DEV_OPT_ID_RAW_ADDRESSING 0x1 -#define GVE_DEV_OPT_LEN_RAW_ADDRESSING 0x0 -#define GVE_DEV_OPT_FEAT_MASK_RAW_ADDRESSING 0x0 +struct gve_device_option_gqi_rda { + __be32 supported_features_mask; +}; + +static_assert(sizeof(struct gve_device_option_gqi_rda) == 4); + +struct gve_device_option_gqi_qpl { + __be32 supported_features_mask; +}; + +static_assert(sizeof(struct gve_device_option_gqi_qpl) == 4); + +struct gve_device_option_dqo_rda { + __be32 supported_features_mask; + __be16 tx_comp_ring_entries; + __be16 rx_buff_ring_entries; +}; + +static_assert(sizeof(struct gve_device_option_dqo_rda) == 8); + +/* Terminology: + * + * RDA - Raw DMA Addressing - Buffers associated with SKBs are directly DMA + * mapped and read/updated by the device. + * + * QPL - Queue Page Lists - Driver uses bounce buffers which are DMA mapped with + * the device for read/write and data is copied from/to SKBs. 
+ */ +enum gve_dev_opt_id { + GVE_DEV_OPT_ID_GQI_RAW_ADDRESSING = 0x1, + GVE_DEV_OPT_ID_GQI_RDA = 0x2, + GVE_DEV_OPT_ID_GQI_QPL = 0x3, + GVE_DEV_OPT_ID_DQO_RDA = 0x4, +}; + +enum gve_dev_opt_req_feat_mask { + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0, +}; + +#define GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING 0x0 struct gve_adminq_configure_device_resources { __be64 counter_array; @@ -98,9 +139,11 @@ struct gve_adminq_configure_device_resources { __be32 num_irq_dbs; __be32 irq_db_stride; __be32 ntfy_blk_msix_base_idx; + u8 queue_format; + u8 padding[7]; }; -static_assert(sizeof(struct gve_adminq_configure_device_resources) == 32); +static_assert(sizeof(struct gve_adminq_configure_device_resources) == 40); struct gve_adminq_register_page_list { __be32 page_list_id; @@ -125,9 +168,13 @@ struct gve_adminq_create_tx_queue { __be64 tx_ring_addr; __be32 queue_page_list_id; __be32 ntfy_id; + __be64 tx_comp_ring_addr; + __be16 tx_ring_size; + __be16 tx_comp_ring_size; + u8 padding[4]; }; -static_assert(sizeof(struct gve_adminq_create_tx_queue) == 32); +static_assert(sizeof(struct gve_adminq_create_tx_queue) == 48); struct gve_adminq_create_rx_queue { __be32 queue_id; @@ -138,10 +185,14 @@ struct gve_adminq_create_rx_queue { __be64 rx_desc_ring_addr; __be64 rx_data_ring_addr; __be32 queue_page_list_id; - u8 padding[4]; + __be16 rx_ring_size; + __be16 packet_buffer_size; + __be16 rx_buff_ring_size; + u8 enable_rsc; + u8 padding[5]; }; -static_assert(sizeof(struct gve_adminq_create_rx_queue) == 48); +static_assert(sizeof(struct gve_adminq_create_rx_queue) == 56); /* Queue resources that are shared with the device */ struct gve_queue_resources { @@ -226,6 +277,41 @@ enum gve_stat_names { RX_DROPS_INVALID_CHECKSUM = 68, }; +enum gve_l3_type { + /* Must be zero so zero initialized LUT is unknown. */ + GVE_L3_TYPE_UNKNOWN = 0, + GVE_L3_TYPE_OTHER, + GVE_L3_TYPE_IPV4, + GVE_L3_TYPE_IPV6, +}; + +enum gve_l4_type { + /* Must be zero so zero initialized LUT is unknown. */ + GVE_L4_TYPE_UNKNOWN = 0, + GVE_L4_TYPE_OTHER, + GVE_L4_TYPE_TCP, + GVE_L4_TYPE_UDP, + GVE_L4_TYPE_ICMP, + GVE_L4_TYPE_SCTP, +}; + +/* These are control path types for PTYPE which are the same as the data path + * types. + */ +struct gve_ptype_entry { + u8 l3_type; + u8 l4_type; +}; + +struct gve_ptype_map { + struct gve_ptype_entry ptypes[1 << 10]; /* PTYPES are always 10 bits. 
*/ +}; + +struct gve_adminq_get_ptype_map { + __be64 ptype_map_len; + __be64 ptype_map_addr; +}; + union gve_adminq_command { struct { __be32 opcode; @@ -243,6 +329,7 @@ union gve_adminq_command { struct gve_adminq_set_driver_parameter set_driver_param; struct gve_adminq_report_stats report_stats; struct gve_adminq_report_link_speed report_link_speed; + struct gve_adminq_get_ptype_map get_ptype_map; }; }; u8 reserved[64]; @@ -271,4 +358,9 @@ int gve_adminq_set_mtu(struct gve_priv *priv, u64 mtu); int gve_adminq_report_stats(struct gve_priv *priv, u64 stats_report_len, dma_addr_t stats_report_addr, u64 interval); int gve_adminq_report_link_speed(struct gve_priv *priv); + +struct gve_ptype_lut; +int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv, + struct gve_ptype_lut *ptype_lut); + #endif /* _GVE_ADMINQ_H */ diff --git a/drivers/net/ethernet/google/gve/gve_desc_dqo.h b/drivers/net/ethernet/google/gve/gve_desc_dqo.h new file mode 100644 index 000000000000..e8fe9adef7f2 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_desc_dqo.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +/* GVE DQO Descriptor formats */ + +#ifndef _GVE_DESC_DQO_H_ +#define _GVE_DESC_DQO_H_ + +#include <linux/build_bug.h> + +#define GVE_TX_MAX_HDR_SIZE_DQO 255 +#define GVE_TX_MIN_TSO_MSS_DQO 88 + +#ifndef __LITTLE_ENDIAN_BITFIELD +#error "Only little endian supported" +#endif + +/* Basic TX descriptor (DTYPE 0x0C) */ +struct gve_tx_pkt_desc_dqo { + __le64 buf_addr; + + /* Must be GVE_TX_PKT_DESC_DTYPE_DQO (0xc) */ + u8 dtype: 5; + + /* Denotes the last descriptor of a packet. */ + u8 end_of_packet: 1; + u8 checksum_offload_enable: 1; + + /* If set, will generate a descriptor completion for this descriptor. */ + u8 report_event: 1; + u8 reserved0; + __le16 reserved1; + + /* The TX completion associated with this packet will contain this tag. + */ + __le16 compl_tag; + u16 buf_size: 14; + u16 reserved2: 2; +} __packed; +static_assert(sizeof(struct gve_tx_pkt_desc_dqo) == 16); + +#define GVE_TX_PKT_DESC_DTYPE_DQO 0xc +#define GVE_TX_MAX_BUF_SIZE_DQO ((16 * 1024) - 1) + +/* Maximum number of data descriptors allowed per packet, or per-TSO segment. */ +#define GVE_TX_MAX_DATA_DESCS 10 + +/* Min gap between tail and head to avoid cacheline overlap */ +#define GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP 4 + +/* "report_event" on TX packet descriptors may only be reported on the last + * descriptor of a TX packet, and they must be spaced apart with at least this + * value. + */ +#define GVE_TX_MIN_RE_INTERVAL 32 + +struct gve_tx_context_cmd_dtype { + u8 dtype: 5; + u8 tso: 1; + u8 reserved1: 2; + + u8 reserved2; +}; + +static_assert(sizeof(struct gve_tx_context_cmd_dtype) == 2); + +/* TX Native TSO Context DTYPE (0x05) + * + * "flex" fields allow the driver to send additional packet context to HW. + */ +struct gve_tx_tso_context_desc_dqo { + /* The L4 payload bytes that should be segmented. */ + u32 tso_total_len: 24; + u32 flex10: 8; + + /* Max segment size in TSO excluding headers. */ + u16 mss: 14; + u16 reserved: 2; + + u8 header_len; /* Header length to use for TSO offload */ + u8 flex11; + struct gve_tx_context_cmd_dtype cmd_dtype; + u8 flex0; + u8 flex5; + u8 flex6; + u8 flex7; + u8 flex8; + u8 flex9; +} __packed; +static_assert(sizeof(struct gve_tx_tso_context_desc_dqo) == 16); + +#define GVE_TX_TSO_CTX_DESC_DTYPE_DQO 0x5 + +/* General context descriptor for sending metadata. 
*/ +struct gve_tx_general_context_desc_dqo { + u8 flex4; + u8 flex5; + u8 flex6; + u8 flex7; + u8 flex8; + u8 flex9; + u8 flex10; + u8 flex11; + struct gve_tx_context_cmd_dtype cmd_dtype; + u16 reserved; + u8 flex0; + u8 flex1; + u8 flex2; + u8 flex3; +} __packed; +static_assert(sizeof(struct gve_tx_general_context_desc_dqo) == 16); + +#define GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO 0x4 + +/* Logical structure of metadata which is packed into context descriptor flex + * fields. + */ +struct gve_tx_metadata_dqo { + union { + struct { + u8 version; + + /* If `skb->l4_hash` is set, this value should be + * derived from `skb->hash`. + * + * A zero value means no l4_hash was associated with the + * skb. + */ + u16 path_hash: 15; + + /* Should be set to 1 if the flow associated with the + * skb had a rehash from the TCP stack. + */ + u16 rehash_event: 1; + } __packed; + u8 bytes[12]; + }; +} __packed; +static_assert(sizeof(struct gve_tx_metadata_dqo) == 12); + +#define GVE_TX_METADATA_VERSION_DQO 0 + +/* TX completion descriptor */ +struct gve_tx_compl_desc { + /* For types 0-4 this is the TX queue ID associated with this + * completion. + */ + u16 id: 11; + + /* See: GVE_COMPL_TYPE_DQO* */ + u16 type: 3; + u16 reserved0: 1; + + /* Flipped by HW to notify the descriptor is populated. */ + u16 generation: 1; + union { + /* For descriptor completions, this is the last index fetched + * by HW + 1. + */ + __le16 tx_head; + + /* For packet completions, this is the completion tag set on the + * TX packet descriptors. + */ + __le16 completion_tag; + }; + __le32 reserved1; +} __packed; +static_assert(sizeof(struct gve_tx_compl_desc) == 8); + +#define GVE_COMPL_TYPE_DQO_PKT 0x2 /* Packet completion */ +#define GVE_COMPL_TYPE_DQO_DESC 0x4 /* Descriptor completion */ +#define GVE_COMPL_TYPE_DQO_MISS 0x1 /* Miss path completion */ +#define GVE_COMPL_TYPE_DQO_REINJECTION 0x3 /* Re-injection completion */ + +/* Descriptor to post buffers to HW on buffer queue. */ +struct gve_rx_desc_dqo { + __le16 buf_id; /* ID returned in Rx completion descriptor */ + __le16 reserved0; + __le32 reserved1; + __le64 buf_addr; /* DMA address of the buffer */ + __le64 header_buf_addr; + __le64 reserved2; +} __packed; +static_assert(sizeof(struct gve_rx_desc_dqo) == 32); + +/* Descriptor for HW to notify SW of new packets received on RX queue. */ +struct gve_rx_compl_desc_dqo { + /* Must be 1 */ + u8 rxdid: 4; + u8 reserved0: 4; + + /* Packet originated from this system rather than the network. */ + u8 loopback: 1; + /* Set when IPv6 packet contains a destination options header or routing + * header. + */ + u8 ipv6_ex_add: 1; + /* Invalid packet was received. */ + u8 rx_error: 1; + u8 reserved1: 5; + + u16 packet_type: 10; + u16 ip_hdr_err: 1; + u16 udp_len_err: 1; + u16 raw_cs_invalid: 1; + u16 reserved2: 3; + + u16 packet_len: 14; + /* Flipped by HW to notify the descriptor is populated. */ + u16 generation: 1; + /* Should be zero. */ + u16 buffer_queue_id: 1; + + u16 header_len: 10; + u16 rsc: 1; + u16 split_header: 1; + u16 reserved3: 4; + + u8 descriptor_done: 1; + u8 end_of_packet: 1; + u8 header_buffer_overflow: 1; + u8 l3_l4_processed: 1; + u8 csum_ip_err: 1; + u8 csum_l4_err: 1; + u8 csum_external_ip_err: 1; + u8 csum_external_udp_err: 1; + + u8 status_error1; + + __le16 reserved5; + __le16 buf_id; /* Buffer ID which was sent on the buffer queue. */ + + union { + /* Packet checksum. */ + __le16 raw_cs; + /* Segment length for RSC packets. 
*/ + __le16 rsc_seg_len; + }; + __le32 hash; + __le32 reserved6; + __le64 reserved7; +} __packed; + +static_assert(sizeof(struct gve_rx_compl_desc_dqo) == 32); + +/* Ringing the doorbell too often can hurt performance. + * + * HW requires this value to be at least 8. + */ +#define GVE_RX_BUF_THRESH_DQO 32 + +#endif /* _GVE_DESC_DQO_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_dqo.h b/drivers/net/ethernet/google/gve/gve_dqo.h new file mode 100644 index 000000000000..836042364124 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_dqo.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#ifndef _GVE_DQO_H_ +#define _GVE_DQO_H_ + +#include "gve_adminq.h" + +#define GVE_ITR_ENABLE_BIT_DQO BIT(0) +#define GVE_ITR_CLEAR_PBA_BIT_DQO BIT(1) +#define GVE_ITR_NO_UPDATE_DQO (3 << 3) + +#define GVE_ITR_INTERVAL_DQO_SHIFT 5 +#define GVE_ITR_INTERVAL_DQO_MASK ((1 << 12) - 1) + +#define GVE_TX_IRQ_RATELIMIT_US_DQO 50 +#define GVE_RX_IRQ_RATELIMIT_US_DQO 20 + +/* Timeout in seconds to wait for a reinjection completion after receiving + * its corresponding miss completion. + */ +#define GVE_REINJECT_COMPL_TIMEOUT 1 + +/* Timeout in seconds to deallocate the completion tag for a packet that was + * prematurely freed for not receiving a valid completion. This should be large + * enough to rule out the possibility of receiving the corresponding valid + * completion after this interval. + */ +#define GVE_DEALLOCATE_COMPL_TIMEOUT 60 + +netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev); +bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean); +int gve_rx_poll_dqo(struct gve_notify_block *block, int budget); +int gve_tx_alloc_rings_dqo(struct gve_priv *priv); +void gve_tx_free_rings_dqo(struct gve_priv *priv); +int gve_rx_alloc_rings_dqo(struct gve_priv *priv); +void gve_rx_free_rings_dqo(struct gve_priv *priv); +int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, + struct napi_struct *napi); +void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx); +void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx); + +static inline void +gve_tx_put_doorbell_dqo(const struct gve_priv *priv, + const struct gve_queue_resources *q_resources, u32 val) +{ + u64 index; + + index = be32_to_cpu(q_resources->db_index); + iowrite32(val, &priv->db_bar2[index]); +} + +/* Builds register value to write to DQO IRQ doorbell to enable with specified + * ratelimit. + */ +static inline u32 gve_set_itr_ratelimit_dqo(u32 ratelimit_us) +{ + u32 result = GVE_ITR_ENABLE_BIT_DQO; + + /* Interval has 2us granularity. */ + ratelimit_us >>= 1; + + ratelimit_us &= GVE_ITR_INTERVAL_DQO_MASK; + result |= (ratelimit_us << GVE_ITR_INTERVAL_DQO_SHIFT); + + return result; +} + +static inline void +gve_write_irq_doorbell_dqo(const struct gve_priv *priv, + const struct gve_notify_block *block, u32 val) +{ + u32 index = be32_to_cpu(block->irq_db_index); + + iowrite32(val, &priv->db_bar2[index]); +} + +#endif /* _GVE_DQO_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index 5fb05cf36b49..716e6240305d 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. 
*/ #include <linux/ethtool.h> @@ -311,8 +311,16 @@ gve_get_ethtool_stats(struct net_device *netdev, for (ring = 0; ring < priv->tx_cfg.num_queues; ring++) { struct gve_tx_ring *tx = &priv->tx[ring]; - data[i++] = tx->req; - data[i++] = tx->done; + if (gve_is_gqi(priv)) { + data[i++] = tx->req; + data[i++] = tx->done; + } else { + /* DQO doesn't currently support + * posted/completed descriptor counts; + */ + data[i++] = 0; + data[i++] = 0; + } do { start = u64_stats_fetch_begin(&priv->tx[ring].statss); @@ -453,11 +461,16 @@ static int gve_set_tunable(struct net_device *netdev, switch (etuna->id) { case ETHTOOL_RX_COPYBREAK: + { + u32 max_copybreak = gve_is_gqi(priv) ? + (PAGE_SIZE / 2) : priv->data_buffer_size_dqo; + len = *(u32 *)value; - if (len > PAGE_SIZE / 2) + if (len > max_copybreak) return -EINVAL; priv->rx_copybreak = len; return 0; + } default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 7302498c6df3..099a2bc5ae67 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #include <linux/cpumask.h> @@ -14,6 +14,7 @@ #include <linux/workqueue.h> #include <net/sch_generic.h> #include "gve.h" +#include "gve_dqo.h" #include "gve_adminq.h" #include "gve_register.h" @@ -26,6 +27,16 @@ const char gve_version_str[] = GVE_VERSION; static const char gve_version_prefix[] = GVE_VERSION_PREFIX; +static netdev_tx_t gve_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct gve_priv *priv = netdev_priv(dev); + + if (gve_is_gqi(priv)) + return gve_tx(skb, dev); + else + return gve_tx_dqo(skb, dev); +} + static void gve_get_stats(struct net_device *dev, struct rtnl_link_stats64 *s) { struct gve_priv *priv = netdev_priv(dev); @@ -155,6 +166,15 @@ static irqreturn_t gve_intr(int irq, void *arg) return IRQ_HANDLED; } +static irqreturn_t gve_intr_dqo(int irq, void *arg) +{ + struct gve_notify_block *block = arg; + + /* Interrupts are automatically masked */ + napi_schedule_irqoff(&block->napi); + return IRQ_HANDLED; +} + static int gve_napi_poll(struct napi_struct *napi, int budget) { struct gve_notify_block *block; @@ -180,7 +200,7 @@ static int gve_napi_poll(struct napi_struct *napi, int budget) /* Double check we have no extra work. * Ensure unmask synchronizes with checking for work. */ - dma_rmb(); + mb(); if (block->tx) reschedule |= gve_tx_poll(block, -1); if (block->rx) @@ -191,6 +211,54 @@ static int gve_napi_poll(struct napi_struct *napi, int budget) return 0; } +static int gve_napi_poll_dqo(struct napi_struct *napi, int budget) +{ + struct gve_notify_block *block = + container_of(napi, struct gve_notify_block, napi); + struct gve_priv *priv = block->priv; + bool reschedule = false; + int work_done = 0; + + /* Clear PCI MSI-X Pending Bit Array (PBA) + * + * This bit is set if an interrupt event occurs while the vector is + * masked. If this bit is set and we reenable the interrupt, it will + * fire again. Since we're just about to poll the queue state, we don't + * need it to fire again. + * + * Under high softirq load, it's possible that the interrupt condition + * is triggered twice before we got the chance to process it. 
+ */ + gve_write_irq_doorbell_dqo(priv, block, + GVE_ITR_NO_UPDATE_DQO | GVE_ITR_CLEAR_PBA_BIT_DQO); + + if (block->tx) + reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true); + + if (block->rx) { + work_done = gve_rx_poll_dqo(block, budget); + reschedule |= work_done == budget; + } + + if (reschedule) + return budget; + + if (likely(napi_complete_done(napi, work_done))) { + /* Enable interrupts again. + * + * We don't need to repoll afterwards because HW supports the + * PCI MSI-X PBA feature. + * + * Another interrupt would be triggered if a new event came in + * since the last one. + */ + gve_write_irq_doorbell_dqo(priv, block, + GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); + } + + return work_done; +} + static int gve_alloc_notify_blocks(struct gve_priv *priv) { int num_vecs_requested = priv->num_ntfy_blks + 1; @@ -220,6 +288,7 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv) int vecs_left = new_num_ntfy_blks % 2; priv->num_ntfy_blks = new_num_ntfy_blks; + priv->mgmt_msix_idx = priv->num_ntfy_blks; priv->tx_cfg.max_queues = min_t(int, priv->tx_cfg.max_queues, vecs_per_type); priv->rx_cfg.max_queues = min_t(int, priv->rx_cfg.max_queues, @@ -263,7 +332,8 @@ static int gve_alloc_notify_blocks(struct gve_priv *priv) name, i); block->priv = priv; err = request_irq(priv->msix_vectors[msix_idx].vector, - gve_intr, 0, block->name, block); + gve_is_gqi(priv) ? gve_intr : gve_intr_dqo, + 0, block->name, block); if (err) { dev_err(&priv->pdev->dev, "Failed to receive msix vector %d\n", i); @@ -300,20 +370,22 @@ static void gve_free_notify_blocks(struct gve_priv *priv) { int i; - /* Free the irqs */ - for (i = 0; i < priv->num_ntfy_blks; i++) { - struct gve_notify_block *block = &priv->ntfy_blocks[i]; - int msix_idx = i; + if (priv->msix_vectors) { + /* Free the irqs */ + for (i = 0; i < priv->num_ntfy_blks; i++) { + struct gve_notify_block *block = &priv->ntfy_blocks[i]; + int msix_idx = i; - irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector, - NULL); - free_irq(priv->msix_vectors[msix_idx].vector, block); + irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector, + NULL); + free_irq(priv->msix_vectors[msix_idx].vector, block); + } + free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv); } dma_free_coherent(&priv->pdev->dev, priv->num_ntfy_blks * sizeof(*priv->ntfy_blocks), priv->ntfy_blocks, priv->ntfy_block_bus); priv->ntfy_blocks = NULL; - free_irq(priv->msix_vectors[priv->mgmt_msix_idx].vector, priv); pci_disable_msix(priv->pdev); kvfree(priv->msix_vectors); priv->msix_vectors = NULL; @@ -343,6 +415,22 @@ static int gve_setup_device_resources(struct gve_priv *priv) err = -ENXIO; goto abort_with_stats_report; } + + if (priv->queue_format == GVE_DQO_RDA_FORMAT) { + priv->ptype_lut_dqo = kvzalloc(sizeof(*priv->ptype_lut_dqo), + GFP_KERNEL); + if (!priv->ptype_lut_dqo) { + err = -ENOMEM; + goto abort_with_stats_report; + } + err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); + if (err) { + dev_err(&priv->pdev->dev, + "Failed to get ptype map: err=%d\n", err); + goto abort_with_ptype_lut; + } + } + err = gve_adminq_report_stats(priv, priv->stats_report_len, priv->stats_report_bus, GVE_STATS_REPORT_TIMER_PERIOD); @@ -351,12 +439,17 @@ static int gve_setup_device_resources(struct gve_priv *priv) "Failed to report stats: err=%d\n", err); gve_set_device_resources_ok(priv); return 0; + +abort_with_ptype_lut: + kvfree(priv->ptype_lut_dqo); + priv->ptype_lut_dqo = NULL; abort_with_stats_report: gve_free_stats_report(priv); abort_with_ntfy_blocks: 
gve_free_notify_blocks(priv); abort_with_counter: gve_free_counter_array(priv); + return err; } @@ -383,17 +476,22 @@ static void gve_teardown_device_resources(struct gve_priv *priv) gve_trigger_reset(priv); } } + + kvfree(priv->ptype_lut_dqo); + priv->ptype_lut_dqo = NULL; + gve_free_counter_array(priv); gve_free_notify_blocks(priv); gve_free_stats_report(priv); gve_clear_device_resources_ok(priv); } -static void gve_add_napi(struct gve_priv *priv, int ntfy_idx) +static void gve_add_napi(struct gve_priv *priv, int ntfy_idx, + int (*gve_poll)(struct napi_struct *, int)) { struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; - netif_napi_add(priv->dev, &block->napi, gve_napi_poll, + netif_napi_add(priv->dev, &block->napi, gve_poll, NAPI_POLL_WEIGHT); } @@ -473,31 +571,75 @@ static int gve_create_rings(struct gve_priv *priv) netif_dbg(priv, drv, priv->dev, "created %d rx queues\n", priv->rx_cfg.num_queues); - /* Rx data ring has been prefilled with packet buffers at queue - * allocation time. - * Write the doorbell to provide descriptor slots and packet buffers - * to the NIC. - */ - for (i = 0; i < priv->rx_cfg.num_queues; i++) - gve_rx_write_doorbell(priv, &priv->rx[i]); + if (gve_is_gqi(priv)) { + /* Rx data ring has been prefilled with packet buffers at queue + * allocation time. + * + * Write the doorbell to provide descriptor slots and packet + * buffers to the NIC. + */ + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_rx_write_doorbell(priv, &priv->rx[i]); + } else { + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + /* Post buffers and ring doorbell. */ + gve_rx_post_buffers_dqo(&priv->rx[i]); + } + } return 0; } +static void add_napi_init_sync_stats(struct gve_priv *priv, + int (*napi_poll)(struct napi_struct *napi, + int budget)) +{ + int i; + + /* Add tx napi & init sync stats*/ + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + int ntfy_idx = gve_tx_idx_to_ntfy(priv, i); + + u64_stats_init(&priv->tx[i].statss); + priv->tx[i].ntfy_id = ntfy_idx; + gve_add_napi(priv, ntfy_idx, napi_poll); + } + /* Add rx napi & init sync stats*/ + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + int ntfy_idx = gve_rx_idx_to_ntfy(priv, i); + + u64_stats_init(&priv->rx[i].statss); + priv->rx[i].ntfy_id = ntfy_idx; + gve_add_napi(priv, ntfy_idx, napi_poll); + } +} + +static void gve_tx_free_rings(struct gve_priv *priv) +{ + if (gve_is_gqi(priv)) { + gve_tx_free_rings_gqi(priv); + } else { + gve_tx_free_rings_dqo(priv); + } +} + static int gve_alloc_rings(struct gve_priv *priv) { - int ntfy_idx; int err; - int i; /* Setup tx rings */ priv->tx = kvzalloc(priv->tx_cfg.num_queues * sizeof(*priv->tx), GFP_KERNEL); if (!priv->tx) return -ENOMEM; - err = gve_tx_alloc_rings(priv); + + if (gve_is_gqi(priv)) + err = gve_tx_alloc_rings(priv); + else + err = gve_tx_alloc_rings_dqo(priv); if (err) goto free_tx; + /* Setup rx rings */ priv->rx = kvzalloc(priv->rx_cfg.num_queues * sizeof(*priv->rx), GFP_KERNEL); @@ -505,21 +647,18 @@ static int gve_alloc_rings(struct gve_priv *priv) err = -ENOMEM; goto free_tx_queue; } - err = gve_rx_alloc_rings(priv); + + if (gve_is_gqi(priv)) + err = gve_rx_alloc_rings(priv); + else + err = gve_rx_alloc_rings_dqo(priv); if (err) goto free_rx; - /* Add tx napi & init sync stats*/ - for (i = 0; i < priv->tx_cfg.num_queues; i++) { - u64_stats_init(&priv->tx[i].statss); - ntfy_idx = gve_tx_idx_to_ntfy(priv, i); - gve_add_napi(priv, ntfy_idx); - } - /* Add rx napi & init sync stats*/ - for (i = 0; i < priv->rx_cfg.num_queues; i++) { - 
u64_stats_init(&priv->rx[i].statss); - ntfy_idx = gve_rx_idx_to_ntfy(priv, i); - gve_add_napi(priv, ntfy_idx); - } + + if (gve_is_gqi(priv)) + add_napi_init_sync_stats(priv, gve_napi_poll); + else + add_napi_init_sync_stats(priv, gve_napi_poll_dqo); return 0; @@ -557,6 +696,14 @@ static int gve_destroy_rings(struct gve_priv *priv) return 0; } +static void gve_rx_free_rings(struct gve_priv *priv) +{ + if (gve_is_gqi(priv)) + gve_rx_free_rings_gqi(priv); + else + gve_rx_free_rings_dqo(priv); +} + static void gve_free_rings(struct gve_priv *priv) { int ntfy_idx; @@ -678,7 +825,7 @@ static int gve_alloc_qpls(struct gve_priv *priv) int err; /* Raw addressing means no QPLs */ - if (priv->raw_addressing) + if (priv->queue_format == GVE_GQI_RDA_FORMAT) return 0; priv->qpls = kvzalloc(num_qpls * sizeof(*priv->qpls), GFP_KERNEL); @@ -722,7 +869,7 @@ static void gve_free_qpls(struct gve_priv *priv) int i; /* Raw addressing means no QPLs */ - if (priv->raw_addressing) + if (priv->queue_format == GVE_GQI_RDA_FORMAT) return; kvfree(priv->qpl_cfg.qpl_id_map); @@ -756,6 +903,7 @@ static int gve_open(struct net_device *dev) err = gve_alloc_qpls(priv); if (err) return err; + err = gve_alloc_rings(priv); if (err) goto free_qpls; @@ -770,9 +918,17 @@ static int gve_open(struct net_device *dev) err = gve_register_qpls(priv); if (err) goto reset; + + if (!gve_is_gqi(priv)) { + /* Hard code this for now. This may be tuned in the future for + * performance. + */ + priv->data_buffer_size_dqo = GVE_RX_BUFFER_SIZE_DQO; + } err = gve_create_rings(priv); if (err) goto reset; + gve_set_device_rings_ok(priv); if (gve_get_report_stats(priv)) @@ -921,14 +1077,26 @@ static void gve_turnup(struct gve_priv *priv) struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; napi_enable(&block->napi); - iowrite32be(0, gve_irq_doorbell(priv, block)); + if (gve_is_gqi(priv)) { + iowrite32be(0, gve_irq_doorbell(priv, block)); + } else { + u32 val = gve_set_itr_ratelimit_dqo(GVE_TX_IRQ_RATELIMIT_US_DQO); + + gve_write_irq_doorbell_dqo(priv, block, val); + } } for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx); struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; napi_enable(&block->napi); - iowrite32be(0, gve_irq_doorbell(priv, block)); + if (gve_is_gqi(priv)) { + iowrite32be(0, gve_irq_doorbell(priv, block)); + } else { + u32 val = gve_set_itr_ratelimit_dqo(GVE_RX_IRQ_RATELIMIT_US_DQO); + + gve_write_irq_doorbell_dqo(priv, block, val); + } } gve_set_napi_enabled(priv); @@ -942,12 +1110,49 @@ static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue) priv->tx_timeo_cnt++; } +static int gve_set_features(struct net_device *netdev, + netdev_features_t features) +{ + const netdev_features_t orig_features = netdev->features; + struct gve_priv *priv = netdev_priv(netdev); + int err; + + if ((netdev->features & NETIF_F_LRO) != (features & NETIF_F_LRO)) { + netdev->features ^= NETIF_F_LRO; + if (netif_carrier_ok(netdev)) { + /* To make this process as simple as possible we + * teardown the device, set the new configuration, + * and then bring the device up again. + */ + err = gve_close(netdev); + /* We have already tried to reset in close, just fail + * at this point. + */ + if (err) + goto err; + + err = gve_open(netdev); + if (err) + goto err; + } + } + + return 0; +err: + /* Reverts the change on error. */ + netdev->features = orig_features; + netif_err(priv, drv, netdev, + "Set features failed! !!! 
DISABLING ALL QUEUES !!!\n"); + return err; +} + static const struct net_device_ops gve_netdev_ops = { - .ndo_start_xmit = gve_tx, + .ndo_start_xmit = gve_start_xmit, .ndo_open = gve_open, .ndo_stop = gve_close, .ndo_get_stats64 = gve_get_stats, .ndo_tx_timeout = gve_tx_timeout, + .ndo_set_features = gve_set_features, }; static void gve_handle_status(struct gve_priv *priv, u32 status) @@ -991,6 +1196,15 @@ void gve_handle_report_stats(struct gve_priv *priv) /* tx stats */ if (priv->tx) { for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { + u32 last_completion = 0; + u32 tx_frames = 0; + + /* DQO doesn't currently support these metrics. */ + if (gve_is_gqi(priv)) { + last_completion = priv->tx[idx].done; + tx_frames = priv->tx[idx].req; + } + do { start = u64_stats_fetch_begin(&priv->tx[idx].statss); tx_bytes = priv->tx[idx].bytes_done; @@ -1007,7 +1221,7 @@ void gve_handle_report_stats(struct gve_priv *priv) }; stats[stats_idx++] = (struct stats) { .stat_name = cpu_to_be32(TX_FRAMES_SENT), - .value = cpu_to_be64(priv->tx[idx].req), + .value = cpu_to_be64(tx_frames), .queue_id = cpu_to_be32(idx), }; stats[stats_idx++] = (struct stats) { @@ -1017,7 +1231,7 @@ void gve_handle_report_stats(struct gve_priv *priv) }; stats[stats_idx++] = (struct stats) { .stat_name = cpu_to_be32(TX_LAST_COMPLETION_PROCESSED), - .value = cpu_to_be64(priv->tx[idx].done), + .value = cpu_to_be64(last_completion), .queue_id = cpu_to_be32(idx), }; } @@ -1085,7 +1299,7 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) if (skip_describe_device) goto setup_device; - priv->raw_addressing = false; + priv->queue_format = GVE_QUEUE_FORMAT_UNSPECIFIED; /* Get the initial information we need from the device */ err = gve_adminq_describe_device(priv); if (err) { @@ -1093,7 +1307,7 @@ static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) "Could not get device information: err=%d\n", err); goto err; } - if (priv->dev->max_mtu > PAGE_SIZE) { + if (gve_is_gqi(priv) && priv->dev->max_mtu > PAGE_SIZE) { priv->dev->max_mtu = PAGE_SIZE; err = gve_adminq_set_mtu(priv, priv->dev->mtu); if (err) { @@ -1255,7 +1469,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err = pci_enable_device(pdev); if (err) - return -ENXIO; + return err; err = pci_request_regions(pdev, "gvnic-cfg"); if (err) @@ -1263,19 +1477,12 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_master(pdev); - err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (err) { dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err); goto abort_with_pci_region; } - err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (err) { - dev_err(&pdev->dev, - "Failed to set consistent dma mask: err=%d\n", err); - goto abort_with_pci_region; - } - reg_bar = pci_iomap(pdev, GVE_REGISTER_BAR, 0); if (!reg_bar) { dev_err(&pdev->dev, "Failed to map pci bar!\n"); @@ -1292,19 +1499,25 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) gve_write_version(®_bar->driver_version); /* Get max queues to alloc etherdev */ - max_rx_queues = ioread32be(®_bar->max_tx_queues); - max_tx_queues = ioread32be(®_bar->max_rx_queues); + max_tx_queues = ioread32be(®_bar->max_tx_queues); + max_rx_queues = ioread32be(®_bar->max_rx_queues); /* Alloc and setup the netdev and priv */ dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues); if (!dev) { dev_err(&pdev->dev, "could not allocate 
netdev\n"); + err = -ENOMEM; goto abort_with_db_bar; } SET_NETDEV_DEV(dev, &pdev->dev); pci_set_drvdata(pdev, dev); dev->ethtool_ops = &gve_ethtool_ops; dev->netdev_ops = &gve_netdev_ops; - /* advertise features */ + + /* Set default and supported features. + * + * Features might be set in other locations as well (such as + * `gve_adminq_describe_device`). + */ dev->hw_features = NETIF_F_HIGHDMA; dev->hw_features |= NETIF_F_SG; dev->hw_features |= NETIF_F_HW_CSUM; @@ -1346,13 +1559,17 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err = register_netdev(dev); if (err) - goto abort_with_wq; + goto abort_with_gve_init; dev_info(&pdev->dev, "GVE version %s\n", gve_version_str); + dev_info(&pdev->dev, "GVE queue format %d\n", (int)priv->queue_format); gve_clear_probe_in_progress(priv); queue_work(priv->gve_wq, &priv->service_task); return 0; +abort_with_gve_init: + gve_teardown_priv_resources(priv); + abort_with_wq: destroy_workqueue(priv->gve_wq); @@ -1370,7 +1587,7 @@ abort_with_pci_region: abort_with_enabled: pci_disable_device(pdev); - return -ENXIO; + return err; } static void gve_remove(struct pci_dev *pdev) diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index bf123fe524c4..bb8261368250 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -1,21 +1,14 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. */ #include "gve.h" #include "gve_adminq.h" +#include "gve_utils.h" #include <linux/etherdevice.h> -static void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx) -{ - struct gve_notify_block *block = - &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_idx)]; - - block->rx = NULL; -} - static void gve_rx_free_buffer(struct device *dev, struct gve_rx_slot_page_info *page_info, union gve_rx_data_slot *data_slot) @@ -137,16 +130,6 @@ alloc_err: return err; } -static void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx) -{ - u32 ntfy_idx = gve_rx_idx_to_ntfy(priv, queue_idx); - struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; - struct gve_rx_ring *rx = &priv->rx[queue_idx]; - - block->rx = rx; - rx->ntfy_id = ntfy_idx; -} - static int gve_rx_alloc_ring(struct gve_priv *priv, int idx) { struct gve_rx_ring *rx = &priv->rx[idx]; @@ -165,7 +148,7 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx) slots = priv->rx_data_slot_cnt; rx->mask = slots - 1; - rx->data.raw_addressing = priv->raw_addressing; + rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT; /* alloc rx data ring */ bytes = sizeof(*rx->data.data_ring) * slots; @@ -255,7 +238,7 @@ int gve_rx_alloc_rings(struct gve_priv *priv) return err; } -void gve_rx_free_rings(struct gve_priv *priv) +void gve_rx_free_rings_gqi(struct gve_priv *priv) { int i; @@ -279,27 +262,6 @@ static enum pkt_hash_types gve_rss_type(__be16 pkt_flags) return PKT_HASH_TYPE_L2; } -static struct sk_buff *gve_rx_copy(struct net_device *dev, - struct napi_struct *napi, - struct gve_rx_slot_page_info *page_info, - u16 len) -{ - struct sk_buff *skb = napi_alloc_skb(napi, len); - void *va = page_info->page_address + GVE_RX_PAD + - (page_info->page_offset ? 
PAGE_SIZE / 2 : 0); - - if (unlikely(!skb)) - return NULL; - - __skb_put(skb, len); - - skb_copy_to_linear_data(skb, va, len); - - skb->protocol = eth_type_trans(skb, dev); - - return skb; -} - static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi, struct gve_rx_slot_page_info *page_info, u16 len) @@ -310,7 +272,7 @@ static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi, return NULL; skb_add_rx_frag(skb, 0, page_info->page, - (page_info->page_offset ? PAGE_SIZE / 2 : 0) + + page_info->page_offset + GVE_RX_PAD, len, PAGE_SIZE / 2); return skb; @@ -321,7 +283,7 @@ static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *sl const __be64 offset = cpu_to_be64(PAGE_SIZE / 2); /* "flip" to other packet buffer on this page */ - page_info->page_offset ^= 0x1; + page_info->page_offset ^= PAGE_SIZE / 2; *(slot_addr) ^= offset; } @@ -388,7 +350,7 @@ gve_rx_qpl(struct device *dev, struct net_device *netdev, gve_rx_flip_buff(page_info, &data_slot->qpl_offset); } } else { - skb = gve_rx_copy(netdev, napi, page_info, len); + skb = gve_rx_copy(netdev, napi, page_info, len, GVE_RX_PAD); if (skb) { u64_stats_update_begin(&rx->statss); rx->rx_copied_pkt++; @@ -430,7 +392,7 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc, if (len <= priv->rx_copybreak) { /* Just copy small packets */ - skb = gve_rx_copy(dev, napi, page_info, len); + skb = gve_rx_copy(dev, napi, page_info, len, GVE_RX_PAD); u64_stats_update_begin(&rx->statss); rx->rx_copied_pkt++; rx->rx_copybreak_pkt++; diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c new file mode 100644 index 000000000000..8500621b2cd4 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -0,0 +1,756 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. 
+ */ + +#include "gve.h" +#include "gve_dqo.h" +#include "gve_adminq.h" +#include "gve_utils.h" +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <net/ip6_checksum.h> +#include <net/ipv6.h> +#include <net/tcp.h> + +static int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs) +{ + return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias; +} + +static void gve_free_page_dqo(struct gve_priv *priv, + struct gve_rx_buf_state_dqo *bs) +{ + page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1); + gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr, + DMA_FROM_DEVICE); + bs->page_info.page = NULL; +} + +static struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_state_dqo *buf_state; + s16 buffer_id; + + buffer_id = rx->dqo.free_buf_states; + if (unlikely(buffer_id == -1)) + return NULL; + + buf_state = &rx->dqo.buf_states[buffer_id]; + + /* Remove buf_state from free list */ + rx->dqo.free_buf_states = buf_state->next; + + /* Point buf_state to itself to mark it as allocated */ + buf_state->next = buffer_id; + + return buf_state; +} + +static bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + return buf_state->next == buffer_id; +} + +static void gve_free_buf_state(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + buf_state->next = rx->dqo.free_buf_states; + rx->dqo.free_buf_states = buffer_id; +} + +static struct gve_rx_buf_state_dqo * +gve_dequeue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list) +{ + struct gve_rx_buf_state_dqo *buf_state; + s16 buffer_id; + + buffer_id = list->head; + if (unlikely(buffer_id == -1)) + return NULL; + + buf_state = &rx->dqo.buf_states[buffer_id]; + + /* Remove buf_state from list */ + list->head = buf_state->next; + if (buf_state->next == -1) + list->tail = -1; + + /* Point buf_state to itself to mark it as allocated */ + buf_state->next = buffer_id; + + return buf_state; +} + +static void gve_enqueue_buf_state(struct gve_rx_ring *rx, + struct gve_index_list *list, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + buf_state->next = -1; + + if (list->head == -1) { + list->head = buffer_id; + list->tail = buffer_id; + } else { + int tail = list->tail; + + rx->dqo.buf_states[tail].next = buffer_id; + list->tail = buffer_id; + } +} + +static struct gve_rx_buf_state_dqo * +gve_get_recycled_buf_state(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_state_dqo *buf_state; + int i; + + /* Recycled buf states are immediately usable. */ + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states); + if (likely(buf_state)) + return buf_state; + + if (unlikely(rx->dqo.used_buf_states.head == -1)) + return NULL; + + /* Used buf states are only usable when ref count reaches 0, which means + * no SKBs refer to them. + * + * Search a limited number before giving up. + */ + for (i = 0; i < 5; i++) { + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); + if (gve_buf_ref_cnt(buf_state) == 0) + return buf_state; + + gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); + } + + /* If there are no free buf states discard an entry from + * `used_buf_states` so it can be used. 
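
The recycling decisions above all reduce to one invariant: page_count() minus pagecnt_bias is the number of references still held by SKBs, so a buffer may only be reposted once that difference drops back to zero. A minimal sketch of the scheme, assuming a kernel context; buf_tracker and its helpers are illustrative names, not driver symbols.

#include <linux/kernel.h>
#include <linux/mm.h>

/* Sketch of the refcount-bias scheme behind the DQO buffer recycling
 * above: pre-charge the page refcount once, then only decrement a
 * private bias when a fragment is handed to an skb, so the hot path
 * never touches the atomic page refcount.
 */
struct buf_tracker {
        struct page *page;
        int pagecnt_bias;       /* references still owned by the driver */
};

static void buf_tracker_init(struct buf_tracker *t, struct page *page)
{
        t->page = page;                         /* page comes with one ref */
        page_ref_add(page, INT_MAX - 1);        /* pre-charge the refcount */
        t->pagecnt_bias = INT_MAX;
}

/* One pre-charged reference is transferred to the skb that now maps a
 * chunk of this page (the role gve_dec_pagecnt_bias() plays above).
 */
static void buf_tracker_give_ref(struct buf_tracker *t)
{
        t->pagecnt_bias--;
}

/* Equivalent of gve_buf_ref_cnt() == 0: no skb still references the
 * page, so any offset in it may be reposted to the buffer queue.
 */
static bool buf_is_idle(const struct buf_tracker *t)
{
        return page_count(t->page) - t->pagecnt_bias == 0;
}
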
+ */ + if (unlikely(rx->dqo.free_buf_states == -1)) { + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); + if (gve_buf_ref_cnt(buf_state) == 0) + return buf_state; + + gve_free_page_dqo(rx->gve, buf_state); + gve_free_buf_state(rx, buf_state); + } + + return NULL; +} + +static int gve_alloc_page_dqo(struct gve_priv *priv, + struct gve_rx_buf_state_dqo *buf_state) +{ + int err; + + err = gve_alloc_page(priv, &priv->pdev->dev, &buf_state->page_info.page, + &buf_state->addr, DMA_FROM_DEVICE); + if (err) + return err; + + buf_state->page_info.page_offset = 0; + buf_state->page_info.page_address = + page_address(buf_state->page_info.page); + buf_state->last_single_ref_offset = 0; + + /* The page already has 1 ref. */ + page_ref_add(buf_state->page_info.page, INT_MAX - 1); + buf_state->page_info.pagecnt_bias = INT_MAX; + + return 0; +} + +static void gve_rx_free_ring_dqo(struct gve_priv *priv, int idx) +{ + struct gve_rx_ring *rx = &priv->rx[idx]; + struct device *hdev = &priv->pdev->dev; + size_t completion_queue_slots; + size_t buffer_queue_slots; + size_t size; + int i; + + completion_queue_slots = rx->dqo.complq.mask + 1; + buffer_queue_slots = rx->dqo.bufq.mask + 1; + + gve_rx_remove_from_block(priv, idx); + + if (rx->q_resources) { + dma_free_coherent(hdev, sizeof(*rx->q_resources), + rx->q_resources, rx->q_resources_bus); + rx->q_resources = NULL; + } + + for (i = 0; i < rx->dqo.num_buf_states; i++) { + struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i]; + + if (bs->page_info.page) + gve_free_page_dqo(priv, bs); + } + + if (rx->dqo.bufq.desc_ring) { + size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots; + dma_free_coherent(hdev, size, rx->dqo.bufq.desc_ring, + rx->dqo.bufq.bus); + rx->dqo.bufq.desc_ring = NULL; + } + + if (rx->dqo.complq.desc_ring) { + size = sizeof(rx->dqo.complq.desc_ring[0]) * + completion_queue_slots; + dma_free_coherent(hdev, size, rx->dqo.complq.desc_ring, + rx->dqo.complq.bus); + rx->dqo.complq.desc_ring = NULL; + } + + kvfree(rx->dqo.buf_states); + rx->dqo.buf_states = NULL; + + netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx); +} + +static int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int idx) +{ + struct gve_rx_ring *rx = &priv->rx[idx]; + struct device *hdev = &priv->pdev->dev; + size_t size; + int i; + + const u32 buffer_queue_slots = + priv->options_dqo_rda.rx_buff_ring_entries; + const u32 completion_queue_slots = priv->rx_desc_cnt; + + netif_dbg(priv, drv, priv->dev, "allocating rx ring DQO\n"); + + memset(rx, 0, sizeof(*rx)); + rx->gve = priv; + rx->q_num = idx; + rx->dqo.bufq.mask = buffer_queue_slots - 1; + rx->dqo.complq.num_free_slots = completion_queue_slots; + rx->dqo.complq.mask = completion_queue_slots - 1; + rx->skb_head = NULL; + rx->skb_tail = NULL; + + rx->dqo.num_buf_states = min_t(s16, S16_MAX, buffer_queue_slots * 4); + rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states, + sizeof(rx->dqo.buf_states[0]), + GFP_KERNEL); + if (!rx->dqo.buf_states) + return -ENOMEM; + + /* Set up linked list of buffer IDs */ + for (i = 0; i < rx->dqo.num_buf_states - 1; i++) + rx->dqo.buf_states[i].next = i + 1; + + rx->dqo.buf_states[rx->dqo.num_buf_states - 1].next = -1; + rx->dqo.recycled_buf_states.head = -1; + rx->dqo.recycled_buf_states.tail = -1; + rx->dqo.used_buf_states.head = -1; + rx->dqo.used_buf_states.tail = -1; + + /* Allocate RX completion queue */ + size = sizeof(rx->dqo.complq.desc_ring[0]) * + completion_queue_slots; + rx->dqo.complq.desc_ring = + dma_alloc_coherent(hdev, size, 
&rx->dqo.complq.bus, GFP_KERNEL); + if (!rx->dqo.complq.desc_ring) + goto err; + + /* Allocate RX buffer queue */ + size = sizeof(rx->dqo.bufq.desc_ring[0]) * buffer_queue_slots; + rx->dqo.bufq.desc_ring = + dma_alloc_coherent(hdev, size, &rx->dqo.bufq.bus, GFP_KERNEL); + if (!rx->dqo.bufq.desc_ring) + goto err; + + rx->q_resources = dma_alloc_coherent(hdev, sizeof(*rx->q_resources), + &rx->q_resources_bus, GFP_KERNEL); + if (!rx->q_resources) + goto err; + + gve_rx_add_to_block(priv, idx); + + return 0; + +err: + gve_rx_free_ring_dqo(priv, idx); + return -ENOMEM; +} + +void gve_rx_write_doorbell_dqo(const struct gve_priv *priv, int queue_idx) +{ + const struct gve_rx_ring *rx = &priv->rx[queue_idx]; + u64 index = be32_to_cpu(rx->q_resources->db_index); + + iowrite32(rx->dqo.bufq.tail, &priv->db_bar2[index]); +} + +int gve_rx_alloc_rings_dqo(struct gve_priv *priv) +{ + int err = 0; + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + err = gve_rx_alloc_ring_dqo(priv, i); + if (err) { + netif_err(priv, drv, priv->dev, + "Failed to alloc rx ring=%d: err=%d\n", + i, err); + goto err; + } + } + + return 0; + +err: + for (i--; i >= 0; i--) + gve_rx_free_ring_dqo(priv, i); + + return err; +} + +void gve_rx_free_rings_dqo(struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_rx_free_ring_dqo(priv, i); +} + +void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) +{ + struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq; + struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq; + struct gve_priv *priv = rx->gve; + u32 num_avail_slots; + u32 num_full_slots; + u32 num_posted = 0; + + num_full_slots = (bufq->tail - bufq->head) & bufq->mask; + num_avail_slots = bufq->mask - num_full_slots; + + num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots); + while (num_posted < num_avail_slots) { + struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail]; + struct gve_rx_buf_state_dqo *buf_state; + + buf_state = gve_get_recycled_buf_state(rx); + if (unlikely(!buf_state)) { + buf_state = gve_alloc_buf_state(rx); + if (unlikely(!buf_state)) + break; + + if (unlikely(gve_alloc_page_dqo(priv, buf_state))) { + u64_stats_update_begin(&rx->statss); + rx->rx_buf_alloc_fail++; + u64_stats_update_end(&rx->statss); + gve_free_buf_state(rx, buf_state); + break; + } + } + + desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states); + desc->buf_addr = cpu_to_le64(buf_state->addr + + buf_state->page_info.page_offset); + + bufq->tail = (bufq->tail + 1) & bufq->mask; + complq->num_free_slots--; + num_posted++; + + if ((bufq->tail & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) + gve_rx_write_doorbell_dqo(priv, rx->q_num); + } + + rx->fill_cnt += num_posted; +} + +static void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + const int data_buffer_size = priv->data_buffer_size_dqo; + int pagecount; + + /* Can't reuse if we only fit one buffer per page */ + if (data_buffer_size * 2 > PAGE_SIZE) + goto mark_used; + + pagecount = gve_buf_ref_cnt(buf_state); + + /* Record the offset when we have a single remaining reference. + * + * When this happens, we know all of the other offsets of the page are + * usable. + */ + if (pagecount == 1) { + buf_state->last_single_ref_offset = + buf_state->page_info.page_offset; + } + + /* Use the next buffer sized chunk in the page. 
*/ + buf_state->page_info.page_offset += data_buffer_size; + buf_state->page_info.page_offset &= (PAGE_SIZE - 1); + + /* If we wrap around to the same offset without ever dropping to 1 + * reference, then we don't know if this offset was ever freed. + */ + if (buf_state->page_info.page_offset == + buf_state->last_single_ref_offset) { + goto mark_used; + } + + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); + return; + +mark_used: + gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); +} + +static void gve_rx_skb_csum(struct sk_buff *skb, + const struct gve_rx_compl_desc_dqo *desc, + struct gve_ptype ptype) +{ + skb->ip_summed = CHECKSUM_NONE; + + /* HW did not identify and process L3 and L4 headers. */ + if (unlikely(!desc->l3_l4_processed)) + return; + + if (ptype.l3_type == GVE_L3_TYPE_IPV4) { + if (unlikely(desc->csum_ip_err || desc->csum_external_ip_err)) + return; + } else if (ptype.l3_type == GVE_L3_TYPE_IPV6) { + /* Checksum should be skipped if this flag is set. */ + if (unlikely(desc->ipv6_ex_add)) + return; + } + + if (unlikely(desc->csum_l4_err)) + return; + + switch (ptype.l4_type) { + case GVE_L4_TYPE_TCP: + case GVE_L4_TYPE_UDP: + case GVE_L4_TYPE_ICMP: + case GVE_L4_TYPE_SCTP: + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + default: + break; + } +} + +static void gve_rx_skb_hash(struct sk_buff *skb, + const struct gve_rx_compl_desc_dqo *compl_desc, + struct gve_ptype ptype) +{ + enum pkt_hash_types hash_type = PKT_HASH_TYPE_L2; + + if (ptype.l4_type != GVE_L4_TYPE_UNKNOWN) + hash_type = PKT_HASH_TYPE_L4; + else if (ptype.l3_type != GVE_L3_TYPE_UNKNOWN) + hash_type = PKT_HASH_TYPE_L3; + + skb_set_hash(skb, le32_to_cpu(compl_desc->hash), hash_type); +} + +static void gve_rx_free_skb(struct gve_rx_ring *rx) +{ + if (!rx->skb_head) + return; + + dev_kfree_skb_any(rx->skb_head); + rx->skb_head = NULL; + rx->skb_tail = NULL; +} + +/* Chains multi skbs for single rx packet. + * Returns 0 if buffer is appended, -1 otherwise. + */ +static int gve_rx_append_frags(struct napi_struct *napi, + struct gve_rx_buf_state_dqo *buf_state, + u16 buf_len, struct gve_rx_ring *rx, + struct gve_priv *priv) +{ + int num_frags = skb_shinfo(rx->skb_tail)->nr_frags; + + if (unlikely(num_frags == MAX_SKB_FRAGS)) { + struct sk_buff *skb; + + skb = napi_alloc_skb(napi, 0); + if (!skb) + return -1; + + skb_shinfo(rx->skb_tail)->frag_list = skb; + rx->skb_tail = skb; + num_frags = 0; + } + if (rx->skb_tail != rx->skb_head) { + rx->skb_head->len += buf_len; + rx->skb_head->data_len += buf_len; + rx->skb_head->truesize += priv->data_buffer_size_dqo; + } + + skb_add_rx_frag(rx->skb_tail, num_frags, + buf_state->page_info.page, + buf_state->page_info.page_offset, + buf_len, priv->data_buffer_size_dqo); + gve_dec_pagecnt_bias(&buf_state->page_info); + + return 0; +} + +/* Returns 0 if descriptor is completed successfully. + * Returns -EINVAL if descriptor is invalid. + * Returns -ENOMEM if data cannot be copied to skb. 
+ */ +static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, + const struct gve_rx_compl_desc_dqo *compl_desc, + int queue_idx) +{ + const u16 buffer_id = le16_to_cpu(compl_desc->buf_id); + const bool eop = compl_desc->end_of_packet != 0; + struct gve_rx_buf_state_dqo *buf_state; + struct gve_priv *priv = rx->gve; + u16 buf_len; + + if (unlikely(buffer_id >= rx->dqo.num_buf_states)) { + net_err_ratelimited("%s: Invalid RX buffer_id=%u\n", + priv->dev->name, buffer_id); + return -EINVAL; + } + buf_state = &rx->dqo.buf_states[buffer_id]; + if (unlikely(!gve_buf_state_is_allocated(rx, buf_state))) { + net_err_ratelimited("%s: RX buffer_id is not allocated: %u\n", + priv->dev->name, buffer_id); + return -EINVAL; + } + + if (unlikely(compl_desc->rx_error)) { + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, + buf_state); + return -EINVAL; + } + + buf_len = compl_desc->packet_len; + + /* Page might have not been used for awhile and was likely last written + * by a different thread. + */ + prefetch(buf_state->page_info.page); + + /* Sync the portion of dma buffer for CPU to read. */ + dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr, + buf_state->page_info.page_offset, + buf_len, DMA_FROM_DEVICE); + + /* Append to current skb if one exists. */ + if (rx->skb_head) { + if (unlikely(gve_rx_append_frags(napi, buf_state, buf_len, rx, + priv)) != 0) { + goto error; + } + + gve_try_recycle_buf(priv, rx, buf_state); + return 0; + } + + if (eop && buf_len <= priv->rx_copybreak) { + rx->skb_head = gve_rx_copy(priv->dev, napi, + &buf_state->page_info, buf_len, 0); + if (unlikely(!rx->skb_head)) + goto error; + rx->skb_tail = rx->skb_head; + + u64_stats_update_begin(&rx->statss); + rx->rx_copied_pkt++; + rx->rx_copybreak_pkt++; + u64_stats_update_end(&rx->statss); + + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, + buf_state); + return 0; + } + + rx->skb_head = napi_get_frags(napi); + if (unlikely(!rx->skb_head)) + goto error; + rx->skb_tail = rx->skb_head; + + skb_add_rx_frag(rx->skb_head, 0, buf_state->page_info.page, + buf_state->page_info.page_offset, buf_len, + priv->data_buffer_size_dqo); + gve_dec_pagecnt_bias(&buf_state->page_info); + + gve_try_recycle_buf(priv, rx, buf_state); + return 0; + +error: + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); + return -ENOMEM; +} + +static int gve_rx_complete_rsc(struct sk_buff *skb, + const struct gve_rx_compl_desc_dqo *desc, + struct gve_ptype ptype) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + /* Only TCP is supported right now. */ + if (ptype.l4_type != GVE_L4_TYPE_TCP) + return -EINVAL; + + switch (ptype.l3_type) { + case GVE_L3_TYPE_IPV4: + shinfo->gso_type = SKB_GSO_TCPV4; + break; + case GVE_L3_TYPE_IPV6: + shinfo->gso_type = SKB_GSO_TCPV6; + break; + default: + return -EINVAL; + } + + shinfo->gso_size = le16_to_cpu(desc->rsc_seg_len); + return 0; +} + +/* Returns 0 if skb is completed successfully, -1 otherwise. */ +static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi, + const struct gve_rx_compl_desc_dqo *desc, + netdev_features_t feat) +{ + struct gve_ptype ptype = + rx->gve->ptype_lut_dqo->ptypes[desc->packet_type]; + int err; + + skb_record_rx_queue(rx->skb_head, rx->q_num); + + if (feat & NETIF_F_RXHASH) + gve_rx_skb_hash(rx->skb_head, desc, ptype); + + if (feat & NETIF_F_RXCSUM) + gve_rx_skb_csum(rx->skb_head, desc, ptype); + + /* RSC packets must set gso_size otherwise the TCP stack will complain + * that packets are larger than MTU. 
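
The copybreak branch of gve_rx_dqo() above is the usual small-packet trade-off: completions no larger than rx_copybreak are copied into a fresh linear skb so the data buffer can be recycled immediately, while larger completions are attached zero-copy as page fragments and later consumed through napi_gro_frags(). A compressed sketch of that decision, assuming the caller already owns one page reference to hand off; toy_build_rx_skb and its truesize parameter are illustrative, not driver API.

#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Illustrative copybreak decision, not driver code. Small packets are
 * copied so the RX buffer can be reposted at once; large packets are
 * attached as a page fragment, and the caller's page reference travels
 * with the skb (skb_add_rx_frag() does not take its own reference).
 */
static struct sk_buff *toy_build_rx_skb(struct napi_struct *napi,
                                        struct page *page, u32 offset,
                                        u16 len, u16 copybreak,
                                        u32 truesize)
{
        struct sk_buff *skb;

        if (len <= copybreak) {
                skb = napi_alloc_skb(napi, len);
                if (unlikely(!skb))
                        return NULL;
                skb_put_data(skb, page_address(page) + offset, len);
                return skb;     /* buffer may be recycled immediately */
        }

        skb = napi_get_frags(napi);
        if (unlikely(!skb))
                return NULL;
        skb_add_rx_frag(skb, 0, page, offset, len, truesize);
        return skb;             /* later handed to napi_gro_frags() */
}
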
+ */ + if (desc->rsc) { + err = gve_rx_complete_rsc(rx->skb_head, desc, ptype); + if (err < 0) + return err; + } + + if (skb_headlen(rx->skb_head) == 0) + napi_gro_frags(napi); + else + napi_gro_receive(napi, rx->skb_head); + + return 0; +} + +int gve_rx_poll_dqo(struct gve_notify_block *block, int budget) +{ + struct napi_struct *napi = &block->napi; + netdev_features_t feat = napi->dev->features; + + struct gve_rx_ring *rx = block->rx; + struct gve_rx_compl_queue_dqo *complq = &rx->dqo.complq; + + u32 work_done = 0; + u64 bytes = 0; + int err; + + while (work_done < budget) { + struct gve_rx_compl_desc_dqo *compl_desc = + &complq->desc_ring[complq->head]; + u32 pkt_bytes; + + /* No more new packets */ + if (compl_desc->generation == complq->cur_gen_bit) + break; + + /* Prefetch the next two descriptors. */ + prefetch(&complq->desc_ring[(complq->head + 1) & complq->mask]); + prefetch(&complq->desc_ring[(complq->head + 2) & complq->mask]); + + /* Do not read data until we own the descriptor */ + dma_rmb(); + + err = gve_rx_dqo(napi, rx, compl_desc, rx->q_num); + if (err < 0) { + gve_rx_free_skb(rx); + u64_stats_update_begin(&rx->statss); + if (err == -ENOMEM) + rx->rx_skb_alloc_fail++; + else if (err == -EINVAL) + rx->rx_desc_err_dropped_pkt++; + u64_stats_update_end(&rx->statss); + } + + complq->head = (complq->head + 1) & complq->mask; + complq->num_free_slots++; + + /* When the ring wraps, the generation bit is flipped. */ + complq->cur_gen_bit ^= (complq->head == 0); + + /* Receiving a completion means we have space to post another + * buffer on the buffer queue. + */ + { + struct gve_rx_buf_queue_dqo *bufq = &rx->dqo.bufq; + + bufq->head = (bufq->head + 1) & bufq->mask; + } + + /* Free running counter of completed descriptors */ + rx->cnt++; + + if (!rx->skb_head) + continue; + + if (!compl_desc->end_of_packet) + continue; + + work_done++; + pkt_bytes = rx->skb_head->len; + /* The ethernet header (first ETH_HLEN bytes) is snipped off + * by eth_type_trans. + */ + if (skb_headlen(rx->skb_head)) + pkt_bytes += ETH_HLEN; + + /* gve_rx_complete_skb() will consume skb if successful */ + if (gve_rx_complete_skb(rx, napi, compl_desc, feat) != 0) { + gve_rx_free_skb(rx); + u64_stats_update_begin(&rx->statss); + rx->rx_desc_err_dropped_pkt++; + u64_stats_update_end(&rx->statss); + continue; + } + + bytes += pkt_bytes; + rx->skb_head = NULL; + rx->skb_tail = NULL; + } + + gve_rx_post_buffers_dqo(rx); + + u64_stats_update_begin(&rx->statss); + rx->rpackets += work_done; + rx->rbytes += bytes; + u64_stats_update_end(&rx->statss); + + return work_done; +} diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 6938f3a939d6..665ac795a1ad 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -1,11 +1,12 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Google virtual Ethernet (gve) driver * - * Copyright (C) 2015-2019 Google, Inc. + * Copyright (C) 2015-2021 Google, Inc. 
*/ #include "gve.h" #include "gve_adminq.h" +#include "gve_utils.h" #include <linux/ip.h> #include <linux/tcp.h> #include <linux/vmalloc.h> @@ -131,14 +132,6 @@ static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes) atomic_add(bytes, &fifo->available); } -static void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx) -{ - struct gve_notify_block *block = - &priv->ntfy_blocks[gve_tx_idx_to_ntfy(priv, queue_idx)]; - - block->tx = NULL; -} - static int gve_clean_tx_done(struct gve_priv *priv, struct gve_tx_ring *tx, u32 to_do, bool try_to_wake); @@ -174,16 +167,6 @@ static void gve_tx_free_ring(struct gve_priv *priv, int idx) netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx); } -static void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx) -{ - int ntfy_idx = gve_tx_idx_to_ntfy(priv, queue_idx); - struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; - struct gve_tx_ring *tx = &priv->tx[queue_idx]; - - block->tx = tx; - tx->ntfy_id = ntfy_idx; -} - static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) { struct gve_tx_ring *tx = &priv->tx[idx]; @@ -208,14 +191,15 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) if (!tx->desc) goto abort_with_info; - tx->raw_addressing = priv->raw_addressing; + tx->raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT; tx->dev = &priv->pdev->dev; if (!tx->raw_addressing) { tx->tx_fifo.qpl = gve_assign_tx_qpl(priv); - + if (!tx->tx_fifo.qpl) + goto abort_with_desc; /* map Tx FIFO */ if (gve_tx_fifo_init(priv, &tx->tx_fifo)) - goto abort_with_desc; + goto abort_with_qpl; } tx->q_resources = @@ -236,6 +220,9 @@ static int gve_tx_alloc_ring(struct gve_priv *priv, int idx) abort_with_fifo: if (!tx->raw_addressing) gve_tx_fifo_release(priv, &tx->tx_fifo); +abort_with_qpl: + if (!tx->raw_addressing) + gve_unassign_qpl(priv, tx->tx_fifo.qpl->id); abort_with_desc: dma_free_coherent(hdev, bytes, tx->desc, tx->bus); tx->desc = NULL; @@ -269,7 +256,7 @@ int gve_tx_alloc_rings(struct gve_priv *priv) return err; } -void gve_tx_free_rings(struct gve_priv *priv) +void gve_tx_free_rings_gqi(struct gve_priv *priv) { int i; @@ -589,7 +576,7 @@ netdev_tx_t gve_tx(struct sk_buff *skb, struct net_device *dev) struct gve_tx_ring *tx; int nsegs; - WARN(skb_get_queue_mapping(skb) > priv->tx_cfg.num_queues, + WARN(skb_get_queue_mapping(skb) >= priv->tx_cfg.num_queues, "skb queue index out of range"); tx = &priv->tx[skb_get_queue_mapping(skb)]; if (unlikely(gve_maybe_stop_tx(tx, skb))) { diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c new file mode 100644 index 000000000000..05ddb6a75c38 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c @@ -0,0 +1,1030 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#include "gve.h" +#include "gve_adminq.h" +#include "gve_utils.h" +#include "gve_dqo.h" +#include <linux/tcp.h> +#include <linux/slab.h> +#include <linux/skbuff.h> + +/* Returns true if a gve_tx_pending_packet_dqo object is available. */ +static bool gve_has_pending_packet(struct gve_tx_ring *tx) +{ + /* Check TX path's list. */ + if (tx->dqo_tx.free_pending_packets != -1) + return true; + + /* Check completion handler's list. 
*/ + if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1) + return true; + + return false; +} + +static struct gve_tx_pending_packet_dqo * +gve_alloc_pending_packet(struct gve_tx_ring *tx) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + s16 index; + + index = tx->dqo_tx.free_pending_packets; + + /* No pending_packets available, try to steal the list from the + * completion handler. + */ + if (unlikely(index == -1)) { + tx->dqo_tx.free_pending_packets = + atomic_xchg(&tx->dqo_compl.free_pending_packets, -1); + index = tx->dqo_tx.free_pending_packets; + + if (unlikely(index == -1)) + return NULL; + } + + pending_packet = &tx->dqo.pending_packets[index]; + + /* Remove pending_packet from free list */ + tx->dqo_tx.free_pending_packets = pending_packet->next; + pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; + + return pending_packet; +} + +static void +gve_free_pending_packet(struct gve_tx_ring *tx, + struct gve_tx_pending_packet_dqo *pending_packet) +{ + s16 index = pending_packet - tx->dqo.pending_packets; + + pending_packet->state = GVE_PACKET_STATE_UNALLOCATED; + while (true) { + s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets); + + pending_packet->next = old_head; + if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets, + old_head, index) == old_head) { + break; + } + } +} + +/* gve_tx_free_desc - Cleans up all pending tx requests and buffers. + */ +static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx) +{ + int i; + + for (i = 0; i < tx->dqo.num_pending_packets; i++) { + struct gve_tx_pending_packet_dqo *cur_state = + &tx->dqo.pending_packets[i]; + int j; + + for (j = 0; j < cur_state->num_bufs; j++) { + struct gve_tx_dma_buf *buf = &cur_state->bufs[j]; + + if (j == 0) { + dma_unmap_single(tx->dev, + dma_unmap_addr(buf, dma), + dma_unmap_len(buf, len), + DMA_TO_DEVICE); + } else { + dma_unmap_page(tx->dev, + dma_unmap_addr(buf, dma), + dma_unmap_len(buf, len), + DMA_TO_DEVICE); + } + } + if (cur_state->skb) { + dev_consume_skb_any(cur_state->skb); + cur_state->skb = NULL; + } + } +} + +static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx) +{ + struct gve_tx_ring *tx = &priv->tx[idx]; + struct device *hdev = &priv->pdev->dev; + size_t bytes; + + gve_tx_remove_from_block(priv, idx); + + if (tx->q_resources) { + dma_free_coherent(hdev, sizeof(*tx->q_resources), + tx->q_resources, tx->q_resources_bus); + tx->q_resources = NULL; + } + + if (tx->dqo.compl_ring) { + bytes = sizeof(tx->dqo.compl_ring[0]) * + (tx->dqo.complq_mask + 1); + dma_free_coherent(hdev, bytes, tx->dqo.compl_ring, + tx->complq_bus_dqo); + tx->dqo.compl_ring = NULL; + } + + if (tx->dqo.tx_ring) { + bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1); + dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus); + tx->dqo.tx_ring = NULL; + } + + kvfree(tx->dqo.pending_packets); + tx->dqo.pending_packets = NULL; + + netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx); +} + +static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx) +{ + struct gve_tx_ring *tx = &priv->tx[idx]; + struct device *hdev = &priv->pdev->dev; + int num_pending_packets; + size_t bytes; + int i; + + memset(tx, 0, sizeof(*tx)); + tx->q_num = idx; + tx->dev = &priv->pdev->dev; + tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx); + atomic_set_release(&tx->dqo_compl.hw_tx_head, 0); + + /* Queue sizes must be a power of 2 */ + tx->mask = priv->tx_desc_cnt - 1; + tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1; + + /* The max number of 
pending packets determines the maximum number of + * descriptors which maybe written to the completion queue. + * + * We must set the number small enough to make sure we never overrun the + * completion queue. + */ + num_pending_packets = tx->dqo.complq_mask + 1; + + /* Reserve space for descriptor completions, which will be reported at + * most every GVE_TX_MIN_RE_INTERVAL packets. + */ + num_pending_packets -= + (tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL; + + /* Each packet may have at most 2 buffer completions if it receives both + * a miss and reinjection completion. + */ + num_pending_packets /= 2; + + tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX); + tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets, + sizeof(tx->dqo.pending_packets[0]), + GFP_KERNEL); + if (!tx->dqo.pending_packets) + goto err; + + /* Set up linked list of pending packets */ + for (i = 0; i < tx->dqo.num_pending_packets - 1; i++) + tx->dqo.pending_packets[i].next = i + 1; + + tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1; + atomic_set_release(&tx->dqo_compl.free_pending_packets, -1); + tx->dqo_compl.miss_completions.head = -1; + tx->dqo_compl.miss_completions.tail = -1; + tx->dqo_compl.timed_out_completions.head = -1; + tx->dqo_compl.timed_out_completions.tail = -1; + + bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1); + tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL); + if (!tx->dqo.tx_ring) + goto err; + + bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1); + tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes, + &tx->complq_bus_dqo, + GFP_KERNEL); + if (!tx->dqo.compl_ring) + goto err; + + tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources), + &tx->q_resources_bus, GFP_KERNEL); + if (!tx->q_resources) + goto err; + + gve_tx_add_to_block(priv, idx); + + return 0; + +err: + gve_tx_free_ring_dqo(priv, idx); + return -ENOMEM; +} + +int gve_tx_alloc_rings_dqo(struct gve_priv *priv) +{ + int err = 0; + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + err = gve_tx_alloc_ring_dqo(priv, i); + if (err) { + netif_err(priv, drv, priv->dev, + "Failed to alloc tx ring=%d: err=%d\n", + i, err); + goto err; + } + } + + return 0; + +err: + for (i--; i >= 0; i--) + gve_tx_free_ring_dqo(priv, i); + + return err; +} + +void gve_tx_free_rings_dqo(struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + struct gve_tx_ring *tx = &priv->tx[i]; + + gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL); + netdev_tx_reset_queue(tx->netdev_txq); + gve_tx_clean_pending_packets(tx); + + gve_tx_free_ring_dqo(priv, i); + } +} + +/* Returns the number of slots available in the ring */ +static u32 num_avail_tx_slots(const struct gve_tx_ring *tx) +{ + u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask; + + return tx->mask - num_used; +} + +/* Stops the queue if available descriptors is less than 'count'. + * Return: 0 if stop is not required. 
+ */ +static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count) +{ + if (likely(gve_has_pending_packet(tx) && + num_avail_tx_slots(tx) >= count)) + return 0; + + /* Update cached TX head pointer */ + tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head); + + if (likely(gve_has_pending_packet(tx) && + num_avail_tx_slots(tx) >= count)) + return 0; + + /* No space, so stop the queue */ + tx->stop_queue++; + netif_tx_stop_queue(tx->netdev_txq); + + /* Sync with restarting queue in `gve_tx_poll_dqo()` */ + mb(); + + /* After stopping queue, check if we can transmit again in order to + * avoid TOCTOU bug. + */ + tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head); + + if (likely(!gve_has_pending_packet(tx) || + num_avail_tx_slots(tx) < count)) + return -EBUSY; + + netif_tx_start_queue(tx->netdev_txq); + tx->wake_queue++; + return 0; +} + +static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb, + struct gve_tx_metadata_dqo *metadata) +{ + memset(metadata, 0, sizeof(*metadata)); + metadata->version = GVE_TX_METADATA_VERSION_DQO; + + if (skb->l4_hash) { + u16 path_hash = skb->hash ^ (skb->hash >> 16); + + path_hash &= (1 << 15) - 1; + if (unlikely(path_hash == 0)) + path_hash = ~path_hash; + + metadata->path_hash = path_hash; + } +} + +static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx, + struct sk_buff *skb, u32 len, u64 addr, + s16 compl_tag, bool eop, bool is_gso) +{ + const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL; + + while (len > 0) { + struct gve_tx_pkt_desc_dqo *desc = + &tx->dqo.tx_ring[*desc_idx].pkt; + u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO); + bool cur_eop = eop && cur_len == len; + + *desc = (struct gve_tx_pkt_desc_dqo){ + .buf_addr = cpu_to_le64(addr), + .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, + .end_of_packet = cur_eop, + .checksum_offload_enable = checksum_offload_en, + .compl_tag = cpu_to_le16(compl_tag), + .buf_size = cur_len, + }; + + addr += cur_len; + len -= cur_len; + *desc_idx = (*desc_idx + 1) & tx->mask; + } +} + +/* Validates and prepares `skb` for TSO. + * + * Returns header length, or < 0 if invalid. + */ +static int gve_prep_tso(struct sk_buff *skb) +{ + struct tcphdr *tcp; + int header_len; + u32 paylen; + int err; + + /* Note: HW requires MSS (gso_size) to be <= 9728 and the total length + * of the TSO to be <= 262143. + * + * However, we don't validate these because: + * - Hypervisor enforces a limit of 9K MTU + * - Kernel will not produce a TSO larger than 64k + */ + + if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO)) + return -1; + + /* Needed because we will modify header. */ + err = skb_cow_head(skb, 0); + if (err < 0) + return err; + + tcp = tcp_hdr(skb); + + /* Remove payload length from checksum. */ + paylen = skb->len - skb_transport_offset(skb); + + switch (skb_shinfo(skb)->gso_type) { + case SKB_GSO_TCPV4: + case SKB_GSO_TCPV6: + csum_replace_by_diff(&tcp->check, + (__force __wsum)htonl(paylen)); + + /* Compute length of segmentation header. 
*/ + header_len = skb_transport_offset(skb) + tcp_hdrlen(skb); + break; + default: + return -EINVAL; + } + + if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO)) + return -EINVAL; + + return header_len; +} + +static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, + const struct sk_buff *skb, + const struct gve_tx_metadata_dqo *metadata, + int header_len) +{ + *desc = (struct gve_tx_tso_context_desc_dqo){ + .header_len = header_len, + .cmd_dtype = { + .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, + .tso = 1, + }, + .flex0 = metadata->bytes[0], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + }; + desc->tso_total_len = skb->len - header_len; + desc->mss = skb_shinfo(skb)->gso_size; +} + +static void +gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, + const struct gve_tx_metadata_dqo *metadata) +{ + *desc = (struct gve_tx_general_context_desc_dqo){ + .flex0 = metadata->bytes[0], + .flex1 = metadata->bytes[1], + .flex2 = metadata->bytes[2], + .flex3 = metadata->bytes[3], + .flex4 = metadata->bytes[4], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, + }; +} + +/* Returns 0 on success, or < 0 on error. + * + * Before this function is called, the caller must ensure + * gve_has_pending_packet(tx) returns true. + */ +static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx, + struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const bool is_gso = skb_is_gso(skb); + u32 desc_idx = tx->dqo_tx.tail; + + struct gve_tx_pending_packet_dqo *pending_packet; + struct gve_tx_metadata_dqo metadata; + s16 completion_tag; + int i; + + pending_packet = gve_alloc_pending_packet(tx); + pending_packet->skb = skb; + pending_packet->num_bufs = 0; + completion_tag = pending_packet - tx->dqo.pending_packets; + + gve_extract_tx_metadata_dqo(skb, &metadata); + if (is_gso) { + int header_len = gve_prep_tso(skb); + + if (unlikely(header_len < 0)) + goto err; + + gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx, + skb, &metadata, header_len); + desc_idx = (desc_idx + 1) & tx->mask; + } + + gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx, + &metadata); + desc_idx = (desc_idx + 1) & tx->mask; + + /* Note: HW requires that the size of a non-TSO packet be within the + * range of [17, 9728]. + * + * We don't double check because + * - We limited `netdev->min_mtu` to ETH_MIN_MTU. + * - Hypervisor won't allow MTU larger than 9216. 
+ */ + + /* Map the linear portion of skb */ + { + struct gve_tx_dma_buf *buf = + &pending_packet->bufs[pending_packet->num_bufs]; + u32 len = skb_headlen(skb); + dma_addr_t addr; + + addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dev, addr))) + goto err; + + dma_unmap_len_set(buf, len, len); + dma_unmap_addr_set(buf, dma, addr); + ++pending_packet->num_bufs; + + gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr, + completion_tag, + /*eop=*/shinfo->nr_frags == 0, is_gso); + } + + for (i = 0; i < shinfo->nr_frags; i++) { + struct gve_tx_dma_buf *buf = + &pending_packet->bufs[pending_packet->num_bufs]; + const skb_frag_t *frag = &shinfo->frags[i]; + bool is_eop = i == (shinfo->nr_frags - 1); + u32 len = skb_frag_size(frag); + dma_addr_t addr; + + addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx->dev, addr))) + goto err; + + dma_unmap_len_set(buf, len, len); + dma_unmap_addr_set(buf, dma, addr); + ++pending_packet->num_bufs; + + gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr, + completion_tag, is_eop, is_gso); + } + + /* Commit the changes to our state */ + tx->dqo_tx.tail = desc_idx; + + /* Request a descriptor completion on the last descriptor of the + * packet if we are allowed to by the HW enforced interval. + */ + { + u32 last_desc_idx = (desc_idx - 1) & tx->mask; + u32 last_report_event_interval = + (last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask; + + if (unlikely(last_report_event_interval >= + GVE_TX_MIN_RE_INTERVAL)) { + tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true; + tx->dqo_tx.last_re_idx = last_desc_idx; + } + } + + return 0; + +err: + for (i = 0; i < pending_packet->num_bufs; i++) { + struct gve_tx_dma_buf *buf = &pending_packet->bufs[i]; + + if (i == 0) { + dma_unmap_single(tx->dev, dma_unmap_addr(buf, dma), + dma_unmap_len(buf, len), + DMA_TO_DEVICE); + } else { + dma_unmap_page(tx->dev, dma_unmap_addr(buf, dma), + dma_unmap_len(buf, len), DMA_TO_DEVICE); + } + } + + pending_packet->skb = NULL; + pending_packet->num_bufs = 0; + gve_free_pending_packet(tx, pending_packet); + + return -1; +} + +static int gve_num_descs_per_buf(size_t size) +{ + return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO); +} + +static int gve_num_buffer_descs_needed(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + int num_descs; + int i; + + num_descs = gve_num_descs_per_buf(skb_headlen(skb)); + + for (i = 0; i < shinfo->nr_frags; i++) { + unsigned int frag_size = skb_frag_size(&shinfo->frags[i]); + + num_descs += gve_num_descs_per_buf(frag_size); + } + + return num_descs; +} + +/* Returns true if HW is capable of sending TSO represented by `skb`. + * + * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers. + * - The header is counted as one buffer for every single segment. + * - A buffer which is split between two segments is counted for both. + * - If a buffer contains both header and payload, it is counted as two buffers. 
+ */ +static bool gve_can_send_tso(const struct sk_buff *skb) +{ + const int header_len = skb_checksum_start_offset(skb) + tcp_hdrlen(skb); + const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1; + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const int gso_size = shinfo->gso_size; + int cur_seg_num_bufs; + int cur_seg_size; + int i; + + cur_seg_size = skb_headlen(skb) - header_len; + cur_seg_num_bufs = cur_seg_size > 0; + + for (i = 0; i < shinfo->nr_frags; i++) { + if (cur_seg_size >= gso_size) { + cur_seg_size %= gso_size; + cur_seg_num_bufs = cur_seg_size > 0; + } + + if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg)) + return false; + + cur_seg_size += skb_frag_size(&shinfo->frags[i]); + } + + return true; +} + +/* Attempt to transmit specified SKB. + * + * Returns 0 if the SKB was transmitted or dropped. + * Returns -1 if there is not currently enough space to transmit the SKB. + */ +static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx, + struct sk_buff *skb) +{ + int num_buffer_descs; + int total_num_descs; + + if (skb_is_gso(skb)) { + /* If TSO doesn't meet HW requirements, attempt to linearize the + * packet. + */ + if (unlikely(!gve_can_send_tso(skb) && + skb_linearize(skb) < 0)) { + net_err_ratelimited("%s: Failed to transmit TSO packet\n", + priv->dev->name); + goto drop; + } + + num_buffer_descs = gve_num_buffer_descs_needed(skb); + } else { + num_buffer_descs = gve_num_buffer_descs_needed(skb); + + if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) { + if (unlikely(skb_linearize(skb) < 0)) + goto drop; + + num_buffer_descs = 1; + } + } + + /* Metadata + (optional TSO) + data descriptors. */ + total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs; + if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs + + GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) { + return -1; + } + + if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0)) + goto drop; + + netdev_tx_sent_queue(tx->netdev_txq, skb->len); + skb_tx_timestamp(skb); + return 0; + +drop: + tx->dropped_pkt++; + dev_kfree_skb_any(skb); + return 0; +} + +/* Transmit a given skb and ring the doorbell. */ +netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev) +{ + struct gve_priv *priv = netdev_priv(dev); + struct gve_tx_ring *tx; + + tx = &priv->tx[skb_get_queue_mapping(skb)]; + if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) { + /* We need to ring the txq doorbell -- we have stopped the Tx + * queue for want of resources, but prior calls to gve_tx() + * may have added descriptors without ringing the doorbell. 
+ */ + gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); + return NETDEV_TX_BUSY; + } + + if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more()) + return NETDEV_TX_OK; + + gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); + return NETDEV_TX_OK; +} + +static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list, + struct gve_tx_pending_packet_dqo *pending_packet) +{ + s16 old_tail, index; + + index = pending_packet - tx->dqo.pending_packets; + old_tail = list->tail; + list->tail = index; + if (old_tail == -1) + list->head = index; + else + tx->dqo.pending_packets[old_tail].next = index; + + pending_packet->next = -1; + pending_packet->prev = old_tail; +} + +static void remove_from_list(struct gve_tx_ring *tx, + struct gve_index_list *list, + struct gve_tx_pending_packet_dqo *pending_packet) +{ + s16 prev_index, next_index; + + prev_index = pending_packet->prev; + next_index = pending_packet->next; + + if (prev_index == -1) { + /* Node is head */ + list->head = next_index; + } else { + tx->dqo.pending_packets[prev_index].next = next_index; + } + if (next_index == -1) { + /* Node is tail */ + list->tail = prev_index; + } else { + tx->dqo.pending_packets[next_index].prev = prev_index; + } +} + +static void gve_unmap_packet(struct device *dev, + struct gve_tx_pending_packet_dqo *pending_packet) +{ + struct gve_tx_dma_buf *buf; + int i; + + /* SKB linear portion is guaranteed to be mapped */ + buf = &pending_packet->bufs[0]; + dma_unmap_single(dev, dma_unmap_addr(buf, dma), + dma_unmap_len(buf, len), DMA_TO_DEVICE); + for (i = 1; i < pending_packet->num_bufs; i++) { + buf = &pending_packet->bufs[i]; + dma_unmap_page(dev, dma_unmap_addr(buf, dma), + dma_unmap_len(buf, len), DMA_TO_DEVICE); + } + pending_packet->num_bufs = 0; +} + +/* Completion types and expected behavior: + * No Miss compl + Packet compl = Packet completed normally. + * Miss compl + Re-inject compl = Packet completed normally. + * No Miss compl + Re-inject compl = Skipped i.e. packet not completed. + * Miss compl + Packet compl = Skipped i.e. packet not completed. + */ +static void gve_handle_packet_completion(struct gve_priv *priv, + struct gve_tx_ring *tx, bool is_napi, + u16 compl_tag, u64 *bytes, u64 *pkts, + bool is_reinjection) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + + if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) { + net_err_ratelimited("%s: Invalid TX completion tag: %d\n", + priv->dev->name, (int)compl_tag); + return; + } + + pending_packet = &tx->dqo.pending_packets[compl_tag]; + + if (unlikely(is_reinjection)) { + if (unlikely(pending_packet->state == + GVE_PACKET_STATE_TIMED_OUT_COMPL)) { + net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n", + priv->dev->name, (int)compl_tag); + /* Packet was already completed as a result of timeout, + * so just remove from list and free pending packet. + */ + remove_from_list(tx, + &tx->dqo_compl.timed_out_completions, + pending_packet); + gve_free_pending_packet(tx, pending_packet); + return; + } + if (unlikely(pending_packet->state != + GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) { + /* No outstanding miss completion but packet allocated + * implies packet receives a re-injection completion + * without a a prior miss completion. Return without + * completing the packet. 
+			 */
+			net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
+					    priv->dev->name, (int)compl_tag);
+			return;
+		}
+		remove_from_list(tx, &tx->dqo_compl.miss_completions,
+				 pending_packet);
+	} else {
+		/* Packet is allocated but not a pending data completion. */
+		if (unlikely(pending_packet->state !=
+			     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
+			net_err_ratelimited("%s: No pending data completion: %d\n",
+					    priv->dev->name, (int)compl_tag);
+			return;
+		}
+	}
+	gve_unmap_packet(tx->dev, pending_packet);
+
+	*bytes += pending_packet->skb->len;
+	(*pkts)++;
+	napi_consume_skb(pending_packet->skb, is_napi);
+	pending_packet->skb = NULL;
+	gve_free_pending_packet(tx, pending_packet);
+}
+
+static void gve_handle_miss_completion(struct gve_priv *priv,
+				       struct gve_tx_ring *tx, u16 compl_tag,
+				       u64 *bytes, u64 *pkts)
+{
+	struct gve_tx_pending_packet_dqo *pending_packet;
+
+	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
+		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
+				    priv->dev->name, (int)compl_tag);
+		return;
+	}
+
+	pending_packet = &tx->dqo.pending_packets[compl_tag];
+	if (unlikely(pending_packet->state !=
+		     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
+		net_err_ratelimited("%s: Unexpected packet state: %d for completion tag: %d\n",
+				    priv->dev->name, (int)pending_packet->state,
+				    (int)compl_tag);
+		return;
+	}
+
+	pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
+	/* jiffies can wrap around but time comparisons handle overflow. */
+	pending_packet->timeout_jiffies =
+			jiffies +
+			msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
+					 MSEC_PER_SEC);
+	add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);
+
+	*bytes += pending_packet->skb->len;
+	(*pkts)++;
+}
+
+static void remove_miss_completions(struct gve_priv *priv,
+				    struct gve_tx_ring *tx)
+{
+	struct gve_tx_pending_packet_dqo *pending_packet;
+	s16 next_index;
+
+	next_index = tx->dqo_compl.miss_completions.head;
+	while (next_index != -1) {
+		pending_packet = &tx->dqo.pending_packets[next_index];
+		next_index = pending_packet->next;
+		/* Break early because packets should timeout in order. */
+		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
+			break;
+
+		remove_from_list(tx, &tx->dqo_compl.miss_completions,
+				 pending_packet);
+		/* Unmap buffers and free the skb but do not release the
+		 * pending packet, i.e. the completion tag is not freed, so
+		 * that the driver can take appropriate action if a
+		 * corresponding valid completion is received later.
+		 */
+		gve_unmap_packet(tx->dev, pending_packet);
+		/* This indicates the packet was dropped. */
+		dev_kfree_skb_any(pending_packet->skb);
+		pending_packet->skb = NULL;
+		tx->dropped_pkt++;
+		net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
+				    priv->dev->name,
+				    (int)(pending_packet - tx->dqo.pending_packets));
+
+		pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
+		pending_packet->timeout_jiffies =
+				jiffies +
+				msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
+						 MSEC_PER_SEC);
+		/* Keep the pending packet on another list so it can be freed
+		 * at a later time.
+ */ + add_to_list(tx, &tx->dqo_compl.timed_out_completions, + pending_packet); + } +} + +static void remove_timed_out_completions(struct gve_priv *priv, + struct gve_tx_ring *tx) +{ + struct gve_tx_pending_packet_dqo *pending_packet; + s16 next_index; + + next_index = tx->dqo_compl.timed_out_completions.head; + while (next_index != -1) { + pending_packet = &tx->dqo.pending_packets[next_index]; + next_index = pending_packet->next; + /* Break early because packets should timeout in order. */ + if (time_is_after_jiffies(pending_packet->timeout_jiffies)) + break; + + remove_from_list(tx, &tx->dqo_compl.timed_out_completions, + pending_packet); + gve_free_pending_packet(tx, pending_packet); + } +} + +int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, + struct napi_struct *napi) +{ + u64 reinject_compl_bytes = 0; + u64 reinject_compl_pkts = 0; + int num_descs_cleaned = 0; + u64 miss_compl_bytes = 0; + u64 miss_compl_pkts = 0; + u64 pkt_compl_bytes = 0; + u64 pkt_compl_pkts = 0; + + /* Limit in order to avoid blocking for too long */ + while (!napi || pkt_compl_pkts < napi->weight) { + struct gve_tx_compl_desc *compl_desc = + &tx->dqo.compl_ring[tx->dqo_compl.head]; + u16 type; + + if (compl_desc->generation == tx->dqo_compl.cur_gen_bit) + break; + + /* Prefetch the next descriptor. */ + prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) & + tx->dqo.complq_mask]); + + /* Do not read data until we own the descriptor */ + dma_rmb(); + type = compl_desc->type; + + if (type == GVE_COMPL_TYPE_DQO_DESC) { + /* This is the last descriptor fetched by HW plus one */ + u16 tx_head = le16_to_cpu(compl_desc->tx_head); + + atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head); + } else if (type == GVE_COMPL_TYPE_DQO_PKT) { + u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + + gve_handle_packet_completion(priv, tx, !!napi, + compl_tag, + &pkt_compl_bytes, + &pkt_compl_pkts, + /*is_reinjection=*/false); + } else if (type == GVE_COMPL_TYPE_DQO_MISS) { + u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + + gve_handle_miss_completion(priv, tx, compl_tag, + &miss_compl_bytes, + &miss_compl_pkts); + } else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) { + u16 compl_tag = le16_to_cpu(compl_desc->completion_tag); + + gve_handle_packet_completion(priv, tx, !!napi, + compl_tag, + &reinject_compl_bytes, + &reinject_compl_pkts, + /*is_reinjection=*/true); + } + + tx->dqo_compl.head = + (tx->dqo_compl.head + 1) & tx->dqo.complq_mask; + /* Flip the generation bit when we wrap around */ + tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0; + num_descs_cleaned++; + } + + netdev_tx_completed_queue(tx->netdev_txq, + pkt_compl_pkts + miss_compl_pkts, + pkt_compl_bytes + miss_compl_bytes); + + remove_miss_completions(priv, tx); + remove_timed_out_completions(priv, tx); + + u64_stats_update_begin(&tx->statss); + tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes; + tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts; + u64_stats_update_end(&tx->statss); + return num_descs_cleaned; +} + +bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean) +{ + struct gve_tx_compl_desc *compl_desc; + struct gve_tx_ring *tx = block->tx; + struct gve_priv *priv = block->priv; + + if (do_clean) { + int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx, + &block->napi); + + /* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */ + mb(); + + if (netif_tx_queue_stopped(tx->netdev_txq) && + num_descs_cleaned > 0) { + tx->wake_queue++; + netif_tx_wake_queue(tx->netdev_txq); + } + } 
+ + /* Return true if we still have work. */ + compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head]; + return compl_desc->generation != tx->dqo_compl.cur_gen_bit; +} diff --git a/drivers/net/ethernet/google/gve/gve_utils.c b/drivers/net/ethernet/google/gve/gve_utils.c new file mode 100644 index 000000000000..93f3dcbeeea9 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_utils.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#include "gve.h" +#include "gve_adminq.h" +#include "gve_utils.h" + +void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx) +{ + struct gve_notify_block *block = + &priv->ntfy_blocks[gve_tx_idx_to_ntfy(priv, queue_idx)]; + + block->tx = NULL; +} + +void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx) +{ + int ntfy_idx = gve_tx_idx_to_ntfy(priv, queue_idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + struct gve_tx_ring *tx = &priv->tx[queue_idx]; + + block->tx = tx; + tx->ntfy_id = ntfy_idx; +} + +void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx) +{ + struct gve_notify_block *block = + &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_idx)]; + + block->rx = NULL; +} + +void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx) +{ + u32 ntfy_idx = gve_rx_idx_to_ntfy(priv, queue_idx); + struct gve_notify_block *block = &priv->ntfy_blocks[ntfy_idx]; + struct gve_rx_ring *rx = &priv->rx[queue_idx]; + + block->rx = rx; + rx->ntfy_id = ntfy_idx; +} + +struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi, + struct gve_rx_slot_page_info *page_info, u16 len, + u16 pad) +{ + struct sk_buff *skb = napi_alloc_skb(napi, len); + void *va = page_info->page_address + pad + + page_info->page_offset; + + if (unlikely(!skb)) + return NULL; + + __skb_put(skb, len); + + skb_copy_to_linear_data(skb, va, len); + + skb->protocol = eth_type_trans(skb, dev); + + return skb; +} + +void gve_dec_pagecnt_bias(struct gve_rx_slot_page_info *page_info) +{ + page_info->pagecnt_bias--; + if (page_info->pagecnt_bias == 0) { + int pagecount = page_count(page_info->page); + + /* If we have run out of bias - set it back up to INT_MAX + * minus the existing refs. + */ + page_info->pagecnt_bias = INT_MAX - pagecount; + + /* Set pagecount back up to max. */ + page_ref_add(page_info->page, INT_MAX - pagecount); + } +} diff --git a/drivers/net/ethernet/google/gve/gve_utils.h b/drivers/net/ethernet/google/gve/gve_utils.h new file mode 100644 index 000000000000..79595940b351 --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_utils.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) + * Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2021 Google, Inc. + */ + +#ifndef _GVE_UTILS_H +#define _GVE_UTILS_H + +#include <linux/etherdevice.h> + +#include "gve.h" + +void gve_tx_remove_from_block(struct gve_priv *priv, int queue_idx); +void gve_tx_add_to_block(struct gve_priv *priv, int queue_idx); + +void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx); +void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx); + +struct sk_buff *gve_rx_copy(struct net_device *dev, struct napi_struct *napi, + struct gve_rx_slot_page_info *page_info, u16 len, + u16 pad); + +/* Decrement pagecnt_bias. Set it back to INT_MAX if it reached zero. */ +void gve_dec_pagecnt_bias(struct gve_rx_slot_page_info *page_info); + +#endif /* _GVE_UTILS_H */ + |
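
The gve_can_send_tso() check added in gve_tx_dqo.c above rejects a TSO skb when any single gso_size-sized segment of its payload would straddle more buffers than the hardware allows per segment, in which case the caller falls back to skb_linearize(). The user-space sketch below walks the same arithmetic over a list of fragment sizes; can_send_tso(), MAX_BUFS_PER_SEG and the sample sizes are illustrative stand-ins for the skb walk and for GVE_TX_MAX_DATA_DESCS - 1, not driver definitions.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative per-segment buffer limit, standing in for the driver's
 * GVE_TX_MAX_DATA_DESCS - 1.
 */
#define MAX_BUFS_PER_SEG 9

/* Returns true if no gso_size-sized segment of the payload spans more than
 * MAX_BUFS_PER_SEG buffers, mirroring the frag walk in gve_can_send_tso().
 */
static bool can_send_tso(int payload_in_linear, const int *frag_sizes,
			 int nr_frags, int gso_size)
{
	int cur_seg_size = payload_in_linear;
	int cur_seg_num_bufs = cur_seg_size > 0;

	for (int i = 0; i < nr_frags; i++) {
		/* At a buffer boundary, fully emitted segments are done; only
		 * the remainder of the partially filled segment carries over.
		 */
		if (cur_seg_size >= gso_size) {
			cur_seg_size %= gso_size;
			cur_seg_num_bufs = cur_seg_size > 0;
		}

		if (++cur_seg_num_bufs > MAX_BUFS_PER_SEG)
			return false;

		cur_seg_size += frag_sizes[i];
	}
	return true;
}

int main(void)
{
	/* 100 bytes of payload in the linear area plus three 4 KiB frags,
	 * segmented at 1448 bytes.
	 */
	int frags[] = { 4096, 4096, 4096 };

	printf("%s\n", can_send_tso(100, frags, 3, 1448) ?
	       "ok" : "needs linearizing");
	return 0;
}

With a 1448-byte gso_size and 4 KiB fragments, each segment touches at most two buffers, so the check passes and the sketch prints "ok".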
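The comment block above gve_handle_packet_completion() lists which completion combinations finish a packet and which are skipped. A minimal sketch of those rules as an explicit state machine is shown below; it deliberately leaves out the timed-out state handled by remove_miss_completions(), and the enum names are hypothetical rather than driver symbols.

#include <stdio.h>

enum compl_kind { COMPL_PACKET, COMPL_MISS, COMPL_REINJECT };
enum pkt_state { PENDING_DATA, PENDING_REINJECT, DONE, SKIPPED };

/* Apply one completion to a pending packet's state. */
static enum pkt_state advance(enum pkt_state s, enum compl_kind c)
{
	switch (s) {
	case PENDING_DATA:
		if (c == COMPL_PACKET)
			return DONE;		/* packet compl, no miss seen */
		if (c == COMPL_MISS)
			return PENDING_REINJECT; /* wait for re-injection */
		return SKIPPED;			/* re-inject without miss */
	case PENDING_REINJECT:
		if (c == COMPL_REINJECT)
			return DONE;		/* miss + re-inject = done */
		return SKIPPED;			/* miss + packet compl = skipped */
	default:
		return s;			/* DONE/SKIPPED are terminal */
	}
}

int main(void)
{
	enum pkt_state s = advance(PENDING_DATA, COMPL_MISS);

	s = advance(s, COMPL_REINJECT);
	printf("%s\n", s == DONE ? "completed" : "not completed");
	return 0;
}

Running it prints "completed", since a miss completion followed by a re-injection completion is one of the two sequences that finish a packet.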
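add_to_list() and remove_from_list() thread a doubly linked list through the preallocated pending-packet array using s16 indices, with -1 as the null sentinel, so a 16-bit completion tag is enough to locate and unlink an element without any allocation on the completion path. A standalone sketch of the same technique, using hypothetical node and index_list types, is:

#include <stdint.h>
#include <stdio.h>

#define NIL (-1)

/* Links are array indices rather than pointers. */
struct node {
	int16_t prev;
	int16_t next;
};

struct index_list {
	int16_t head;
	int16_t tail;
};

static void list_append(struct node *nodes, struct index_list *l, int16_t idx)
{
	int16_t old_tail = l->tail;

	l->tail = idx;
	if (old_tail == NIL)
		l->head = idx;
	else
		nodes[old_tail].next = idx;

	nodes[idx].next = NIL;
	nodes[idx].prev = old_tail;
}

static void list_unlink(struct node *nodes, struct index_list *l, int16_t idx)
{
	int16_t prev = nodes[idx].prev;
	int16_t next = nodes[idx].next;

	if (prev == NIL)
		l->head = next;		/* node was the head */
	else
		nodes[prev].next = next;

	if (next == NIL)
		l->tail = prev;		/* node was the tail */
	else
		nodes[next].prev = prev;
}

int main(void)
{
	struct node nodes[8];
	struct index_list l = { .head = NIL, .tail = NIL };

	list_append(nodes, &l, 3);
	list_append(nodes, &l, 5);
	list_append(nodes, &l, 1);
	list_unlink(nodes, &l, 5);	/* unlink from the middle by index */

	for (int16_t i = l.head; i != NIL; i = nodes[i].next)
		printf("%d ", i);	/* prints: 3 1 */
	printf("\n");
	return 0;
}

Because the elements live in a fixed array, the indices double as completion tags and unlinking an arbitrary element is O(1).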
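gve_clean_tx_done_dqo() consumes the completion ring by comparing each descriptor's generation bit with the generation the ring currently expects and flipping the expected value whenever the head index wraps to zero; a descriptor whose bit still matches has not been written by hardware yet. The sketch below reproduces that consumption loop in user space; struct compl_ring, ring_poll() and the producer in main() are illustrative, and the dma_rmb() the driver issues before reading the rest of a descriptor is only noted as a comment.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical completion descriptor: only the fields needed to show the
 * generation-bit handshake.
 */
struct compl_desc {
	uint8_t generation;	/* flipped by the producer on each ring pass */
	uint16_t data;
};

struct compl_ring {
	struct compl_desc *descs;
	uint32_t mask;		/* ring size - 1, size is a power of two */
	uint32_t head;		/* next descriptor to consume */
	uint8_t cur_gen_bit;	/* generation that means "not produced yet" */
};

/* Consume descriptors until we see one whose generation bit still matches
 * cur_gen_bit, i.e. one the producer has not written on this pass.
 */
static int ring_poll(struct compl_ring *r, int budget)
{
	int cleaned = 0;

	while (cleaned < budget) {
		struct compl_desc *d = &r->descs[r->head];

		if (d->generation == r->cur_gen_bit)
			break;	/* not yet produced */

		/* A real driver issues a read barrier (dma_rmb()) here before
		 * trusting the rest of the descriptor.
		 */

		/* ... handle d->data ... */

		r->head = (r->head + 1) & r->mask;
		/* Crossing index 0 means we wrapped: flip the expected bit. */
		r->cur_gen_bit ^= (r->head == 0);
		cleaned++;
	}
	return cleaned;
}

int main(void)
{
	struct compl_desc descs[4] = {0};
	struct compl_ring r = { .descs = descs, .mask = 3, .head = 0,
				.cur_gen_bit = 0 };

	/* Producer's first pass writes generation = 1, which differs from the
	 * consumer's initial cur_gen_bit of 0.
	 */
	for (int i = 0; i < 4; i++)
		descs[i] = (struct compl_desc){ .generation = 1, .data = i };

	printf("cleaned %d descriptors\n", ring_poll(&r, 16));	/* cleans 4 */
	return 0;
}

ring_poll() cleans exactly four descriptors and then stops: after the wrap the expected generation flips to 1, and descs[0] still carries the generation written on the previous pass.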
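gve_dec_pagecnt_bias() in gve_utils.c spends one unit of a locally tracked reference bias per RX buffer and only touches the page's shared refcount once the bias is exhausted, topping both the bias and the refcount back up toward INT_MAX in a single bulk update. Below is a user-space sketch of that bookkeeping, under the assumption that a plain long can stand in for struct page's atomic refcount; fake_page and rx_page_info are illustrative types, not driver structures.

#include <limits.h>
#include <stdio.h>

/* Stand-ins for struct page and its refcount; the driver uses
 * page_count() and page_ref_add() on the real struct page.
 */
struct fake_page {
	long refcount;
};

struct rx_page_info {
	struct fake_page *page;
	int pagecnt_bias;	/* references still "owned" by the driver */
};

/* Spend one unit of bias per buffer handed to the stack. Only when the bias
 * is exhausted do we touch the (normally atomic) page refcount, and both the
 * bias and the refcount are topped back up toward INT_MAX in one go.
 */
static void dec_pagecnt_bias(struct rx_page_info *info)
{
	info->pagecnt_bias--;
	if (info->pagecnt_bias == 0) {
		long pagecount = info->page->refcount;

		/* Bias becomes INT_MAX minus the references others hold. */
		info->pagecnt_bias = INT_MAX - pagecount;
		/* One bulk refcount update instead of one per packet. */
		info->page->refcount += INT_MAX - pagecount;
	}
}

int main(void)
{
	struct fake_page page = { .refcount = 1 };
	struct rx_page_info info = { .page = &page, .pagecnt_bias = 1 };

	dec_pagecnt_bias(&info);	/* bias hits 0 and is refilled */
	printf("bias=%d refcount=%ld\n", info.pagecnt_bias, page.refcount);
	return 0;
}

The payoff is that the hot path does ordinary integer arithmetic; the expensive atomic refcount update happens roughly once per INT_MAX buffers served from the page.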