51 files changed, 971 insertions, 379 deletions
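The largest change below wires RDMA CM into net-core neighbour events: cma.c registers a netevent notifier, keys active IDs in an rb-tree by (bound ifindex, destination IP), and on NETEVENT_NEIGH_UPDATE queues work that reports RDMA_CM_EVENT_UNREACHABLE for IDs whose cached MAC no longer matches the neighbour entry. The following is a minimal, self-contained sketch of that notifier pattern only; register_netevent_notifier(), NETEVENT_NEIGH_UPDATE and the struct neighbour fields are the real kernel API the patch uses, while the module wrapper, the demo_* names and the pr_debug() output are illustrative and not part of the patch.

/* Sketch only: a stand-alone module reacting to neighbour updates the
 * same way the cma.c hunks below do; the cma-specific rb-tree lookup
 * and per-ID work queueing are replaced here by a debug print.
 */
#include <linux/module.h>
#include <linux/notifier.h>
#include <net/netevent.h>
#include <net/neighbour.h>

static int demo_netevent_cb(struct notifier_block *nb,
			    unsigned long event, void *ctx)
{
	struct neighbour *neigh = ctx;

	if (event != NETEVENT_NEIGH_UPDATE)
		return NOTIFY_DONE;

	/* cma.c looks up (neigh->dev->ifindex, neigh->primary_key) in its
	 * id_table rb-tree and queues cma_netevent_work_handler() for every
	 * ID whose dst_dev_addr no longer matches neigh->ha.
	 */
	pr_debug("neigh update: ifindex %d family %d\n",
		 neigh->dev->ifindex, neigh->tbl->family);

	return NOTIFY_DONE;
}

static struct notifier_block demo_netevent_nb = {
	.notifier_call = demo_netevent_cb,
};

static int __init demo_init(void)
{
	return register_netevent_notifier(&demo_netevent_nb);
}

static void __exit demo_exit(void)
{
	unregister_netevent_notifier(&demo_netevent_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");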
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index fabca5e51e3d..46d06678dfbe 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -11,6 +11,7 @@ #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> +#include <linux/rbtree.h> #include <linux/igmp.h> #include <linux/xarray.h> #include <linux/inetdevice.h> @@ -20,6 +21,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> @@ -168,6 +170,9 @@ static struct ib_sa_client sa_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); +static struct rb_root id_table = RB_ROOT; +/* Serialize operations of id_table tree */ +static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; @@ -202,6 +207,11 @@ struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) } } +struct id_table_entry { + struct list_head id_list; + struct rb_node rb_node; +}; + struct cma_device { struct list_head list; struct ib_device *device; @@ -420,11 +430,21 @@ static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) return hdr->ip_version >> 4; } -static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } +static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *)&id_priv->id.route.addr.src_addr; +} + +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; +} + static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { struct in_device *in_dev = NULL; @@ -445,6 +465,117 @@ static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) return (in_dev) ? 0 : -ENODEV; } +static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, + struct id_table_entry *entry_b) +{ + struct rdma_id_private *id_priv = list_first_entry( + &entry_b->id_list, struct rdma_id_private, id_list_entry); + int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; + struct sockaddr *sb = cma_dst_addr(id_priv); + + if (ifindex_a != ifindex_b) + return (ifindex_a > ifindex_b) ? 
1 : -1; + + if (sa->sa_family != sb->sa_family) + return sa->sa_family - sb->sa_family; + + if (sa->sa_family == AF_INET) + return memcmp((char *)&((struct sockaddr_in *)sa)->sin_addr, + (char *)&((struct sockaddr_in *)sb)->sin_addr, + sizeof(((struct sockaddr_in *)sa)->sin_addr)); + + return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, + &((struct sockaddr_in6 *)sb)->sin6_addr); +} + +static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) +{ + struct rb_node **new, *parent = NULL; + struct id_table_entry *this, *node; + unsigned long flags; + int result; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + spin_lock_irqsave(&id_table_lock, flags); + new = &id_table.rb_node; + while (*new) { + this = container_of(*new, struct id_table_entry, rb_node); + result = compare_netdev_and_ip( + node_id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(node_id_priv), this); + + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else { + list_add_tail(&node_id_priv->id_list_entry, + &this->id_list); + kfree(node); + goto unlock; + } + } + + INIT_LIST_HEAD(&node->id_list); + list_add_tail(&node_id_priv->id_list_entry, &node->id_list); + + rb_link_node(&node->rb_node, parent, new); + rb_insert_color(&node->rb_node, &id_table); + +unlock: + spin_unlock_irqrestore(&id_table_lock, flags); + return 0; +} + +static struct id_table_entry * +node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) +{ + struct rb_node *node = root->rb_node; + struct id_table_entry *data; + int result; + + while (node) { + data = container_of(node, struct id_table_entry, rb_node); + result = compare_netdev_and_ip(ifindex, sa, data); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + + return NULL; +} + +static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) +{ + struct id_table_entry *data; + unsigned long flags; + + spin_lock_irqsave(&id_table_lock, flags); + if (list_empty(&id_priv->id_list_entry)) + goto out; + + data = node_from_ndev_ip(&id_table, + id_priv->id.route.addr.dev_addr.bound_dev_if, + cma_dst_addr(id_priv)); + if (!data) + goto out; + + list_del_init(&id_priv->id_list_entry); + if (list_empty(&data->id_list)) { + rb_erase(&data->rb_node, &id_table); + kfree(data); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); +} + static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { @@ -481,16 +612,6 @@ static void cma_release_dev(struct rdma_id_private *id_priv) mutex_unlock(&lock); } -static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) -{ - return (struct sockaddr *) &id_priv->id.route.addr.src_addr; -} - -static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) -{ - return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; -} - static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; @@ -861,6 +982,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->device_item); + INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); @@ -1883,6 +2005,7 @@ static void _destroy_id(struct rdma_id_private *id_priv, 
cma_cancel_operation(id_priv, state); rdma_restrack_del(&id_priv->res); + cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) @@ -3172,8 +3295,11 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); - else if (rdma_protocol_roce(id->device, id->port_num)) + else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); + if (!ret) + cma_add_id_to_tree(id_priv); + } else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv); else @@ -4922,10 +5048,87 @@ out: return ret; } +static void cma_netevent_work_handler(struct work_struct *_work) +{ + struct rdma_id_private *id_priv = + container_of(_work, struct rdma_id_private, id.net_work); + struct rdma_cm_event event = {}; + + mutex_lock(&id_priv->handler_mutex); + + if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || + READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) + goto out_unlock; + + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = -ETIMEDOUT; + + if (cma_cm_event_handler(id_priv, &event)) { + __acquire(&id_priv->handler_mutex); + id_priv->cm_id.ib = NULL; + cma_id_put(id_priv); + destroy_id_handler_unlock(id_priv); + return; + } + +out_unlock: + mutex_unlock(&id_priv->handler_mutex); + cma_id_put(id_priv); +} + +static int cma_netevent_callback(struct notifier_block *self, + unsigned long event, void *ctx) +{ + struct id_table_entry *ips_node = NULL; + struct rdma_id_private *current_id; + struct neighbour *neigh = ctx; + unsigned long flags; + + if (event != NETEVENT_NEIGH_UPDATE) + return NOTIFY_DONE; + + spin_lock_irqsave(&id_table_lock, flags); + if (neigh->tbl->family == AF_INET6) { + struct sockaddr_in6 neigh_sock_6; + + neigh_sock_6.sin6_family = AF_INET6; + neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_6); + } else if (neigh->tbl->family == AF_INET) { + struct sockaddr_in neigh_sock_4; + + neigh_sock_4.sin_family = AF_INET; + neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); + ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, + (struct sockaddr *)&neigh_sock_4); + } else + goto out; + + if (!ips_node) + goto out; + + list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { + if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, + neigh->ha, ETH_ALEN)) + continue; + INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler); + cma_id_get(current_id); + queue_work(cma_wq, &current_id->id.net_work); + } +out: + spin_unlock_irqrestore(&id_table_lock, flags); + return NOTIFY_DONE; +} + static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; +static struct notifier_block cma_netevent_cb = { + .notifier_call = cma_netevent_callback +}; + static void cma_send_device_removal_put(struct rdma_id_private *id_priv) { struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; @@ -5148,6 +5351,7 @@ static int __init cma_init(void) ib_sa_register_client(&sa_client); register_netdevice_notifier(&cma_nb); + register_netevent_notifier(&cma_netevent_cb); ret = ib_register_client(&cma_client); if (ret) @@ -5162,6 +5366,7 @@ static int __init cma_init(void) err_ib: ib_unregister_client(&cma_client); err: + unregister_netevent_notifier(&cma_netevent_cb); 
unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); @@ -5174,6 +5379,7 @@ static void __exit cma_cleanup(void) { cma_configfs_exit(); ib_unregister_client(&cma_client); + unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index 757a0ef79872..b7354c94cf1b 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -64,6 +64,7 @@ struct rdma_id_private { struct list_head listen_item; struct list_head listen_list; }; + struct list_head id_list_entry; struct cma_device *cma_dev; struct list_head mc_list; diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 94d83b665a2f..29b1ab1d5f93 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -68,7 +68,7 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj, * In exclusive access mode, we check that the counter is zero (nobody * claimed this object) and we set it to -1. Releasing a shared access * lock is done simply by decreasing the counter. As for exclusive - * access locks, since only a single one of them is is allowed + * access locks, since only a single one of them is allowed * concurrently, setting the counter to zero is enough for releasing * this lock. */ diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 79401e6c6aa9..785c37cae3c0 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -173,7 +173,7 @@ struct bnxt_re_dev { /* Max of 2 lossless traffic class supported per port */ u16 cosq[2]; - /* QP for for handling QP1 packets */ + /* QP for handling QP1 packets */ struct bnxt_re_gsi_context gsi_ctx; struct bnxt_re_stats stats; atomic_t nq_alloc_cnt; diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig index 6eb739052121..14b92e12bf29 100644 --- a/drivers/infiniband/hw/hfi1/Kconfig +++ b/drivers/infiniband/hw/hfi1/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config INFINIBAND_HFI1 tristate "Cornelis OPX Gen1 support" - depends on X86_64 && INFINIBAND_RDMAVT && I2C + depends on X86_64 && INFINIBAND_RDMAVT && I2C && !UML select MMU_NOTIFIER select CRC32 select I2C_ALGOBIT diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 2e4cf2b11653..629beff053ad 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -1179,8 +1179,10 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, goto done; ret = init_user_ctxt(fd, uctxt); - if (ret) + if (ret) { + hfi1_free_ctxt_rcv_groups(uctxt); goto done; + } user_init(uctxt); diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index d6bbdb8fcb50..5d9a7b09ca37 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -742,9 +742,7 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) kzalloc_node(sizeof(*tx->sdma_hdr), GFP_KERNEL, priv->dd->node); - netif_tx_napi_add(dev, &txq->napi, - hfi1_ipoib_poll_tx_ring, - NAPI_POLL_WEIGHT); + netif_napi_add_tx(dev, &txq->napi, hfi1_ipoib_poll_tx_ring); } return 0; diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c 
index 03b098a494b5..3dfa5aff2512 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -216,7 +216,7 @@ static int hfi1_netdev_rxq_init(struct hfi1_netdev_rx *rx) * right now. */ set_bit(NAPI_STATE_NO_BUSY_POLL, &rxq->napi.state); - netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi, 64); + netif_napi_add_weight(dev, &rxq->napi, hfi1_netdev_rx_napi, 64); rc = msix_netdev_request_rcd_irq(rxq->rcd); if (rc) goto bail_context_irq_failure; diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c index 136f9a99e1e0..7690f996d5e3 100644 --- a/drivers/infiniband/hw/hfi1/pio_copy.c +++ b/drivers/infiniband/hw/hfi1/pio_copy.c @@ -172,7 +172,7 @@ static inline void jcopy(u8 *dest, const u8 *src, u32 n) } /* - * Read nbytes from "from" and and place them in the low bytes + * Read nbytes from "from" and place them in the low bytes * of pbuf->carry. Other bytes are left as-is. Any previous * value in pbuf->carry is lost. * diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 2855e9ad4b32..f848eedc6a23 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -959,6 +959,7 @@ struct hns_roce_dev { const struct hns_roce_hw *hw; void *priv; struct workqueue_struct *irq_workq; + struct work_struct ecc_work; const struct hns_roce_dfx_hw *dfx; u32 func_num; u32 is_vf; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ba3c742258ef..cbdafaac678a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -55,6 +55,42 @@ enum { CMD_RST_PRC_EBUSY, }; +enum ecc_resource_type { + ECC_RESOURCE_QPC, + ECC_RESOURCE_CQC, + ECC_RESOURCE_MPT, + ECC_RESOURCE_SRQC, + ECC_RESOURCE_GMV, + ECC_RESOURCE_QPC_TIMER, + ECC_RESOURCE_CQC_TIMER, + ECC_RESOURCE_SCCC, + ECC_RESOURCE_COUNT, +}; + +static const struct { + const char *name; + u8 read_bt0_op; + u8 write_bt0_op; +} fmea_ram_res[] = { + { "ECC_RESOURCE_QPC", + HNS_ROCE_CMD_READ_QPC_BT0, HNS_ROCE_CMD_WRITE_QPC_BT0 }, + { "ECC_RESOURCE_CQC", + HNS_ROCE_CMD_READ_CQC_BT0, HNS_ROCE_CMD_WRITE_CQC_BT0 }, + { "ECC_RESOURCE_MPT", + HNS_ROCE_CMD_READ_MPT_BT0, HNS_ROCE_CMD_WRITE_MPT_BT0 }, + { "ECC_RESOURCE_SRQC", + HNS_ROCE_CMD_READ_SRQC_BT0, HNS_ROCE_CMD_WRITE_SRQC_BT0 }, + /* ECC_RESOURCE_GMV is handled by cmdq, not mailbox */ + { "ECC_RESOURCE_GMV", + 0, 0 }, + { "ECC_RESOURCE_QPC_TIMER", + HNS_ROCE_CMD_READ_QPC_TIMER_BT0, HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 }, + { "ECC_RESOURCE_CQC_TIMER", + HNS_ROCE_CMD_READ_CQC_TIMER_BT0, HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 }, + { "ECC_RESOURCE_SCCC", + HNS_ROCE_CMD_READ_SCCC_BT0, HNS_ROCE_CMD_WRITE_SCCC_BT0 }, +}; + static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, struct ib_sge *sg) { @@ -5855,12 +5891,12 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? 
aeqe : NULL; } -static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) { struct device *dev = hr_dev->dev; struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq); - int aeqe_found = 0; + irqreturn_t aeqe_found = IRQ_NONE; int event_type; u32 queue_num; int sub_type; @@ -5914,7 +5950,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, eq->event_type = event_type; eq->sub_type = sub_type; ++eq->cons_index; - aeqe_found = 1; + aeqe_found = IRQ_HANDLED; hns_roce_v2_init_irq_work(hr_dev, eq, queue_num); @@ -5922,7 +5958,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, } update_eq_db(eq); - return aeqe_found; + + return IRQ_RETVAL(aeqe_found); } static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) @@ -5937,11 +5974,11 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? ceqe : NULL; } -static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) { struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); - int ceqe_found = 0; + irqreturn_t ceqe_found = IRQ_NONE; u32 cqn; while (ceqe) { @@ -5955,21 +5992,21 @@ static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, hns_roce_cq_completion(hr_dev, cqn); ++eq->cons_index; - ceqe_found = 1; + ceqe_found = IRQ_HANDLED; ceqe = next_ceqe_sw_v2(eq); } update_eq_db(eq); - return ceqe_found; + return IRQ_RETVAL(ceqe_found); } static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) { struct hns_roce_eq *eq = eq_ptr; struct hns_roce_dev *hr_dev = eq->hr_dev; - int int_work; + irqreturn_t int_work; if (eq->type_flag == HNS_ROCE_CEQ) /* Completion event interrupt */ @@ -5981,27 +6018,22 @@ static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) return IRQ_RETVAL(int_work); } -static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) +static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, + u32 int_st) { - struct hns_roce_dev *hr_dev = dev_id; - struct device *dev = hr_dev->dev; - int int_work = 0; - u32 int_st; + struct pci_dev *pdev = hr_dev->pci_dev; + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); + const struct hnae3_ae_ops *ops = ae_dev->ops; + irqreturn_t int_work = IRQ_NONE; u32 int_en; - /* Abnormal interrupt */ - int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG); if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) { - struct pci_dev *pdev = hr_dev->pci_dev; - struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); - const struct hnae3_ae_ops *ops = ae_dev->ops; + dev_err(hr_dev->dev, "AEQ overflow!\n"); - dev_err(dev, "AEQ overflow!\n"); - - int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, + 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S); /* Set reset level for reset_event() */ if (ops->set_default_reset_request) @@ -6013,19 +6045,165 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S; roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); - int_work = 1; - } else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_RAS_INT_S)) { - dev_err(dev, "RAS interrupt!\n"); + int_work = IRQ_HANDLED; + } else { + dev_err(hr_dev->dev, "there is no basic abn 
irq found.\n"); + } + + return IRQ_RETVAL(int_work); +} - int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_RAS_INT_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); +static int fmea_ram_ecc_query(struct hns_roce_dev *hr_dev, + struct fmea_ram_ecc *ecc_info) +{ + struct hns_roce_cmq_desc desc; + struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data; + int ret; - int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_QUERY_RAM_ECC, true); + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) + return ret; + + ecc_info->is_ecc_err = hr_reg_read(req, QUERY_RAM_ECC_1BIT_ERR); + ecc_info->res_type = hr_reg_read(req, QUERY_RAM_ECC_RES_TYPE); + ecc_info->index = hr_reg_read(req, QUERY_RAM_ECC_TAG); + + return 0; +} + +static int fmea_recover_gmv(struct hns_roce_dev *hr_dev, u32 idx) +{ + struct hns_roce_cmq_desc desc; + struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data; + u32 addr_upper; + u32 addr_low; + int ret; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, true); + hr_reg_write(req, CFG_GMV_BT_IDX, idx); + + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) { + dev_err(hr_dev->dev, + "failed to execute cmd to read gmv, ret = %d.\n", ret); + return ret; + } + + addr_low = hr_reg_read(req, CFG_GMV_BT_BA_L); + addr_upper = hr_reg_read(req, CFG_GMV_BT_BA_H); + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, false); + hr_reg_write(req, CFG_GMV_BT_BA_L, addr_low); + hr_reg_write(req, CFG_GMV_BT_BA_H, addr_upper); + hr_reg_write(req, CFG_GMV_BT_IDX, idx); + + return hns_roce_cmq_send(hr_dev, &desc, 1); +} + +static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data) +{ + if (res_type == ECC_RESOURCE_QPC_TIMER || + res_type == ECC_RESOURCE_CQC_TIMER || + res_type == ECC_RESOURCE_SCCC) + return le64_to_cpu(*data); + + return le64_to_cpu(*data) << PAGE_SHIFT; +} - int_work = 1; +static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type, + u32 index) +{ + u8 write_bt0_op = fmea_ram_res[res_type].write_bt0_op; + u8 read_bt0_op = fmea_ram_res[res_type].read_bt0_op; + struct hns_roce_cmd_mailbox *mailbox; + u64 addr; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, read_bt0_op, index); + if (ret) { + dev_err(hr_dev->dev, + "failed to execute cmd to read fmea ram, ret = %d.\n", + ret); + goto out; + } + + addr = fmea_get_ram_res_addr(res_type, mailbox->buf); + + ret = hns_roce_cmd_mbox(hr_dev, addr, 0, write_bt0_op, index); + if (ret) + dev_err(hr_dev->dev, + "failed to execute cmd to write fmea ram, ret = %d.\n", + ret); + +out: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + return ret; +} + +static void fmea_ram_ecc_recover(struct hns_roce_dev *hr_dev, + struct fmea_ram_ecc *ecc_info) +{ + u32 res_type = ecc_info->res_type; + u32 index = ecc_info->index; + int ret; + + BUILD_BUG_ON(ARRAY_SIZE(fmea_ram_res) != ECC_RESOURCE_COUNT); + + if (res_type >= ECC_RESOURCE_COUNT) { + dev_err(hr_dev->dev, "unsupported fmea ram ecc type %u.\n", + res_type); + return; + } + + if (res_type == ECC_RESOURCE_GMV) + ret = fmea_recover_gmv(hr_dev, index); + else + ret = fmea_recover_others(hr_dev, res_type, index); + if (ret) + dev_err(hr_dev->dev, + "failed to recover %s, index = %u, ret = %d.\n", + fmea_ram_res[res_type].name, index, ret); +} + +static void fmea_ram_ecc_work(struct work_struct *ecc_work) +{ + struct hns_roce_dev 
*hr_dev = + container_of(ecc_work, struct hns_roce_dev, ecc_work); + struct fmea_ram_ecc ecc_info = {}; + + if (fmea_ram_ecc_query(hr_dev, &ecc_info)) { + dev_err(hr_dev->dev, "failed to query fmea ram ecc.\n"); + return; + } + + if (!ecc_info.is_ecc_err) { + dev_err(hr_dev->dev, "there is no fmea ram ecc err found.\n"); + return; + } + + fmea_ram_ecc_recover(hr_dev, &ecc_info); +} + +static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) +{ + struct hns_roce_dev *hr_dev = dev_id; + irqreturn_t int_work = IRQ_NONE; + u32 int_st; + + int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); + + if (int_st) { + int_work = abnormal_interrupt_basic(hr_dev, int_st); + } else if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { + queue_work(hr_dev->irq_workq, &hr_dev->ecc_work); + int_work = IRQ_HANDLED; } else { - dev_err(dev, "There is no abnormal irq found!\n"); + dev_err(hr_dev->dev, "there is no abnormal irq found.\n"); } return IRQ_RETVAL(int_work); @@ -6342,6 +6520,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) } } + INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work); + hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0); if (!hr_dev->irq_workq) { dev_err(dev, "failed to create irq workqueue.\n"); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 7ffb7824d268..f96debac30fe 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -250,6 +250,7 @@ enum hns_roce_opcode_type { HNS_ROCE_OPC_CFG_GMV_TBL = 0x850f, HNS_ROCE_OPC_CFG_GMV_BT = 0x8510, HNS_ROCE_OPC_EXT_CFG = 0x8512, + HNS_ROCE_QUERY_RAM_ECC = 0x8513, HNS_SWITCH_PARAMETER_CFG = 0x1033, }; @@ -1107,6 +1108,11 @@ enum { #define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32) #define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64) +/* Fields of HNS_ROCE_QUERY_RAM_ECC */ +#define QUERY_RAM_ECC_1BIT_ERR CMQ_REQ_FIELD_LOC(31, 0) +#define QUERY_RAM_ECC_RES_TYPE CMQ_REQ_FIELD_LOC(63, 32) +#define QUERY_RAM_ECC_TAG CMQ_REQ_FIELD_LOC(95, 64) + struct hns_roce_cfg_sgid_tb { __le32 table_idx_rsv; __le32 vf_sgid_l; @@ -1343,6 +1349,12 @@ struct hns_roce_dip { struct list_head node; /* all dips are on a list */ }; +struct fmea_ram_ecc { + u32 is_ecc_err; + u32 res_type; + u32 index; +}; + /* only for RNR timeout issue of HIP08 */ #define HNS_ROCE_CLOCK_ADJUST 1000 #define HNS_ROCE_MAX_CQ_PERIOD 65 @@ -1382,7 +1394,6 @@ struct hns_roce_dip { #define HNS_ROCE_V2_ASYNC_EQE_NUM 0x1000 #define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S 0 -#define HNS_ROCE_V2_VF_INT_ST_RAS_INT_S 1 #define HNS_ROCE_EQ_DB_CMD_AEQ 0x0 #define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED 0x1 diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 638bf4a1ed94..e4d837f82d75 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -1477,12 +1477,13 @@ irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, u16 dst_port, list_for_each_entry (listen_node, &cm_core->listen_list, list) { memcpy(listen_addr, listen_node->loc_addr, sizeof(listen_addr)); listen_port = listen_node->loc_port; + if (listen_port != dst_port || + !(listener_state & listen_node->listener_state)) + continue; /* compare node pair, return node handle if a match */ - if ((!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) || - !memcmp(listen_addr, ip_zero, sizeof(listen_addr))) && - listen_port == dst_port && - vlan_id == listen_node->vlan_id && - (listener_state & listen_node->listener_state)) { + if 
(!memcmp(listen_addr, ip_zero, sizeof(listen_addr)) || + (!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) && + vlan_id == listen_node->vlan_id)) { refcount_inc(&listen_node->refcnt); spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 58c0e181ca2b..a41e0d21143a 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -4872,10 +4872,12 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) sd_diff = sd_needed - hmc_fpm_misc->max_sds; if (sd_diff > 128) { - if (qpwanted > 128 && sd_diff > 144) + if (!(loop_count % 2) && qpwanted > 128) { qpwanted /= 2; - mrwanted /= 2; - pblewanted /= 2; + } else { + mrwanted /= 2; + pblewanted /= 2; + } continue; } if (dev->cqp->hmc_profile != IRDMA_HMC_PROFILE_FAVOR_VF && diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index dd3943d22dc6..4f132c6fb653 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -257,10 +257,6 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) iwqp->last_aeq = info->ae_id; spin_unlock_irqrestore(&iwqp->lock, flags); ctx_info = &iwqp->ctx_info; - if (rdma_protocol_roce(&iwqp->iwdev->ibdev, 1)) - ctx_info->roce_info->err_rq_idx_valid = true; - else - ctx_info->iwarp_info->err_rq_idx_valid = true; } else { if (info->ae_id != IRDMA_AE_CQ_OPERATION_ERROR) continue; @@ -370,16 +366,12 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) case IRDMA_AE_LCE_FUNCTION_CATASTROPHIC: case IRDMA_AE_LCE_CQ_CATASTROPHIC: case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG: - if (rdma_protocol_roce(&iwdev->ibdev, 1)) - ctx_info->roce_info->err_rq_idx_valid = false; - else - ctx_info->iwarp_info->err_rq_idx_valid = false; - fallthrough; default: - ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d\n", - info->ae_id, info->qp, info->qp_cq_id); + ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d, ae_src=%d\n", + info->ae_id, info->qp, info->qp_cq_id, info->ae_src); if (rdma_protocol_roce(&iwdev->ibdev, 1)) { - if (!info->sq && ctx_info->roce_info->err_rq_idx_valid) { + ctx_info->roce_info->err_rq_idx_valid = info->rq; + if (info->rq) { ctx_info->roce_info->err_rq_idx = info->wqe_idx; irdma_sc_qp_setctx_roce(&iwqp->sc_qp, iwqp->host_ctx.va, ctx_info); @@ -388,7 +380,8 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) irdma_cm_disconn(iwqp); break; } - if (!info->sq && ctx_info->iwarp_info->err_rq_idx_valid) { + ctx_info->iwarp_info->err_rq_idx_valid = info->rq; + if (info->rq) { ctx_info->iwarp_info->err_rq_idx = info->wqe_idx; ctx_info->tcp_info_valid = false; ctx_info->iwarp_info_valid = true; @@ -1512,10 +1505,7 @@ static int irdma_hmc_setup(struct irdma_pci_f *rf) int status; u32 qpcnt; - if (rf->rdma_ver == IRDMA_GEN_1) - qpcnt = rsrc_limits_table[rf->limits_sel].qplimit * 2; - else - qpcnt = rsrc_limits_table[rf->limits_sel].qplimit; + qpcnt = rsrc_limits_table[rf->limits_sel].qplimit; rf->sd_type = IRDMA_SD_TYPE_DIRECT; status = irdma_cfg_fpm_val(&rf->sc_dev, qpcnt); @@ -1543,7 +1533,7 @@ static void irdma_del_init_mem(struct irdma_pci_f *rf) rf->obj_mem.pa); rf->obj_mem.va = NULL; if (rf->rdma_ver != IRDMA_GEN_1) { - kfree(rf->allocated_ws_nodes); + bitmap_free(rf->allocated_ws_nodes); rf->allocated_ws_nodes = NULL; } kfree(rf->ceqlist); @@ -1972,9 +1962,8 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) u32 ret; if (rf->rdma_ver != IRDMA_GEN_1) { - rf->allocated_ws_nodes = - 
kcalloc(BITS_TO_LONGS(IRDMA_MAX_WS_NODES), - sizeof(unsigned long), GFP_KERNEL); + rf->allocated_ws_nodes = bitmap_zalloc(IRDMA_MAX_WS_NODES, + GFP_KERNEL); if (!rf->allocated_ws_nodes) return -ENOMEM; @@ -2023,7 +2012,7 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) return 0; mem_rsrc_kzalloc_fail: - kfree(rf->allocated_ws_nodes); + bitmap_free(rf->allocated_ws_nodes); rf->allocated_ws_nodes = NULL; return ret; diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index ef862bced20f..65e966ad3453 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -85,7 +85,7 @@ extern struct auxiliary_driver i40iw_auxiliary_drv; #define IRDMA_NO_QSET 0xffff #define IW_CFG_FPM_QP_COUNT 32768 -#define IRDMA_MAX_PAGES_PER_FMR 512 +#define IRDMA_MAX_PAGES_PER_FMR 262144 #define IRDMA_MIN_PAGES_PER_FMR 1 #define IRDMA_CQP_COMPL_RQ_WQE_FLUSHED 2 #define IRDMA_CQP_COMPL_SQ_WQE_FLUSHED 3 diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index ab3c5208a123..fdf4cc88cb91 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -652,6 +652,7 @@ static const char *const irdma_cqp_cmd_names[IRDMA_MAX_CQP_OPS] = { }; static const struct irdma_cqp_err_info irdma_noncrit_err_list[] = { + {0xffff, 0x8002, "Invalid State"}, {0xffff, 0x8006, "Flush No Wqe Pending"}, {0xffff, 0x8007, "Modify QP Bad Close"}, {0xffff, 0x8009, "LLP Closed"}, diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index c4412ece5a6d..d78b11a41b6b 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -1776,11 +1776,11 @@ static int irdma_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) spin_unlock_irqrestore(&iwcq->lock, flags); irdma_cq_wq_destroy(iwdev->rf, cq); - irdma_cq_free_rsrc(iwdev->rf, iwcq); spin_lock_irqsave(&iwceq->ce_lock, flags); irdma_sc_cleanup_ceqes(cq, ceq); spin_unlock_irqrestore(&iwceq->ce_lock, flags); + irdma_cq_free_rsrc(iwdev->rf, iwcq); return 0; } @@ -2605,7 +2605,7 @@ static struct ib_mr *irdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, palloc = &iwpbl->pble_alloc; iwmr->page_cnt = max_num_sg; err_code = irdma_get_pble(iwdev->rf->pble_rsrc, palloc, iwmr->page_cnt, - true); + false); if (err_code) goto err_get_pble; @@ -2641,8 +2641,16 @@ static int irdma_set_page(struct ib_mr *ibmr, u64 addr) if (unlikely(iwmr->npages == iwmr->page_cnt)) return -ENOMEM; - pbl = palloc->level1.addr; - pbl[iwmr->npages++] = addr; + if (palloc->level == PBLE_LEVEL_2) { + struct irdma_pble_info *palloc_info = + palloc->level2.leaf + (iwmr->npages >> PBLE_512_SHIFT); + + palloc_info->addr[iwmr->npages & (PBLE_PER_PAGE - 1)] = addr; + } else { + pbl = palloc->level1.addr; + pbl[iwmr->npages] = addr; + } + iwmr->npages++; return 0; } diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 08371a80fdc2..be189e0525de 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -523,6 +523,10 @@ repoll: "Requestor" : "Responder", cq->mcq.cqn); mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", err_cqe->syndrome, err_cqe->vendor_err_synd); + if (wc->status != IB_WC_WR_FLUSH_ERR && + (*cur_qp)->type == MLX5_IB_QPT_REG_UMR) + dev->umrc.state = MLX5_UMR_STATE_RECOVER; + if (opcode == MLX5_CQE_REQ_ERR) { wq = &(*cur_qp)->sq; wqe_ctr = be16_to_cpu(cqe64->wqe_counter); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h 
b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 998b67509a53..7460e0dfe6db 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -717,13 +717,23 @@ struct mlx5_ib_umr_context { struct completion done; }; +enum { + MLX5_UMR_STATE_ACTIVE, + MLX5_UMR_STATE_RECOVER, + MLX5_UMR_STATE_ERR, +}; + struct umr_common { struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; - /* control access to UMR QP + /* Protects from UMR QP overflow */ struct semaphore sem; + /* Protects from using UMR while the UMR is not active + */ + struct mutex lock; + unsigned int state; }; struct mlx5_cache_ent { diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 3a48364c0918..e00b94d1b1ea 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -176,6 +176,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) dev->umrc.pd = pd; sema_init(&dev->umrc.sem, MAX_UMR_WR); + mutex_init(&dev->umrc.lock); return 0; @@ -195,6 +196,31 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev) ib_dealloc_pd(dev->umrc.pd); } +static int mlx5r_umr_recover(struct mlx5_ib_dev *dev) +{ + struct umr_common *umrc = &dev->umrc; + struct ib_qp_attr attr; + int err; + + attr.qp_state = IB_QPS_RESET; + err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto err; + } + + err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); + if (err) + goto err; + + umrc->state = MLX5_UMR_STATE_ACTIVE; + return 0; + +err: + umrc->state = MLX5_UMR_STATE_ERR; + return err; +} + static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, struct mlx5r_umr_wqe *wqe, bool with_data) { @@ -231,7 +257,7 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, id.ib_cqe = cqe; mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0, - MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR); + MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR); mlx5r_ring_db(qp, 1, ctrl); @@ -270,17 +296,49 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, mlx5r_umr_init_context(&umr_context); down(&umrc->sem); - err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, - with_data); - if (err) - mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); - else { - wait_for_completion(&umr_context.done); - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "reg umr failed (%u)\n", - umr_context.status); + while (true) { + mutex_lock(&umrc->lock); + if (umrc->state == MLX5_UMR_STATE_ERR) { + mutex_unlock(&umrc->lock); err = -EFAULT; + break; + } + + if (umrc->state == MLX5_UMR_STATE_RECOVER) { + mutex_unlock(&umrc->lock); + usleep_range(3000, 5000); + continue; + } + + err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, + with_data); + mutex_unlock(&umrc->lock); + if (err) { + mlx5_ib_warn(dev, "UMR post send failed, err %d\n", + err); + break; } + + wait_for_completion(&umr_context.done); + + if (umr_context.status == IB_WC_SUCCESS) + break; + + if (umr_context.status == IB_WC_WR_FLUSH_ERR) + continue; + + WARN_ON_ONCE(1); + mlx5_ib_warn(dev, + "reg umr failed (%u). 
Trying to recover and resubmit the flushed WQEs\n", + umr_context.status); + mutex_lock(&umrc->lock); + err = mlx5r_umr_recover(dev); + mutex_unlock(&umrc->lock); + if (err) + mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", + err); + err = -EFAULT; + break; } up(&umrc->sem); return err; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index f0f43b6db89e..3ecb472c9217 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -3082,7 +3082,7 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, else DP_ERR(dev, "roce alloc tid returned error %d\n", rc); - goto err0; + goto err1; } /* Index only, 18 bit long, lkey = itid << 8 | key */ @@ -3106,7 +3106,7 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, rc = dev->ops->rdma_register_tid(dev->rdma_ctx, &mr->hw_mr); if (rc) { DP_ERR(dev, "roce register tid returned an error %d\n", rc); - goto err1; + goto err2; } mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; @@ -3115,8 +3115,10 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, DP_DEBUG(dev, QEDR_MSG_MR, "alloc frmr: %x\n", mr->ibmr.lkey); return mr; -err1: +err2: dev->ops->rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); +err1: + qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table); err0: kfree(mr); return ERR_PTR(rc); diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index aa290928cf96..3937144b2ae5 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -153,7 +153,7 @@ static int qib_get_base_info(struct file *fp, void __user *ubase, kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt; /* * for this use, may be cfgctxts summed over all chips that - * are are configured and present + * are configured and present */ kinfo->spi_nctxts = dd->cfgctxts; /* unit (chip/board) our context is on */ @@ -851,7 +851,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, ret = -EPERM; goto bail; } - /* don't allow them to later change to writeable with mprotect */ + /* don't allow them to later change to writable with mprotect */ vma->vm_flags &= ~VM_MAYWRITE; start = vma->vm_start; @@ -941,7 +941,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, goto bail; } /* - * Don't allow permission to later change to writeable + * Don't allow permission to later change to writable * with mprotect. 
*/ vma->vm_flags &= ~VM_MAYWRITE; diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index ceed302cf6a0..6861c6384f18 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2850,9 +2850,9 @@ static void qib_setup_7322_cleanup(struct qib_devdata *dd) qib_7322_free_irq(dd); kfree(dd->cspec->cntrs); - kfree(dd->cspec->sendchkenable); - kfree(dd->cspec->sendgrhchk); - kfree(dd->cspec->sendibchk); + bitmap_free(dd->cspec->sendchkenable); + bitmap_free(dd->cspec->sendgrhchk); + bitmap_free(dd->cspec->sendibchk); kfree(dd->cspec->msix_entries); for (i = 0; i < dd->num_pports; i++) { unsigned long flags; @@ -6383,18 +6383,11 @@ static int qib_init_7322_variables(struct qib_devdata *dd) features = qib_7322_boardname(dd); /* now that piobcnt2k and 4k set, we can allocate these */ - sbufcnt = dd->piobcnt2k + dd->piobcnt4k + - NUM_VL15_BUFS + BITS_PER_LONG - 1; - sbufcnt /= BITS_PER_LONG; - dd->cspec->sendchkenable = - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendchkenable), - GFP_KERNEL); - dd->cspec->sendgrhchk = - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendgrhchk), - GFP_KERNEL); - dd->cspec->sendibchk = - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendibchk), - GFP_KERNEL); + sbufcnt = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + + dd->cspec->sendchkenable = bitmap_zalloc(sbufcnt, GFP_KERNEL); + dd->cspec->sendgrhchk = bitmap_zalloc(sbufcnt, GFP_KERNEL); + dd->cspec->sendibchk = bitmap_zalloc(sbufcnt, GFP_KERNEL); if (!dd->cspec->sendchkenable || !dd->cspec->sendgrhchk || !dd->cspec->sendibchk) { ret = -ENOMEM; diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index d1a72e89e297..45211008449f 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1106,8 +1106,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) if (!qib_cpulist_count) { u32 count = num_online_cpus(); - qib_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long), - GFP_KERNEL); + qib_cpulist = bitmap_zalloc(count, GFP_KERNEL); if (qib_cpulist) qib_cpulist_count = count; } @@ -1279,7 +1278,7 @@ static void __exit qib_ib_cleanup(void) #endif qib_cpulist_count = 0; - kfree(qib_cpulist); + bitmap_free(qib_cpulist); WARN_ON(!xa_empty(&qib_dev_table)); qib_dev_cleanup(); diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c index 81b810d006c0..1dc3ccf0cf1f 100644 --- a/drivers/infiniband/hw/qib/qib_sd7220.c +++ b/drivers/infiniband/hw/qib/qib_sd7220.c @@ -587,7 +587,7 @@ static int epb_access(struct qib_devdata *dd, int sdnum, int claim) /* Need to release */ u64 pollval; /* - * The only writeable bits are the request and CS. + * The only writable bits are the request and CS. 
* Both should be clear */ u64 newval = 0; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index e212929369df..67a1b4562dc2 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -482,7 +482,7 @@ int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev) if (err) goto out_free_dev; - if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) { + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { usnic_err("IOMMU of %s does not support cache coherency\n", dev_name(dev)); err = -EINVAL; diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 642b52539ac3..b1a0ab3cd4bd 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -19,16 +19,16 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, } if (cqe > rxe->attr.max_cqe) { - pr_warn("cqe(%d) > max_cqe(%d)\n", - cqe, rxe->attr.max_cqe); + pr_debug("cqe(%d) > max_cqe(%d)\n", + cqe, rxe->attr.max_cqe); goto err1; } if (cq) { count = queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT); if (cqe < count) { - pr_warn("cqe(%d) < current # elements in queue (%d)", - cqe, count); + pr_debug("cqe(%d) < current # elements in queue (%d)", + cqe, count); goto err1; } } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 0e022ae1b8a5..336164843db4 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -145,7 +145,7 @@ static inline int rcv_wqe_size(int max_sge) max_sge * sizeof(struct ib_sge); } -void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res); +void free_rd_atomic_resource(struct resp_res *res); static inline void rxe_advance_resp_resource(struct rxe_qp *qp) { diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index fc3942e04a1f..9a5c2af6a56f 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -687,7 +687,7 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) if (atomic_read(&mr->num_mw) > 0) return -EINVAL; - rxe_put(mr); + rxe_cleanup(mr); return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index 2e1fa844fabf..bb6a1edaaee8 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -33,6 +33,8 @@ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) RXE_MW_STATE_FREE : RXE_MW_STATE_VALID; spin_lock_init(&mw->lock); + rxe_finalize(mw); + return 0; } @@ -40,7 +42,7 @@ int rxe_dealloc_mw(struct ib_mw *ibmw) { struct rxe_mw *mw = to_rmw(ibmw); - rxe_put(mw); + rxe_cleanup(mw); return 0; } @@ -113,7 +115,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC)) && !(mr->access & IB_ACCESS_LOCAL_WRITE))) { pr_err_once( - "attempt to bind an writeable MW to an MR without local write access\n"); + "attempt to bind an writable MW to an MR without local write access\n"); return -EINVAL; } diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 19b14826385b..f50620f5a0a1 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -6,6 +6,7 @@ #include "rxe.h" +#define RXE_POOL_TIMEOUT (200) #define RXE_POOL_ALIGN (16) static const struct rxe_type_info { @@ -136,10 +137,14 @@ void *rxe_alloc(struct rxe_pool *pool) elem->pool = pool; elem->obj = obj; 
kref_init(&elem->ref_cnt); + init_completion(&elem->complete); - err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit, + /* allocate index in array but leave pointer as NULL so it + * can't be looked up until rxe_finalize() is called + */ + err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit, &pool->next, GFP_KERNEL); - if (err) + if (err < 0) goto err_free; return obj; @@ -151,9 +156,11 @@ err_cnt: return NULL; } -int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem) +int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, + bool sleepable) { int err; + gfp_t gfp_flags; if (WARN_ON(pool->type == RXE_TYPE_MR)) return -EINVAL; @@ -164,10 +171,19 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem) elem->pool = pool; elem->obj = (u8 *)elem - pool->elem_offset; kref_init(&elem->ref_cnt); - - err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit, - &pool->next, GFP_KERNEL); - if (err) + init_completion(&elem->complete); + + /* AH objects are unique in that the create_ah verb + * can be called in atomic context. If the create_ah + * call is not sleepable use GFP_ATOMIC. + */ + gfp_flags = sleepable ? GFP_KERNEL : GFP_ATOMIC; + + if (sleepable) + might_sleep(); + err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit, + &pool->next, gfp_flags); + if (err < 0) goto err_cnt; return 0; @@ -181,16 +197,15 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) { struct rxe_pool_elem *elem; struct xarray *xa = &pool->xa; - unsigned long flags; void *obj; - xa_lock_irqsave(xa, flags); + rcu_read_lock(); elem = xa_load(xa, index); if (elem && kref_get_unless_zero(&elem->ref_cnt)) obj = elem->obj; else obj = NULL; - xa_unlock_irqrestore(xa, flags); + rcu_read_unlock(); return obj; } @@ -198,17 +213,74 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) static void rxe_elem_release(struct kref *kref) { struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt); - struct rxe_pool *pool = elem->pool; - xa_erase(&pool->xa, elem->index); + complete(&elem->complete); +} + +int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) +{ + struct rxe_pool *pool = elem->pool; + struct xarray *xa = &pool->xa; + static int timeout = RXE_POOL_TIMEOUT; + int ret, err = 0; + void *xa_ret; + + if (sleepable) + might_sleep(); + + /* erase xarray entry to prevent looking up + * the pool elem from its index + */ + xa_ret = xa_erase(xa, elem->index); + WARN_ON(xa_err(xa_ret)); + + /* if this is the last call to rxe_put complete the + * object. It is safe to touch obj->elem after this since + * it is freed below + */ + __rxe_put(elem); + + /* wait until all references to the object have been + * dropped before final object specific cleanup and + * return to rdma-core + */ + if (sleepable) { + if (!completion_done(&elem->complete) && timeout) { + ret = wait_for_completion_timeout(&elem->complete, + timeout); + + /* Shouldn't happen. There are still references to + * the object but, rather than deadlock, free the + * object or pass back to rdma-core. + */ + if (WARN_ON(!ret)) + err = -EINVAL; + } + } else { + unsigned long until = jiffies + timeout; + + /* AH objects are unique in that the destroy_ah verb + * can be called in atomic context. 
This delay + * replaces the wait_for_completion call above + * when the destroy_ah call is not sleepable + */ + while (!completion_done(&elem->complete) && + time_before(jiffies, until)) + mdelay(1); + + if (WARN_ON(!completion_done(&elem->complete))) + err = -EINVAL; + } if (pool->cleanup) pool->cleanup(elem); if (pool->type == RXE_TYPE_MR) - kfree(elem->obj); + kfree_rcu(elem->obj); atomic_dec(&pool->num_elem); + + return err; } int __rxe_get(struct rxe_pool_elem *elem) @@ -220,3 +292,11 @@ int __rxe_put(struct rxe_pool_elem *elem) { return kref_put(&elem->ref_cnt, rxe_elem_release); } + +void __rxe_finalize(struct rxe_pool_elem *elem) +{ + void *xa_ret; + + xa_ret = xa_store(&elem->pool->xa, elem->index, elem, GFP_KERNEL); + WARN_ON(xa_err(xa_ret)); +} diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 0860660d65ec..9d83cb32092f 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -24,6 +24,7 @@ struct rxe_pool_elem { void *obj; struct kref ref_cnt; struct list_head list; + struct completion complete; u32 index; }; @@ -57,21 +58,28 @@ void rxe_pool_cleanup(struct rxe_pool *pool); void *rxe_alloc(struct rxe_pool *pool); /* connect already allocated object to pool */ -int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem); - -#define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem) +int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, + bool sleepable); +#define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem, true) +#define rxe_add_to_pool_ah(pool, obj, sleepable) __rxe_add_to_pool(pool, \ + &(obj)->elem, sleepable) /* lookup an indexed object from index. takes a reference on object */ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); int __rxe_get(struct rxe_pool_elem *elem); - #define rxe_get(obj) __rxe_get(&(obj)->elem) int __rxe_put(struct rxe_pool_elem *elem); - #define rxe_put(obj) __rxe_put(&(obj)->elem) +int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable); +#define rxe_cleanup(obj) __rxe_cleanup(&(obj)->elem, true) +#define rxe_cleanup_ah(obj, sleepable) __rxe_cleanup(&(obj)->elem, sleepable) + #define rxe_read(obj) kref_read(&(obj)->elem.ref_cnt) +void __rxe_finalize(struct rxe_pool_elem *elem); +#define rxe_finalize(obj) __rxe_finalize(&(obj)->elem) + #endif /* RXE_POOL_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 22e9b85344c3..65d75eea460f 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -120,17 +120,15 @@ static void free_rd_atomic_resources(struct rxe_qp *qp) for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { struct resp_res *res = &qp->resp.resources[i]; - free_rd_atomic_resource(qp, res); + free_rd_atomic_resource(res); } kfree(qp->resp.resources); qp->resp.resources = NULL; } } -void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res) +void free_rd_atomic_resource(struct resp_res *res) { - if (res->type == RXE_ATOMIC_MASK) - kfree_skb(res->atomic.skb); res->type = 0; } @@ -142,7 +140,7 @@ static void cleanup_rd_atomic_resources(struct rxe_qp *qp) if (qp->resp.resources) { for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { res = &qp->resp.resources[i]; - free_rd_atomic_resource(qp, res); + free_rd_atomic_resource(res); } } } @@ -804,13 +802,15 @@ static void rxe_qp_do_cleanup(struct work_struct *work) if (qp->rq.queue) rxe_queue_cleanup(qp->rq.queue); - atomic_dec(&qp->scq->num_wq); - if 
(qp->scq) + if (qp->scq) { + atomic_dec(&qp->scq->num_wq); rxe_put(qp->scq); + } - atomic_dec(&qp->rcq->num_wq); - if (qp->rcq) + if (qp->rcq) { + atomic_dec(&qp->rcq->num_wq); rxe_put(qp->rcq); + } if (qp->pd) rxe_put(qp->pd); diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h index 6227112ef7a2..ed44042782fa 100644 --- a/drivers/infiniband/sw/rxe/rxe_queue.h +++ b/drivers/infiniband/sw/rxe/rxe_queue.h @@ -7,9 +7,6 @@ #ifndef RXE_QUEUE_H #define RXE_QUEUE_H -/* for definition of shared struct rxe_queue_buf */ -#include <uapi/rdma/rdma_user_rxe.h> - /* Implements a simple circular buffer that is shared between user * and the driver and can be resized. The requested element size is * rounded up to a power of 2 and the number of elements in the buffer @@ -53,6 +50,8 @@ enum queue_type { QUEUE_TYPE_FROM_DRIVER, }; +struct rxe_queue_buf; + struct rxe_queue { struct rxe_dev *rxe; struct rxe_queue_buf *buf; diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 9d98237389cf..dec4ddaca70f 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -15,8 +15,7 @@ static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, u32 opcode); static inline void retry_first_write_send(struct rxe_qp *qp, - struct rxe_send_wqe *wqe, - unsigned int mask, int npsn) + struct rxe_send_wqe *wqe, int npsn) { int i; @@ -83,7 +82,7 @@ static void req_retry(struct rxe_qp *qp) if (mask & WR_WRITE_OR_SEND_MASK) { npsn = (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK; - retry_first_write_send(qp, wqe, mask, npsn); + retry_first_write_send(qp, wqe, npsn); } if (mask & WR_READ_MASK) { @@ -581,9 +580,11 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) wqe->status = IB_WC_SUCCESS; qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); - if ((wqe->wr.send_flags & IB_SEND_SIGNALED) || - qp->sq_sig_type == IB_SIGNAL_ALL_WR) - rxe_run_task(&qp->comp.task, 1); + /* There is no ack coming for local work requests + * which can lead to a deadlock. So go ahead and complete + * it now. 
+ */ + rxe_run_task(&qp->comp.task, 1); return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index f4f6ee5d81fe..28033849d404 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -21,6 +21,7 @@ enum resp_states { RESPST_CHK_RKEY, RESPST_EXECUTE, RESPST_READ_REPLY, + RESPST_ATOMIC_REPLY, RESPST_COMPLETE, RESPST_ACKNOWLEDGE, RESPST_CLEANUP, @@ -55,6 +56,7 @@ static char *resp_state_name[] = { [RESPST_CHK_RKEY] = "CHK_RKEY", [RESPST_EXECUTE] = "EXECUTE", [RESPST_READ_REPLY] = "READ_REPLY", + [RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY", [RESPST_COMPLETE] = "COMPLETE", [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", [RESPST_CLEANUP] = "CLEANUP", @@ -448,7 +450,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp, if (rkey_is_mw(rkey)) { mw = rxe_lookup_mw(qp, access, rkey); if (!mw) { - pr_err("%s: no MW matches rkey %#x\n", __func__, rkey); + pr_debug("%s: no MW matches rkey %#x\n", + __func__, rkey); state = RESPST_ERR_RKEY_VIOLATION; goto err; } @@ -468,7 +471,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp, } else { mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); if (!mr) { - pr_err("%s: no MR matches rkey %#x\n", __func__, rkey); + pr_debug("%s: no MR matches rkey %#x\n", + __func__, rkey); state = RESPST_ERR_RKEY_VIOLATION; goto err; } @@ -549,49 +553,106 @@ out: return rc; } +static struct resp_res *rxe_prepare_res(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + int type) +{ + struct resp_res *res; + u32 pkts; + + res = &qp->resp.resources[qp->resp.res_head]; + rxe_advance_resp_resource(qp); + free_rd_atomic_resource(res); + + res->type = type; + res->replay = 0; + + switch (type) { + case RXE_READ_MASK: + res->read.va = qp->resp.va + qp->resp.offset; + res->read.va_org = qp->resp.va + qp->resp.offset; + res->read.resid = qp->resp.resid; + res->read.length = qp->resp.resid; + res->read.rkey = qp->resp.rkey; + + pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1); + res->first_psn = pkt->psn; + res->cur_psn = pkt->psn; + res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK; + + res->state = rdatm_res_state_new; + break; + case RXE_ATOMIC_MASK: + res->first_psn = pkt->psn; + res->last_psn = pkt->psn; + res->cur_psn = pkt->psn; + break; + } + + return res; +} + /* Guarantee atomicity of atomic operations at the machine level. */ static DEFINE_SPINLOCK(atomic_ops_lock); -static enum resp_states process_atomic(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) +static enum resp_states atomic_reply(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) { u64 *vaddr; enum resp_states ret; struct rxe_mr *mr = qp->resp.mr; + struct resp_res *res = qp->resp.res; + u64 value; - if (mr->state != RXE_MR_STATE_VALID) { - ret = RESPST_ERR_RKEY_VIOLATION; - goto out; + if (!res) { + res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_MASK); + qp->resp.res = res; } - vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, sizeof(u64)); + if (!res->replay) { + if (mr->state != RXE_MR_STATE_VALID) { + ret = RESPST_ERR_RKEY_VIOLATION; + goto out; + } - /* check vaddr is 8 bytes aligned. */ - if (!vaddr || (uintptr_t)vaddr & 7) { - ret = RESPST_ERR_MISALIGNED_ATOMIC; - goto out; - } + vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, + sizeof(u64)); - spin_lock_bh(&atomic_ops_lock); + /* check vaddr is 8 bytes aligned. 
*/ + if (!vaddr || (uintptr_t)vaddr & 7) { + ret = RESPST_ERR_MISALIGNED_ATOMIC; + goto out; + } - qp->resp.atomic_orig = *vaddr; + spin_lock_bh(&atomic_ops_lock); + res->atomic.orig_val = value = *vaddr; - if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) { - if (*vaddr == atmeth_comp(pkt)) - *vaddr = atmeth_swap_add(pkt); - } else { - *vaddr += atmeth_swap_add(pkt); - } + if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) { + if (value == atmeth_comp(pkt)) + value = atmeth_swap_add(pkt); + } else { + value += atmeth_swap_add(pkt); + } - spin_unlock_bh(&atomic_ops_lock); + *vaddr = value; + spin_unlock_bh(&atomic_ops_lock); + + qp->resp.msn++; - ret = RESPST_NONE; + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + } + + ret = RESPST_ACKNOWLEDGE; out: return ret; } static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, - struct rxe_pkt_info *pkt, struct rxe_pkt_info *ack, int opcode, int payload, @@ -629,7 +690,7 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, } if (ack->mask & RXE_ATMACK_MASK) - atmack_set_orig(ack, qp->resp.atomic_orig); + atmack_set_orig(ack, qp->resp.res->atomic.orig_val); err = rxe_prepare(&qp->pri_av, ack, skb); if (err) { @@ -640,34 +701,6 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, return skb; } -static struct resp_res *rxe_prepare_read_res(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) -{ - struct resp_res *res; - u32 pkts; - - res = &qp->resp.resources[qp->resp.res_head]; - rxe_advance_resp_resource(qp); - free_rd_atomic_resource(qp, res); - - res->type = RXE_READ_MASK; - res->replay = 0; - res->read.va = qp->resp.va + qp->resp.offset; - res->read.va_org = qp->resp.va + qp->resp.offset; - res->read.resid = qp->resp.resid; - res->read.length = qp->resp.resid; - res->read.rkey = qp->resp.rkey; - - pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1); - res->first_psn = pkt->psn; - res->cur_psn = pkt->psn; - res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK; - - res->state = rdatm_res_state_new; - - return res; -} - /** * rxe_recheck_mr - revalidate MR from rkey and get a reference * @qp: the qp @@ -738,7 +771,7 @@ static enum resp_states read_reply(struct rxe_qp *qp, struct rxe_mr *mr; if (!res) { - res = rxe_prepare_read_res(qp, req_pkt); + res = rxe_prepare_res(qp, req_pkt, RXE_READ_MASK); qp->resp.res = res; } @@ -771,7 +804,7 @@ static enum resp_states read_reply(struct rxe_qp *qp, payload = min_t(int, res->read.resid, mtu); - skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload, + skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, res->cur_psn, AETH_ACK_UNLIMITED); if (!skb) return RESPST_ERR_RNR; @@ -858,9 +891,7 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) qp->resp.msn++; return RESPST_READ_REPLY; } else if (pkt->mask & RXE_ATOMIC_MASK) { - err = process_atomic(qp, pkt); - if (err) - return err; + return RESPST_ATOMIC_REPLY; } else { /* Unreachable */ WARN_ON_ONCE(1); @@ -997,14 +1028,13 @@ finish: return RESPST_CLEANUP; } -static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, - u8 syndrome, u32 psn) +static int send_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) { int err = 0; struct rxe_pkt_info ack_pkt; struct sk_buff *skb; - skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE, + skb = prepare_ack_packet(qp, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE, 0, psn, syndrome); if (!skb) { err = 
-ENOMEM; @@ -1019,40 +1049,29 @@ err1: return err; } -static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, - u8 syndrome) +static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) { - int rc = 0; + int err = 0; struct rxe_pkt_info ack_pkt; struct sk_buff *skb; - struct resp_res *res; - skb = prepare_ack_packet(qp, pkt, &ack_pkt, - IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn, - syndrome); + skb = prepare_ack_packet(qp, &ack_pkt, IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, + 0, psn, syndrome); if (!skb) { - rc = -ENOMEM; + err = -ENOMEM; goto out; } - res = &qp->resp.resources[qp->resp.res_head]; - free_rd_atomic_resource(qp, res); - rxe_advance_resp_resource(qp); - - skb_get(skb); - res->type = RXE_ATOMIC_MASK; - res->atomic.skb = skb; - res->first_psn = ack_pkt.psn; - res->last_psn = ack_pkt.psn; - res->cur_psn = ack_pkt.psn; + err = rxe_xmit_packet(qp, &ack_pkt, skb); + if (err) + pr_err_ratelimited("Failed sending atomic ack\n"); - rc = rxe_xmit_packet(qp, &ack_pkt, skb); - if (rc) { - pr_err_ratelimited("Failed sending ack\n"); - rxe_put(qp); - } + /* have to clear this since it is used to trigger + * long read replies + */ + qp->resp.res = NULL; out: - return rc; + return err; } static enum resp_states acknowledge(struct rxe_qp *qp, @@ -1062,11 +1081,11 @@ static enum resp_states acknowledge(struct rxe_qp *qp, return RESPST_CLEANUP; if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED) - send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn); + send_ack(qp, qp->resp.aeth_syndrome, pkt->psn); else if (pkt->mask & RXE_ATOMIC_MASK) - send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED); + send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); else if (bth_ack(pkt)) - send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn); + send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); return RESPST_CLEANUP; } @@ -1119,7 +1138,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, if (pkt->mask & RXE_SEND_MASK || pkt->mask & RXE_WRITE_MASK) { /* SEND. Ack again and cleanup. C9-105. */ - send_ack(qp, pkt, AETH_ACK_UNLIMITED, prev_psn); + send_ack(qp, AETH_ACK_UNLIMITED, prev_psn); return RESPST_CLEANUP; } else if (pkt->mask & RXE_READ_MASK) { struct resp_res *res; @@ -1173,14 +1192,11 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, /* Find the operation in our list of responder resources. */ res = find_resource(qp, pkt->psn); if (res) { - skb_get(res->atomic.skb); - /* Resend the result. */ - rc = rxe_xmit_packet(qp, pkt, res->atomic.skb); - if (rc) { - pr_err("Failed resending result. This flow is not handled - skb ignored\n"); - rc = RESPST_CLEANUP; - goto out; - } + res->replay = 1; + res->cur_psn = pkt->psn; + qp->resp.res = res; + rc = RESPST_ATOMIC_REPLY; + goto out; } /* Resource not found. Class D error. Drop the request. */ @@ -1316,6 +1332,9 @@ int rxe_responder(void *arg) case RESPST_READ_REPLY: state = read_reply(qp, pkt); break; + case RESPST_ATOMIC_REPLY: + state = atomic_reply(qp, pkt); + break; case RESPST_ACKNOWLEDGE: state = acknowledge(qp, pkt); break; @@ -1327,7 +1346,7 @@ int rxe_responder(void *arg) break; case RESPST_ERR_PSN_OUT_OF_SEQ: /* RC only - Class B. Drop packet. 
*/ - send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); + send_ack(qp, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); state = RESPST_CLEANUP; break; @@ -1349,7 +1368,7 @@ int rxe_responder(void *arg) if (qp_type(qp) == IB_QPT_RC) { rxe_counter_inc(rxe, RXE_CNT_SND_RNR); /* RC - class B */ - send_ack(qp, pkt, AETH_RNR_NAK | + send_ack(qp, AETH_RNR_NAK | (~AETH_TYPE_MASK & qp->attr.min_rnr_timer), pkt->psn); @@ -1438,7 +1457,7 @@ int rxe_responder(void *arg) case RESPST_ERROR: qp->resp.goto_error = 0; - pr_warn("qp#%d moved to error state\n", qp_num(qp)); + pr_debug("qp#%d moved to error state\n", qp_num(qp)); rxe_qp_error(qp); goto exit; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 9d995854a174..151c6280abd5 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -115,7 +115,7 @@ static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) { struct rxe_ucontext *uc = to_ruc(ibuc); - rxe_put(uc); + rxe_cleanup(uc); } static int rxe_port_immutable(struct ib_device *dev, u32 port_num, @@ -149,7 +149,7 @@ static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_pd *pd = to_rpd(ibpd); - rxe_put(pd); + rxe_cleanup(pd); return 0; } @@ -176,7 +176,8 @@ static int rxe_create_ah(struct ib_ah *ibah, if (err) return err; - err = rxe_add_to_pool(&rxe->ah_pool, ah); + err = rxe_add_to_pool_ah(&rxe->ah_pool, ah, + init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); if (err) return err; @@ -188,7 +189,7 @@ static int rxe_create_ah(struct ib_ah *ibah, err = copy_to_user(&uresp->ah_num, &ah->ah_num, sizeof(uresp->ah_num)); if (err) { - rxe_put(ah); + rxe_cleanup(ah); return -EFAULT; } } else if (ah->is_user) { @@ -197,6 +198,8 @@ static int rxe_create_ah(struct ib_ah *ibah, } rxe_init_av(init_attr->ah_attr, &ah->av); + rxe_finalize(ah); + return 0; } @@ -228,7 +231,8 @@ static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) { struct rxe_ah *ah = to_rah(ibah); - rxe_put(ah); + rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE); + return 0; } @@ -308,12 +312,13 @@ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, err = rxe_srq_from_init(rxe, srq, init, udata, uresp); if (err) - goto err_put; + goto err_cleanup; return 0; -err_put: - rxe_put(srq); +err_cleanup: + rxe_cleanup(srq); + return err; } @@ -362,7 +367,7 @@ static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) { struct rxe_srq *srq = to_rsrq(ibsrq); - rxe_put(srq); + rxe_cleanup(srq); return 0; } @@ -429,10 +434,11 @@ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init, if (err) goto qp_init; + rxe_finalize(qp); return 0; qp_init: - rxe_put(qp); + rxe_cleanup(qp); return err; } @@ -485,7 +491,7 @@ static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) if (ret) return ret; - rxe_put(qp); + rxe_cleanup(qp); return 0; } @@ -803,7 +809,7 @@ static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) rxe_cq_disable(cq); - rxe_put(cq); + rxe_cleanup(cq); return 0; } @@ -898,6 +904,7 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) rxe_get(pd); rxe_mr_init_dma(pd, access, mr); + rxe_finalize(mr); return &mr->ibmr; } @@ -926,11 +933,13 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, if (err) goto err3; + rxe_finalize(mr); + return &mr->ibmr; err3: rxe_put(pd); - rxe_put(mr); + rxe_cleanup(mr); err2: return ERR_PTR(err); } @@ -958,11 +967,13 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, 
if (err) goto err2; + rxe_finalize(mr); + return &mr->ibmr; err2: rxe_put(pd); - rxe_put(mr); + rxe_cleanup(mr); err1: return ERR_PTR(err); } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index ac464e68c923..628e40c1714b 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -9,7 +9,6 @@ #include <linux/interrupt.h> #include <linux/workqueue.h> -#include <rdma/rdma_user_rxe.h> #include "rxe_pool.h" #include "rxe_task.h" #include "rxe_hw_counters.h" @@ -155,7 +154,7 @@ struct resp_res { union { struct { - struct sk_buff *skb; + u64 orig_val; } atomic; struct { u64 va_org; @@ -189,7 +188,6 @@ struct rxe_resp_info { u32 resid; u32 rkey; u32 length; - u64 atomic_orig; /* SRQ only */ struct { diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 17f34d584cd9..f88d2971c2c6 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -725,11 +725,11 @@ static int siw_proc_mpareply(struct siw_cep *cep) enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; rv = siw_recv_mpa_rr(cep); - if (rv != -EAGAIN) - siw_cancel_mpatimer(cep); if (rv) goto out_err; + siw_cancel_mpatimer(cep); + rep = &cep->mpa.hdr; if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) { @@ -895,7 +895,8 @@ static int siw_proc_mpareply(struct siw_cep *cep) } out_err: - siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); + if (rv != -EAGAIN) + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); return rv; } diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 09316072b789..8dedae7ae79e 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -1167,7 +1167,7 @@ int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, err_out: siw_dbg(base_cq->device, "CQ creation failed: %d", rv); - if (cq && cq->queue) { + if (cq->queue) { struct siw_ucontext *ctx = rdma_udata_to_drv_context(udata, struct siw_ucontext, base_ucontext); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 2a8961b685c2..a4904371e2db 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1664,8 +1664,10 @@ static void ipoib_napi_add(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - netif_napi_add(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC); - netif_napi_add(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE); + netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll, + IPOIB_NUM_WC); + netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll, + MAX_SEND_CQE); } static void ipoib_napi_del(struct net_device *dev) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index c08f2d9133b6..a00ca117303a 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -246,6 +246,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) device = ib_conn->device; ib_dev = device->ib_device; + /* +1 for drain */ if (ib_conn->pi_support) max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; else @@ -267,7 +268,8 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) init_attr.qp_context = (void *)ib_conn; init_attr.send_cq = ib_conn->cq; init_attr.recv_cq = ib_conn->cq; - init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; + /* +1 for drain */ + init_attr.cap.max_recv_wr = 
ISER_QP_MAX_RECV_DTOS + 1; init_attr.cap.max_send_sge = 2; init_attr.cap.max_recv_sge = 1; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -485,7 +487,7 @@ int iser_conn_terminate(struct iser_conn *iser_conn) iser_conn, err); /* block until all flush errors are consumed */ - ib_drain_sq(ib_conn->qp); + ib_drain_qp(ib_conn->qp); } return 1; diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c index 385a19846c24..1e6ffafa2db3 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c @@ -32,11 +32,7 @@ void rtrs_clt_update_wc_stats(struct rtrs_clt_con *con) void rtrs_clt_inc_failover_cnt(struct rtrs_clt_stats *stats) { - struct rtrs_clt_stats_pcpu *s; - - s = get_cpu_ptr(stats->pcpu_stats); - s->rdma.failover_cnt++; - put_cpu_ptr(stats->pcpu_stats); + this_cpu_inc(stats->pcpu_stats->rdma.failover_cnt); } int rtrs_clt_stats_migration_from_cnt_to_str(struct rtrs_clt_stats *stats, char *buf) @@ -169,12 +165,8 @@ int rtrs_clt_reset_all_stats(struct rtrs_clt_stats *s, bool enable) static inline void rtrs_clt_update_rdma_stats(struct rtrs_clt_stats *stats, size_t size, int d) { - struct rtrs_clt_stats_pcpu *s; - - s = get_cpu_ptr(stats->pcpu_stats); - s->rdma.dir[d].cnt++; - s->rdma.dir[d].size_total += size; - put_cpu_ptr(stats->pcpu_stats); + this_cpu_inc(stats->pcpu_stats->rdma.dir[d].cnt); + this_cpu_add(stats->pcpu_stats->rdma.dir[d].size_total, size); } void rtrs_clt_update_all_stats(struct rtrs_clt_io_req *req, int dir) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 9809c3883979..baecde41d126 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -740,25 +740,25 @@ struct path_it { struct rtrs_clt_path *(*next_path)(struct path_it *it); }; -/** - * list_next_or_null_rr_rcu - get next list element in round-robin fashion. +/* + * rtrs_clt_get_next_path_or_null - get clt path from the list or return NULL * @head: the head for the list. - * @ptr: the list head to take the next element from. - * @type: the type of the struct this is embedded in. - * @memb: the name of the list_head within the struct. + * @clt_path: The element to take the next clt_path from. * - * Next element returned in round-robin fashion, i.e. head will be skipped, + * Next clt path returned in round-robin fashion, i.e. head will be skipped, * but if list is observed as empty, NULL will be returned. * - * This primitive may safely run concurrently with the _rcu list-mutation + * This function may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ -#define list_next_or_null_rr_rcu(head, ptr, type, memb) \ -({ \ - list_next_or_null_rcu(head, ptr, type, memb) ?: \ - list_next_or_null_rcu(head, READ_ONCE((ptr)->next), \ - type, memb); \ -}) +static inline struct rtrs_clt_path * +rtrs_clt_get_next_path_or_null(struct list_head *head, struct rtrs_clt_path *clt_path) +{ + return list_next_or_null_rcu(head, &clt_path->s.entry, typeof(*clt_path), s.entry) ?: + list_next_or_null_rcu(head, + READ_ONCE((&clt_path->s.entry)->next), + typeof(*clt_path), s.entry); +} /** * get_next_path_rr() - Returns path in round-robin fashion. 
@@ -789,10 +789,8 @@ static struct rtrs_clt_path *get_next_path_rr(struct path_it *it) path = list_first_or_null_rcu(&clt->paths_list, typeof(*path), s.entry); else - path = list_next_or_null_rr_rcu(&clt->paths_list, - &path->s.entry, - typeof(*path), - s.entry); + path = rtrs_clt_get_next_path_or_null(&clt->paths_list, path); + rcu_assign_pointer(*ppcpu_path, path); return path; @@ -1403,8 +1401,7 @@ static int alloc_permits(struct rtrs_clt_sess *clt) unsigned int chunk_bits; int err, i; - clt->permits_map = kcalloc(BITS_TO_LONGS(clt->queue_depth), - sizeof(long), GFP_KERNEL); + clt->permits_map = bitmap_zalloc(clt->queue_depth, GFP_KERNEL); if (!clt->permits_map) { err = -ENOMEM; goto out_err; @@ -1426,7 +1423,7 @@ static int alloc_permits(struct rtrs_clt_sess *clt) return 0; err_map: - kfree(clt->permits_map); + bitmap_free(clt->permits_map); clt->permits_map = NULL; out_err: return err; @@ -1434,13 +1431,11 @@ out_err: static void free_permits(struct rtrs_clt_sess *clt) { - if (clt->permits_map) { - size_t sz = clt->queue_depth; - + if (clt->permits_map) wait_event(clt->permits_wait, - find_first_bit(clt->permits_map, sz) >= sz); - } - kfree(clt->permits_map); + bitmap_empty(clt->permits_map, clt->queue_depth)); + + bitmap_free(clt->permits_map); clt->permits_map = NULL; kfree(clt->permits); clt->permits = NULL; @@ -2277,8 +2272,7 @@ static void rtrs_clt_remove_path_from_arr(struct rtrs_clt_path *clt_path) * removed. If @sess is the last element, then @next is NULL. */ rcu_read_lock(); - next = list_next_or_null_rr_rcu(&clt->paths_list, &clt_path->s.entry, - typeof(*next), s.entry); + next = rtrs_clt_get_next_path_or_null(&clt->paths_list, clt_path); rcu_read_unlock(); /* diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h index 9a1e5c2ae55c..ac0df734eba8 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-pri.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h @@ -23,6 +23,17 @@ #define RTRS_PROTO_VER_STRING __stringify(RTRS_PROTO_VER_MAJOR) "." \ __stringify(RTRS_PROTO_VER_MINOR) +/* + * Max IB immediate data size is 2^28 (MAX_IMM_PAYL_BITS) + * and the minimum chunk size is 4096 (2^12). + * So the maximum sess_queue_depth is 65536 (2^16) in theory. + * But mempool_create, create_qp and ib_post_send fail with + * "cannot allocate memory" error if sess_queue_depth is too big. + * Therefore the pratical max value of sess_queue_depth is + * somewhere between 1 and 65534 and it depends on the system. + */ +#define MAX_SESS_QUEUE_DEPTH 65535 + enum rtrs_imm_const { MAX_IMM_TYPE_BITS = 4, MAX_IMM_TYPE_MASK = ((1 << MAX_IMM_TYPE_BITS) - 1), @@ -46,16 +57,6 @@ enum { MAX_PATHS_NUM = 128, - /* - * Max IB immediate data size is 2^28 (MAX_IMM_PAYL_BITS) - * and the minimum chunk size is 4096 (2^12). - * So the maximum sess_queue_depth is 65536 (2^16) in theory. - * But mempool_create, create_qp and ib_post_send fail with - * "cannot allocate memory" error if sess_queue_depth is too big. - * Therefore the pratical max value of sess_queue_depth is - * somewhere between 1 and 65534 and it depends on the system. 
- */ - MAX_SESS_QUEUE_DEPTH = 65535, MIN_CHUNK_SIZE = 8192, RTRS_HB_INTERVAL_MS = 5000, diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c index 44b1c1652131..2aff1213a19d 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c @@ -14,9 +14,14 @@ int rtrs_srv_reset_rdma_stats(struct rtrs_srv_stats *stats, bool enable) { if (enable) { - struct rtrs_srv_stats_rdma_stats *r = &stats->rdma_stats; + int cpu; + struct rtrs_srv_stats_rdma_stats *r; + + for_each_possible_cpu(cpu) { + r = per_cpu_ptr(stats->rdma_stats, cpu); + memset(r, 0, sizeof(*r)); + } - memset(r, 0, sizeof(*r)); return 0; } @@ -25,11 +30,22 @@ int rtrs_srv_reset_rdma_stats(struct rtrs_srv_stats *stats, bool enable) ssize_t rtrs_srv_stats_rdma_to_str(struct rtrs_srv_stats *stats, char *page) { - struct rtrs_srv_stats_rdma_stats *r = &stats->rdma_stats; + int cpu; + struct rtrs_srv_stats_rdma_stats sum; + struct rtrs_srv_stats_rdma_stats *r; + + memset(&sum, 0, sizeof(sum)); + + for_each_possible_cpu(cpu) { + r = per_cpu_ptr(stats->rdma_stats, cpu); + + sum.dir[READ].cnt += r->dir[READ].cnt; + sum.dir[READ].size_total += r->dir[READ].size_total; + sum.dir[WRITE].cnt += r->dir[WRITE].cnt; + sum.dir[WRITE].size_total += r->dir[WRITE].size_total; + } - return sysfs_emit(page, "%lld %lld %lld %lldn %u\n", - (s64)atomic64_read(&r->dir[READ].cnt), - (s64)atomic64_read(&r->dir[READ].size_total), - (s64)atomic64_read(&r->dir[WRITE].cnt), - (s64)atomic64_read(&r->dir[WRITE].size_total), 0); + return sysfs_emit(page, "%llu %llu %llu %llu\n", + sum.dir[READ].cnt, sum.dir[READ].size_total, + sum.dir[WRITE].cnt, sum.dir[WRITE].size_total); } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index b94ae12c2795..2a3c9ac64a42 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -220,6 +220,8 @@ static void rtrs_srv_path_stats_release(struct kobject *kobj) stats = container_of(kobj, struct rtrs_srv_stats, kobj_stats); + free_percpu(stats->rdma_stats); + kfree(stats); } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 24024bce2566..34c03bde5064 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -11,7 +11,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt #include <linux/module.h> -#include <linux/mempool.h> #include "rtrs-srv.h" #include "rtrs-log.h" @@ -26,11 +25,7 @@ MODULE_LICENSE("GPL"); #define DEFAULT_SESS_QUEUE_DEPTH 512 #define MAX_HDR_SIZE PAGE_SIZE -/* We guarantee to serve 10 paths at least */ -#define CHUNK_POOL_SZ 10 - static struct rtrs_rdma_dev_pd dev_pd; -static mempool_t *chunk_pool; struct class *rtrs_dev_class; static struct rtrs_srv_ib_ctx ib_ctx; @@ -1358,7 +1353,7 @@ static void free_srv(struct rtrs_srv_sess *srv) WARN_ON(refcount_read(&srv->refcount)); for (i = 0; i < srv->queue_depth; i++) - mempool_free(srv->chunks[i], chunk_pool); + __free_pages(srv->chunks[i], get_order(max_chunk_size)); kfree(srv->chunks); mutex_destroy(&srv->paths_mutex); mutex_destroy(&srv->paths_ev_mutex); @@ -1411,7 +1406,8 @@ static struct rtrs_srv_sess *get_or_create_srv(struct rtrs_srv_ctx *ctx, goto err_free_srv; for (i = 0; i < srv->queue_depth; i++) { - srv->chunks[i] = mempool_alloc(chunk_pool, GFP_KERNEL); + srv->chunks[i] = alloc_pages(GFP_KERNEL, + get_order(max_chunk_size)); if (!srv->chunks[i]) 
goto err_free_chunks; } @@ -1424,7 +1420,7 @@ static struct rtrs_srv_sess *get_or_create_srv(struct rtrs_srv_ctx *ctx, err_free_chunks: while (i--) - mempool_free(srv->chunks[i], chunk_pool); + __free_pages(srv->chunks[i], get_order(max_chunk_size)); kfree(srv->chunks); err_free_srv: @@ -1513,6 +1509,7 @@ static void free_path(struct rtrs_srv_path *srv_path) kobject_del(&srv_path->kobj); kobject_put(&srv_path->kobj); } else { + free_percpu(srv_path->stats->rdma_stats); kfree(srv_path->stats); kfree(srv_path); } @@ -1755,13 +1752,17 @@ static struct rtrs_srv_path *__alloc_path(struct rtrs_srv_sess *srv, if (!srv_path->stats) goto err_free_sess; + srv_path->stats->rdma_stats = alloc_percpu(struct rtrs_srv_stats_rdma_stats); + if (!srv_path->stats->rdma_stats) + goto err_free_stats; + srv_path->stats->srv_path = srv_path; srv_path->dma_addr = kcalloc(srv->queue_depth, sizeof(*srv_path->dma_addr), GFP_KERNEL); if (!srv_path->dma_addr) - goto err_free_stats; + goto err_free_percpu; srv_path->s.con = kcalloc(con_num, sizeof(*srv_path->s.con), GFP_KERNEL); @@ -1813,6 +1814,8 @@ err_free_con: kfree(srv_path->s.con); err_free_dma_addr: kfree(srv_path->dma_addr); +err_free_percpu: + free_percpu(srv_path->stats->rdma_stats); err_free_stats: kfree(srv_path->stats); err_free_sess: @@ -2266,14 +2269,10 @@ static int __init rtrs_server_init(void) err); return err; } - chunk_pool = mempool_create_page_pool(sess_queue_depth * CHUNK_POOL_SZ, - get_order(max_chunk_size)); - if (!chunk_pool) - return -ENOMEM; rtrs_dev_class = class_create(THIS_MODULE, "rtrs-server"); if (IS_ERR(rtrs_dev_class)) { err = PTR_ERR(rtrs_dev_class); - goto out_chunk_pool; + goto out_err; } rtrs_wq = alloc_workqueue("rtrs_server_wq", 0, 0); if (!rtrs_wq) { @@ -2285,9 +2284,7 @@ static int __init rtrs_server_init(void) out_dev_class: class_destroy(rtrs_dev_class); -out_chunk_pool: - mempool_destroy(chunk_pool); - +out_err: return err; } @@ -2295,7 +2292,6 @@ static void __exit rtrs_server_exit(void) { destroy_workqueue(rtrs_wq); class_destroy(rtrs_dev_class); - mempool_destroy(chunk_pool); rtrs_rdma_dev_pd_deinit(&dev_pd); } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.h b/drivers/infiniband/ulp/rtrs/rtrs-srv.h index 6292e87f6afd..186a63c217df 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.h @@ -12,6 +12,7 @@ #include <linux/device.h> #include <linux/refcount.h> +#include <linux/percpu.h> #include "rtrs-pri.h" /* @@ -29,15 +30,15 @@ enum rtrs_srv_state { */ struct rtrs_srv_stats_rdma_stats { struct { - atomic64_t cnt; - atomic64_t size_total; + u64 cnt; + u64 size_total; } dir[2]; }; struct rtrs_srv_stats { - struct kobject kobj_stats; - struct rtrs_srv_stats_rdma_stats rdma_stats; - struct rtrs_srv_path *srv_path; + struct kobject kobj_stats; + struct rtrs_srv_stats_rdma_stats __percpu *rdma_stats; + struct rtrs_srv_path *srv_path; }; struct rtrs_srv_con { @@ -130,8 +131,8 @@ void close_path(struct rtrs_srv_path *srv_path); static inline void rtrs_srv_update_rdma_stats(struct rtrs_srv_stats *s, size_t size, int d) { - atomic64_inc(&s->rdma_stats.dir[d].cnt); - atomic64_add(size, &s->rdma_stats.dir[d].size_total); + this_cpu_inc(s->rdma_stats->dir[d].cnt); + this_cpu_add(s->rdma_stats->dir[d].size_total, size); } /* functions which are implemented in rtrs-srv-stats.c */ diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index d989f030fae0..5b18e2e36ee6 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -108,6 +108,7 @@ struct rdma_cm_id { enum 
rdma_ucm_port_space ps; enum ib_qp_type qp_type; u32 port_num; + struct work_struct net_work; }; struct rdma_cm_id *
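The alloc_permits()/free_permits() hunks in rtrs-clt.c above move the client permit map from an open-coded array of longs to the kernel bitmap API. Below is a minimal standalone sketch of that pattern only; it is not code from the patch, and the demo_* helpers and their parameters are invented for illustration.

#include <linux/bitmap.h>
#include <linux/gfp.h>
#include <linux/types.h>

/* One bit per queue slot, zero-initialized; stands in for the old
 * kcalloc(BITS_TO_LONGS(queue_depth), sizeof(long), GFP_KERNEL).
 */
static unsigned long *demo_alloc_permits(unsigned int queue_depth)
{
	return bitmap_zalloc(queue_depth, GFP_KERNEL);
}

/* True when every permit has been returned (no bit set); this is the
 * same condition the old find_first_bit(map, n) >= n test expressed.
 */
static bool demo_permits_idle(const unsigned long *map,
			      unsigned int queue_depth)
{
	return bitmap_empty(map, queue_depth);
}

static void demo_free_permits(unsigned long *map)
{
	bitmap_free(map);	/* pairs with bitmap_zalloc() */
}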
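Similarly, the rtrs-srv hunks replace the shared atomic64 RDMA counters with per-CPU counters that are only folded together when the stats are read. A minimal sketch of that update/aggregate pattern follows, again with made-up demo_* names rather than anything taken from the patch; allocation and teardown would use alloc_percpu() and free_percpu() as the patch does.

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/string.h>
#include <linux/types.h>

struct demo_rdma_stats {
	u64 cnt;
	u64 size_total;
};

/* Hot path: preemption-safe, lock-free update of this CPU's copy. */
static inline void demo_update(struct demo_rdma_stats __percpu *stats,
			       size_t size)
{
	this_cpu_inc(stats->cnt);
	this_cpu_add(stats->size_total, size);
}

/* Slow path (e.g. a sysfs read): fold all per-CPU copies into one total. */
static void demo_sum(struct demo_rdma_stats __percpu *stats,
		     struct demo_rdma_stats *sum)
{
	int cpu;

	memset(sum, 0, sizeof(*sum));
	for_each_possible_cpu(cpu) {
		struct demo_rdma_stats *s = per_cpu_ptr(stats, cpu);

		sum->cnt += s->cnt;
		sum->size_total += s->size_total;
	}
}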