diff options
Diffstat (limited to 'drivers/infiniband/hw/mlx5/odp.c')
-rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c | 454 |
1 files changed, 421 insertions, 33 deletions
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 4dc6cc640ce0..01e0f6200631 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -37,6 +37,46 @@ #include "mlx5_ib.h" #include "cmd.h" +#include <linux/mlx5/eq.h> + +/* Contains the details of a pagefault. */ +struct mlx5_pagefault { + u32 bytes_committed; + u32 token; + u8 event_subtype; + u8 type; + union { + /* Initiator or send message responder pagefault details. */ + struct { + /* Received packet size, only valid for responders. */ + u32 packet_size; + /* + * Number of resource holding WQE, depends on type. + */ + u32 wq_num; + /* + * WQE index. Refers to either the send queue or + * receive queue, according to event_subtype. + */ + u16 wqe_index; + } wqe; + /* RDMA responder pagefault details */ + struct { + u32 r_key; + /* + * Received packet size, minimal size page fault + * resolution required for forward progress. + */ + u32 packet_size; + u32 rdma_op_len; + u64 rdma_va; + } rdma; + }; + + struct mlx5_ib_pf_eq *eq; + struct work_struct work; +}; + #define MAX_PREFETCH_LEN (4*1024*1024U) /* Timeout in ms to wait for an active mmu notifier to complete when handling @@ -304,14 +344,20 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, { int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? pfault->wqe.wq_num : pfault->token; - int ret = mlx5_core_page_fault_resume(dev->mdev, - pfault->token, - wq_num, - pfault->type, - error); - if (ret) - mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", - wq_num); + u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = { }; + u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = { }; + int err; + + MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); + MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type); + MLX5_SET(page_fault_resume_in, in, token, pfault->token); + MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); + MLX5_SET(page_fault_resume_in, in, error, !!error); + + err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); + if (err) + mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n", + wq_num, err); } static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, @@ -503,12 +549,17 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); } +#define MLX5_PF_FLAGS_PREFETCH BIT(0) +#define MLX5_PF_FLAGS_DOWNGRADE BIT(1) static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, - u64 io_virt, size_t bcnt, u32 *bytes_mapped) + u64 io_virt, size_t bcnt, u32 *bytes_mapped, + u32 flags) { int npages = 0, current_seq, page_shift, ret, np; bool implicit = false; struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); + bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; + bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH; u64 access_mask = ODP_READ_ALLOWED_BIT; u64 start_idx, page_mask; struct ib_umem_odp *odp; @@ -532,7 +583,15 @@ next_mr: page_mask = ~(BIT(page_shift) - 1); start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; - if (mr->umem->writable) + if (prefetch && !downgrade && !mr->umem->writable) { + /* prefetch with write-access must + * be supported by the MR + */ + ret = -EINVAL; + goto out; + } + + if (mr->umem->writable && !downgrade) access_mask |= ODP_WRITE_ALLOWED_BIT; current_seq = READ_ONCE(odp->notifiers_seq); @@ -606,8 +665,8 @@ out: if (!wait_for_completion_timeout( &odp->notifier_completion, timeout)) { - mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n", - current_seq, odp->notifiers_seq); + mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n", + current_seq, odp->notifiers_seq, odp->notifiers_count); } } else { /* The MR is being killed, kill the QP as well. */ @@ -637,12 +696,13 @@ struct pf_frame { * -EFAULT when there's an error mapping the requested pages. The caller will * abort the page fault handling. */ -static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, - u32 key, u64 io_virt, size_t bcnt, +static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key, + u64 io_virt, size_t bcnt, u32 *bytes_committed, - u32 *bytes_mapped) + u32 *bytes_mapped, u32 flags) { int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0; + bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH; struct pf_frame *head = NULL, *frame; struct mlx5_core_mkey *mmkey; struct mlx5_ib_mw *mw; @@ -664,6 +724,12 @@ next_mr: goto srcu_unlock; } + if (prefetch && mmkey->type != MLX5_MKEY_MR) { + mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n"); + ret = -EINVAL; + goto srcu_unlock; + } + switch (mmkey->type) { case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); @@ -673,6 +739,11 @@ next_mr: goto srcu_unlock; } + if (prefetch && !mr->umem->is_odp) { + ret = -EINVAL; + goto srcu_unlock; + } + if (!mr->umem->is_odp) { mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", key); @@ -682,7 +753,7 @@ next_mr: goto srcu_unlock; } - ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped); + ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags); if (ret < 0) goto srcu_unlock; @@ -859,7 +930,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, ret = pagefault_single_data_segment(dev, key, io_virt, bcnt, &pfault->bytes_committed, - bytes_mapped); + bytes_mapped, 0); if (ret < 0) break; npages += ret; @@ -1025,16 +1096,31 @@ invalid_transport_or_opcode: return 0; } -static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev, - u32 wq_num) +static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev, + u32 wq_num, int pf_type) { - struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num); + enum mlx5_res_type res_type; - if (!mqp) { - mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num); + switch (pf_type) { + case MLX5_WQE_PF_TYPE_RMP: + res_type = MLX5_RES_SRQ; + break; + case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE: + case MLX5_WQE_PF_TYPE_RESP: + case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC: + res_type = MLX5_RES_QP; + break; + default: return NULL; } + return mlx5_core_res_hold(dev->mdev, wq_num, res_type); +} + +static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res) +{ + struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res; + return to_mibqp(mqp); } @@ -1048,18 +1134,30 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, int resume_with_error = 1; u16 wqe_index = pfault->wqe.wqe_index; int requestor = pfault->type & MLX5_PFAULT_REQUESTOR; + struct mlx5_core_rsc_common *res; struct mlx5_ib_qp *qp; + res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type); + if (!res) { + mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num); + return; + } + + switch (res->res) { + case MLX5_RES_QP: + qp = res_to_qp(res); + break; + default: + mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n", pfault->type); + goto resolve_page_fault; + } + buffer = (char *)__get_free_page(GFP_KERNEL); if (!buffer) { mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); goto resolve_page_fault; } - qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num); - if (!qp) - goto resolve_page_fault; - ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, PAGE_SIZE, &qp->trans_qp.base); if (ret < 0) { @@ -1099,6 +1197,7 @@ resolve_page_fault: mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n", pfault->wqe.wq_num, resume_with_error, pfault->type); + mlx5_core_res_put(res); free_page((unsigned long)buffer); } @@ -1142,7 +1241,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, } ret = pagefault_single_data_segment(dev, rkey, address, length, - &pfault->bytes_committed, NULL); + &pfault->bytes_committed, NULL, + 0); if (ret == -EAGAIN) { /* We're racing with an invalidation, don't prefetch */ prefetch_activated = 0; @@ -1169,7 +1269,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, ret = pagefault_single_data_segment(dev, rkey, address, prefetch_len, - &bytes_committed, NULL); + &bytes_committed, NULL, + 0); if (ret < 0 && ret != -EAGAIN) { mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", ret, pfault->token, address, prefetch_len); @@ -1177,10 +1278,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, } } -void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, - struct mlx5_pagefault *pfault) +static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) { - struct mlx5_ib_dev *dev = context; u8 event_subtype = pfault->event_subtype; switch (event_subtype) { @@ -1197,6 +1296,203 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, } } +static void mlx5_ib_eqe_pf_action(struct work_struct *work) +{ + struct mlx5_pagefault *pfault = container_of(work, + struct mlx5_pagefault, + work); + struct mlx5_ib_pf_eq *eq = pfault->eq; + + mlx5_ib_pfault(eq->dev, pfault); + mempool_free(pfault, eq->pool); +} + +static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) +{ + struct mlx5_eqe_page_fault *pf_eqe; + struct mlx5_pagefault *pfault; + struct mlx5_eqe *eqe; + int cc = 0; + + while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) { + pfault = mempool_alloc(eq->pool, GFP_ATOMIC); + if (!pfault) { + schedule_work(&eq->work); + break; + } + + pf_eqe = &eqe->data.page_fault; + pfault->event_subtype = eqe->sub_type; + pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed); + + mlx5_ib_dbg(eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n", + eqe->sub_type, pfault->bytes_committed); + + switch (eqe->sub_type) { + case MLX5_PFAULT_SUBTYPE_RDMA: + /* RDMA based event */ + pfault->type = + be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24; + pfault->token = + be32_to_cpu(pf_eqe->rdma.pftype_token) & + MLX5_24BIT_MASK; + pfault->rdma.r_key = + be32_to_cpu(pf_eqe->rdma.r_key); + pfault->rdma.packet_size = + be16_to_cpu(pf_eqe->rdma.packet_length); + pfault->rdma.rdma_op_len = + be32_to_cpu(pf_eqe->rdma.rdma_op_len); + pfault->rdma.rdma_va = + be64_to_cpu(pf_eqe->rdma.rdma_va); + mlx5_ib_dbg(eq->dev, + "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n", + pfault->type, pfault->token, + pfault->rdma.r_key); + mlx5_ib_dbg(eq->dev, + "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n", + pfault->rdma.rdma_op_len, + pfault->rdma.rdma_va); + break; + + case MLX5_PFAULT_SUBTYPE_WQE: + /* WQE based event */ + pfault->type = + (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7; + pfault->token = + be32_to_cpu(pf_eqe->wqe.token); + pfault->wqe.wq_num = + be32_to_cpu(pf_eqe->wqe.pftype_wq) & + MLX5_24BIT_MASK; + pfault->wqe.wqe_index = + be16_to_cpu(pf_eqe->wqe.wqe_index); + pfault->wqe.packet_size = + be16_to_cpu(pf_eqe->wqe.packet_length); + mlx5_ib_dbg(eq->dev, + "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n", + pfault->type, pfault->token, + pfault->wqe.wq_num, + pfault->wqe.wqe_index); + break; + + default: + mlx5_ib_warn(eq->dev, + "Unsupported page fault event sub-type: 0x%02hhx\n", + eqe->sub_type); + /* Unsupported page faults should still be + * resolved by the page fault handler + */ + } + + pfault->eq = eq; + INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action); + queue_work(eq->wq, &pfault->work); + + cc = mlx5_eq_update_cc(eq->core, ++cc); + } + + mlx5_eq_update_ci(eq->core, cc, 1); +} + +static irqreturn_t mlx5_ib_eq_pf_int(int irq, void *eq_ptr) +{ + struct mlx5_ib_pf_eq *eq = eq_ptr; + unsigned long flags; + + if (spin_trylock_irqsave(&eq->lock, flags)) { + mlx5_ib_eq_pf_process(eq); + spin_unlock_irqrestore(&eq->lock, flags); + } else { + schedule_work(&eq->work); + } + + return IRQ_HANDLED; +} + +/* mempool_refill() was proposed but unfortunately wasn't accepted + * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html + * Cheap workaround. + */ +static void mempool_refill(mempool_t *pool) +{ + while (pool->curr_nr < pool->min_nr) + mempool_free(mempool_alloc(pool, GFP_KERNEL), pool); +} + +static void mlx5_ib_eq_pf_action(struct work_struct *work) +{ + struct mlx5_ib_pf_eq *eq = + container_of(work, struct mlx5_ib_pf_eq, work); + + mempool_refill(eq->pool); + + spin_lock_irq(&eq->lock); + mlx5_ib_eq_pf_process(eq); + spin_unlock_irq(&eq->lock); +} + +enum { + MLX5_IB_NUM_PF_EQE = 0x1000, + MLX5_IB_NUM_PF_DRAIN = 64, +}; + +static int +mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) +{ + struct mlx5_eq_param param = {}; + int err; + + INIT_WORK(&eq->work, mlx5_ib_eq_pf_action); + spin_lock_init(&eq->lock); + eq->dev = dev; + + eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN, + sizeof(struct mlx5_pagefault)); + if (!eq->pool) + return -ENOMEM; + + eq->wq = alloc_workqueue("mlx5_ib_page_fault", + WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM, + MLX5_NUM_CMD_EQE); + if (!eq->wq) { + err = -ENOMEM; + goto err_mempool; + } + + param = (struct mlx5_eq_param) { + .index = MLX5_EQ_PFAULT_IDX, + .mask = 1 << MLX5_EVENT_TYPE_PAGE_FAULT, + .nent = MLX5_IB_NUM_PF_EQE, + .context = eq, + .handler = mlx5_ib_eq_pf_int + }; + eq->core = mlx5_eq_create_generic(dev->mdev, "mlx5_ib_page_fault_eq", ¶m); + if (IS_ERR(eq->core)) { + err = PTR_ERR(eq->core); + goto err_wq; + } + + return 0; +err_wq: + destroy_workqueue(eq->wq); +err_mempool: + mempool_destroy(eq->pool); + return err; +} + +static int +mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) +{ + int err; + + err = mlx5_eq_destroy_generic(dev->mdev, eq->core); + cancel_work_sync(&eq->work); + destroy_workqueue(eq->wq); + mempool_destroy(eq->pool); + + return err; +} + void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) { if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) @@ -1223,9 +1519,16 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) } } +static const struct ib_device_ops mlx5_ib_dev_odp_ops = { + .advise_mr = mlx5_ib_advise_mr, +}; + int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) { - int ret; + int ret = 0; + + if (dev->odp_caps.general_caps & IB_ODP_SUPPORT) + ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops); if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); @@ -1235,7 +1538,20 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) } } - return 0; + if (!MLX5_CAP_GEN(dev->mdev, pg)) + return ret; + + ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq); + + return ret; +} + +void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev) +{ + if (!MLX5_CAP_GEN(dev->mdev, pg)) + return; + + mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq); } int mlx5_ib_odp_init(void) @@ -1246,3 +1562,75 @@ int mlx5_ib_odp_init(void) return 0; } +struct prefetch_mr_work { + struct work_struct work; + struct mlx5_ib_dev *dev; + u32 pf_flags; + u32 num_sge; + struct ib_sge sg_list[0]; +}; + +static int mlx5_ib_prefetch_sg_list(struct mlx5_ib_dev *dev, u32 pf_flags, + struct ib_sge *sg_list, u32 num_sge) +{ + int i; + + for (i = 0; i < num_sge; ++i) { + struct ib_sge *sg = &sg_list[i]; + int bytes_committed = 0; + int ret; + + ret = pagefault_single_data_segment(dev, sg->lkey, sg->addr, + sg->length, + &bytes_committed, NULL, + pf_flags); + if (ret < 0) + return ret; + } + return 0; +} + +static void mlx5_ib_prefetch_mr_work(struct work_struct *work) +{ + struct prefetch_mr_work *w = + container_of(work, struct prefetch_mr_work, work); + + if (w->dev->ib_dev.reg_state == IB_DEV_REGISTERED) + mlx5_ib_prefetch_sg_list(w->dev, w->pf_flags, w->sg_list, + w->num_sge); + + kfree(w); +} + +int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, u32 num_sge) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + u32 pf_flags = MLX5_PF_FLAGS_PREFETCH; + struct prefetch_mr_work *work; + + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH) + pf_flags |= MLX5_PF_FLAGS_DOWNGRADE; + + if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) + return mlx5_ib_prefetch_sg_list(dev, pf_flags, sg_list, + num_sge); + + if (dev->ib_dev.reg_state != IB_DEV_REGISTERED) + return -ENODEV; + + work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL); + if (!work) + return -ENOMEM; + + memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge)); + + work->dev = dev; + work->pf_flags = pf_flags; + work->num_sge = num_sge; + + INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); + schedule_work(&work->work); + return 0; +} |