Diffstat (limited to 'drivers/nvme/host/rdma.c')
-rw-r--r--	drivers/nvme/host/rdma.c	280
1 file changed, 137 insertions, 143 deletions
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 4f9bf2f815c3..2a0bba7f50cf 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <rdma/mr_pool.h>
 #include <linux/err.h>
 #include <linux/string.h>
 #include <linux/atomic.h>
@@ -59,6 +60,9 @@ struct nvme_rdma_request {
 	struct nvme_request	req;
 	struct ib_mr		*mr;
 	struct nvme_rdma_qe	sqe;
+	union nvme_result	result;
+	__le16			status;
+	refcount_t		ref;
 	struct ib_sge		sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
 	u32			num_sge;
 	int			nents;
@@ -73,11 +77,11 @@ struct nvme_rdma_request {
 enum nvme_rdma_queue_flags {
 	NVME_RDMA_Q_ALLOCATED		= 0,
 	NVME_RDMA_Q_LIVE		= 1,
+	NVME_RDMA_Q_TR_READY		= 2,
 };
 
 struct nvme_rdma_queue {
 	struct nvme_rdma_qe	*rsp_ring;
-	atomic_t		sig_count;
 	int			queue_size;
 	size_t			cmnd_capsule_len;
 	struct nvme_rdma_ctrl	*ctrl;
@@ -258,32 +262,6 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
 	return ret;
 }
 
-static int nvme_rdma_reinit_request(void *data, struct request *rq)
-{
-	struct nvme_rdma_ctrl *ctrl = data;
-	struct nvme_rdma_device *dev = ctrl->device;
-	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
-	int ret = 0;
-
-	if (WARN_ON_ONCE(!req->mr))
-		return 0;
-
-	ib_dereg_mr(req->mr);
-
-	req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
-			ctrl->max_fr_pages);
-	if (IS_ERR(req->mr)) {
-		ret = PTR_ERR(req->mr);
-		req->mr = NULL;
-		goto out;
-	}
-
-	req->mr->need_inval = false;
-
-out:
-	return ret;
-}
-
 static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
 		struct request *rq, unsigned int hctx_idx)
 {
@@ -293,9 +271,6 @@ static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
 	struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
 	struct nvme_rdma_device *dev = queue->device;
 
-	if (req->mr)
-		ib_dereg_mr(req->mr);
-
 	nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
 			DMA_TO_DEVICE);
 }
@@ -317,21 +292,9 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
 	if (ret)
 		return ret;
 
-	req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
-			ctrl->max_fr_pages);
-	if (IS_ERR(req->mr)) {
-		ret = PTR_ERR(req->mr);
-		goto out_free_qe;
-	}
-
 	req->queue = queue;
 
 	return 0;
-
-out_free_qe:
-	nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
-			DMA_TO_DEVICE);
-	return -ENOMEM;
 }
 
 static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@@ -428,10 +391,23 @@ out_err:
 static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
 {
-	struct nvme_rdma_device *dev = queue->device;
-	struct ib_device *ibdev = dev->dev;
+	struct nvme_rdma_device *dev;
+	struct ib_device *ibdev;
 
-	rdma_destroy_qp(queue->cm_id);
+	if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags))
+		return;
+
+	dev = queue->device;
+	ibdev = dev->dev;
+
+	ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
+
+	/*
+	 * The cm_id object might have been destroyed during RDMA connection
+	 * establishment error flow to avoid getting other cma events, thus
+	 * the destruction of the QP shouldn't use rdma_cm API.
+	 */
+	ib_destroy_qp(queue->qp);
 	ib_free_cq(queue->ib_cq);
 
 	nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
@@ -440,6 +416,12 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
 	nvme_rdma_dev_put(dev);
 }
 
+static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
+{
+	return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
+		     ibdev->attrs.max_fast_reg_page_list_len);
+}
+
 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 {
 	struct ib_device *ibdev;
@@ -482,8 +464,24 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 		goto out_destroy_qp;
 	}
 
+	ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
+			      queue->queue_size,
+			      IB_MR_TYPE_MEM_REG,
+			      nvme_rdma_get_max_fr_pages(ibdev));
+	if (ret) {
+		dev_err(queue->ctrl->ctrl.device,
+			"failed to initialize MR pool sized %d for QID %d\n",
+			queue->queue_size, idx);
+		goto out_destroy_ring;
+	}
+
+	set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);
+
 	return 0;
 
+out_destroy_ring:
+	nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
+			    sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 out_destroy_qp:
 	rdma_destroy_qp(queue->cm_id);
 out_destroy_ib_cq:
@@ -510,7 +508,6 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
 		queue->cmnd_capsule_len = sizeof(struct nvme_command);
 
 	queue->queue_size = queue_size;
-	atomic_set(&queue->sig_count, 0);
 
 	queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
 			RDMA_PS_TCP, IB_QPT_RC);
@@ -546,6 +543,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
 
 out_destroy_cm_id:
 	rdma_destroy_id(queue->cm_id);
+	nvme_rdma_destroy_queue_ib(queue);
 	return ret;
 }
 
@@ -756,8 +754,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 
 	ctrl->device = ctrl->queues[0].device;
 
-	ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
-		ctrl->device->dev->attrs.max_fast_reg_page_list_len);
+	ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
 
 	if (new) {
 		ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
@@ -771,10 +768,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 			error = PTR_ERR(ctrl->ctrl.admin_q);
 			goto out_free_tagset;
 		}
-	} else {
-		error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
-		if (error)
-			goto out_free_queue;
 	}
 
 	error = nvme_rdma_start_queue(ctrl, 0);
@@ -854,10 +847,6 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 			goto out_free_tag_set;
 		}
 	} else {
-		ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
-		if (ret)
-			goto out_free_io_queues;
-
 		blk_mq_update_nr_hw_queues(&ctrl->tag_set,
 			ctrl->ctrl.queue_count - 1);
 	}
@@ -985,12 +974,18 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 	nvme_start_queues(&ctrl->ctrl);
 
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+		/* state change failure should never happen */
+		WARN_ON_ONCE(1);
+		return;
+	}
+
 	nvme_rdma_reconnect_or_remove(ctrl);
 }
 
 static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
 {
-	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
 		return;
 
 	queue_work(nvme_wq, &ctrl->err_work);
@@ -1018,8 +1013,18 @@ static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
 
 static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-	if (unlikely(wc->status != IB_WC_SUCCESS))
+	struct nvme_rdma_request *req =
+		container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
+	struct request *rq = blk_mq_rq_from_pdu(req);
+
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
+		return;
+	}
+
+	if (refcount_dec_and_test(&req->ref))
+		nvme_end_request(rq, req->status, req->result);
+
 }
 
 static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
@@ -1030,7 +1035,7 @@ static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
 		.opcode		    = IB_WR_LOCAL_INV,
 		.next		    = NULL,
 		.num_sge	    = 0,
-		.send_flags	    = 0,
+		.send_flags	    = IB_SEND_SIGNALED,
 		.ex.invalidate_rkey = req->mr->rkey,
 	};
@@ -1044,22 +1049,15 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
 		struct request *rq)
 {
 	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
-	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
 	struct nvme_rdma_device *dev = queue->device;
 	struct ib_device *ibdev = dev->dev;
-	int res;
 
 	if (!blk_rq_bytes(rq))
 		return;
 
-	if (req->mr->need_inval && test_bit(NVME_RDMA_Q_LIVE, &req->queue->flags)) {
-		res = nvme_rdma_inv_rkey(queue, req);
-		if (unlikely(res < 0)) {
-			dev_err(ctrl->ctrl.device,
-				"Queueing INV WR for rkey %#x failed (%d)\n",
-				req->mr->rkey, res);
-			nvme_rdma_error_recovery(queue->ctrl);
-		}
+	if (req->mr) {
+		ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
+		req->mr = NULL;
 	}
 
 	ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
@@ -1118,12 +1116,18 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
 	struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
 	int nr;
 
+	req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
+	if (WARN_ON_ONCE(!req->mr))
+		return -EAGAIN;
+
 	/*
 	 * Align the MR to a 4K page size to match the ctrl page size and
	 * the block virtual boundary.
 	 */
 	nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
 	if (unlikely(nr < count)) {
+		ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
+		req->mr = NULL;
 		if (nr < 0)
 			return nr;
 		return -EINVAL;
@@ -1142,8 +1146,6 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
 			     IB_ACCESS_REMOTE_READ |
 			     IB_ACCESS_REMOTE_WRITE;
 
-	req->mr->need_inval = true;
-
 	sg->addr = cpu_to_le64(req->mr->iova);
 	put_unaligned_le24(req->mr->length, sg->length);
 	put_unaligned_le32(req->mr->rkey, sg->key);
@@ -1163,7 +1165,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 
 	req->num_sge = 1;
 	req->inline_data = false;
-	req->mr->need_inval = false;
+	refcount_set(&req->ref, 2); /* send and recv completions */
 
 	c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -1200,25 +1202,24 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 
 static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-	if (unlikely(wc->status != IB_WC_SUCCESS))
-		nvme_rdma_wr_error(cq, wc, "SEND");
-}
+	struct nvme_rdma_qe *qe =
+		container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
+	struct nvme_rdma_request *req =
+		container_of(qe, struct nvme_rdma_request, sqe);
+	struct request *rq = blk_mq_rq_from_pdu(req);
 
-/*
- * We want to signal completion at least every queue depth/2.  This returns the
- * largest power of two that is not above half of (queue size + 1) to optimize
- * (avoid divisions).
- */
-static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
-{
-	int limit = 1 << ilog2((queue->queue_size + 1) / 2);
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		nvme_rdma_wr_error(cq, wc, "SEND");
+		return;
+	}
 
-	return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0;
+	if (refcount_dec_and_test(&req->ref))
+		nvme_end_request(rq, req->status, req->result);
 }
 
 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 		struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
-		struct ib_send_wr *first, bool flush)
+		struct ib_send_wr *first)
 {
 	struct ib_send_wr wr, *bad_wr;
 	int ret;
@@ -1227,31 +1228,12 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 	sge->length = sizeof(struct nvme_command),
 	sge->lkey   = queue->device->pd->local_dma_lkey;
 
-	qe->cqe.done = nvme_rdma_send_done;
-
 	wr.next       = NULL;
 	wr.wr_cqe     = &qe->cqe;
 	wr.sg_list    = sge;
 	wr.num_sge    = num_sge;
 	wr.opcode     = IB_WR_SEND;
-	wr.send_flags = 0;
-
-	/*
-	 * Unsignalled send completions are another giant desaster in the
-	 * IB Verbs spec:  If we don't regularly post signalled sends
-	 * the send queue will fill up and only a QP reset will rescue us.
-	 * Would have been way to obvious to handle this in hardware or
-	 * at least the RDMA stack..
-	 *
-	 * Always signal the flushes. The magic request used for the flush
-	 * sequencer is not allocated in our driver's tagset and it's
-	 * triggered to be freed by blk_cleanup_queue(). So we need to
-	 * always mark it as signaled to ensure that the "wr_cqe", which is
-	 * embedded in request's payload, is not freed when __ib_process_cq()
-	 * calls wr_cqe->done().
-	 */
-	if (nvme_rdma_queue_sig_limit(queue) || flush)
-		wr.send_flags |= IB_SEND_SIGNALED;
+	wr.send_flags = IB_SEND_SIGNALED;
 
 	if (first)
 		first->next = &wr;
@@ -1301,6 +1283,12 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
 	return queue->ctrl->tag_set.tags[queue_idx - 1];
 }
 
+static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	if (unlikely(wc->status != IB_WC_SUCCESS))
+		nvme_rdma_wr_error(cq, wc, "ASYNC");
+}
+
 static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
 {
 	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
@@ -1319,10 +1307,12 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
 	cmd->common.flags |= NVME_CMD_SGL_METABUF;
 	nvme_rdma_set_sg_null(cmd);
 
+	sqe->cqe.done = nvme_rdma_async_done;
+
 	ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
 			DMA_TO_DEVICE);
 
-	ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
+	ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
 	WARN_ON_ONCE(ret);
 }
@@ -1343,14 +1333,34 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
 	}
 	req = blk_mq_rq_to_pdu(rq);
 
-	if (rq->tag == tag)
-		ret = 1;
+	req->status = cqe->status;
+	req->result = cqe->result;
 
-	if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
-	    wc->ex.invalidate_rkey == req->mr->rkey)
-		req->mr->need_inval = false;
+	if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
+		if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) {
+			dev_err(queue->ctrl->ctrl.device,
+				"Bogus remote invalidation for rkey %#x\n",
+				req->mr->rkey);
+			nvme_rdma_error_recovery(queue->ctrl);
+		}
+	} else if (req->mr) {
+		ret = nvme_rdma_inv_rkey(queue, req);
+		if (unlikely(ret < 0)) {
+			dev_err(queue->ctrl->ctrl.device,
+				"Queueing INV WR for rkey %#x failed (%d)\n",
+				req->mr->rkey, ret);
+			nvme_rdma_error_recovery(queue->ctrl);
+		}
+		/* the local invalidation completion will end the request */
+		return 0;
+	}
+
+	if (refcount_dec_and_test(&req->ref)) {
+		if (rq->tag == tag)
+			ret = 1;
+		nvme_end_request(rq, req->status, req->result);
+	}
 
-	nvme_end_request(rq, cqe->status, cqe->result);
 	return ret;
 }
@@ -1591,31 +1601,11 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
  * We cannot accept any other command until the Connect command has completed.
  */
 static inline blk_status_t
-nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, struct request *rq)
-{
-	if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
-		struct nvme_command *cmd = nvme_req(rq)->cmd;
-
-		if (!blk_rq_is_passthrough(rq) ||
-		    cmd->common.opcode != nvme_fabrics_command ||
-		    cmd->fabrics.fctype != nvme_fabrics_type_connect) {
-			/*
-			 * reconnecting state means transport disruption, which
-			 * can take a long time and even might fail permanently,
-			 * fail fast to give upper layers a chance to failover.
-			 * deleting state means that the ctrl will never accept
-			 * commands again, fail it permanently.
-			 */
-			if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING ||
-			    queue->ctrl->ctrl.state == NVME_CTRL_DELETING) {
-				nvme_req(rq)->status = NVME_SC_ABORT_REQ;
-				return BLK_STS_IOERR;
-			}
-			return BLK_STS_RESOURCE; /* try again later */
-		}
-	}
-
-	return 0;
+nvme_rdma_is_ready(struct nvme_rdma_queue *queue, struct request *rq)
+{
+	if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags)))
+		return nvmf_check_init_req(&queue->ctrl->ctrl, rq);
+	return BLK_STS_OK;
 }
 
 static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -1627,14 +1617,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 	struct nvme_rdma_qe *sqe = &req->sqe;
 	struct nvme_command *c = sqe->data;
-	bool flush = false;
 	struct ib_device *dev;
 	blk_status_t ret;
 	int err;
 
 	WARN_ON_ONCE(rq->tag < 0);
 
-	ret = nvme_rdma_queue_is_ready(queue, rq);
+	ret = nvme_rdma_is_ready(queue, rq);
 	if (unlikely(ret))
 		return ret;
@@ -1656,13 +1645,13 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 		goto err;
 	}
 
+	sqe->cqe.done = nvme_rdma_send_done;
+
 	ib_dma_sync_single_for_device(dev, sqe->dma,
 			sizeof(struct nvme_command), DMA_TO_DEVICE);
 
-	if (req_op(rq) == REQ_OP_FLUSH)
-		flush = true;
 	err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
-			req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
+			req->mr ? &req->reg_wr.wr : NULL);
 	if (unlikely(err)) {
 		nvme_rdma_unmap_data(queue, rq);
 		goto err;
 	}
@@ -1770,6 +1759,12 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
 	nvme_stop_ctrl(&ctrl->ctrl);
 	nvme_rdma_shutdown_ctrl(ctrl, false);
 
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+		/* state change failure should never happen */
+		WARN_ON_ONCE(1);
+		return;
+	}
+
 	ret = nvme_rdma_configure_admin_queue(ctrl, false);
 	if (ret)
 		goto out_fail;
@@ -1810,7 +1805,6 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
 	.submit_async_event	= nvme_rdma_submit_async_event,
 	.delete_ctrl		= nvme_rdma_delete_ctrl,
 	.get_address		= nvmf_get_address,
-	.reinit_request		= nvme_rdma_reinit_request,
 };
 
 static inline bool
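
Two patterns in the diff above are worth spelling out. The first is the completion rework: each request now starts with two references (refcount_set(&req->ref, 2)), and the send completion and the receive completion each drop one, so nvme_end_request() runs exactly once regardless of which completion arrives last. When a local invalidation is queued instead, its completion inherits the second drop, and the receive path latches cqe->status and cqe->result into the request before releasing its reference. Below is a minimal userspace model of that pattern using C11 atomics and pthreads; the type and helper names are illustrative stand-ins, not the driver's:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Illustrative stand-in for the driver's nvme_rdma_request. */
struct request_model {
	atomic_int ref;		/* models refcount_t ref, set to 2 at submission */
	int status;		/* models req->status, latched by the recv path */
};

/* Models nvme_end_request(): must run exactly once per request. */
static void end_request(struct request_model *req)
{
	printf("request ended once, status %d\n", req->status);
}

/* Both completion paths funnel through one decrement; only the thread
 * that drops the last reference ends the request. */
static void put_ref(struct request_model *req)
{
	if (atomic_fetch_sub_explicit(&req->ref, 1, memory_order_acq_rel) == 1)
		end_request(req);
}

static void *send_done(void *arg)	/* models nvme_rdma_send_done() */
{
	put_ref(arg);
	return NULL;
}

static void *recv_done(void *arg)	/* models the response (or LOCAL_INV) completion */
{
	struct request_model *req = arg;
	req->status = 0;	/* latch status/result before dropping the ref */
	put_ref(req);
	return NULL;
}

int main(void)
{
	struct request_model req;
	atomic_init(&req.ref, 2);	/* models refcount_set(&req->ref, 2) */
	req.status = -1;

	pthread_t s, r;
	pthread_create(&s, NULL, send_done, &req);
	pthread_create(&r, NULL, recv_done, &req);
	pthread_join(s, NULL);
	pthread_join(r, NULL);
	return 0;
}

Compile with cc -pthread. Whichever thread performs the final decrement prints exactly once; that is the invariant the driver needs now that every send is signaled and a send completion can race the response.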
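The second pattern is the MR lifecycle: per-request ib_alloc_mr()/ib_dereg_mr() (and the reinit_request rebuild path) give way to a per-QP pool, where ib_mr_pool_init() preallocates queue_size MRs at queue creation, nvme_rdma_map_sg_fr() takes one with ib_mr_pool_get() and treats an empty pool as -EAGAIN, and nvme_rdma_unmap_data() returns it with ib_mr_pool_put(). Semantically the pool behaves like a locked free list; here is a rough userspace sketch of that behavior, simplified and not the kernel implementation:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct ib_mr; only the free-list linkage matters here. */
struct mr {
	struct mr *next;
};

struct mr_pool {
	pthread_mutex_t lock;
	struct mr *free_list;
};

/* Models ib_mr_pool_init(): allocate all nr entries up front, so the
 * I/O hot path never allocates and can only fail by running dry. */
static int mr_pool_init(struct mr_pool *pool, int nr)
{
	pthread_mutex_init(&pool->lock, NULL);
	pool->free_list = NULL;
	for (int i = 0; i < nr; i++) {
		struct mr *mr = calloc(1, sizeof(*mr));
		if (!mr)
			return -1;
		mr->next = pool->free_list;
		pool->free_list = mr;
	}
	return 0;
}

/* Models ib_mr_pool_get(): NULL means the pool is exhausted, which the
 * driver maps to -EAGAIN (back off and retry) rather than a hard error. */
static struct mr *mr_pool_get(struct mr_pool *pool)
{
	pthread_mutex_lock(&pool->lock);
	struct mr *mr = pool->free_list;
	if (mr)
		pool->free_list = mr->next;
	pthread_mutex_unlock(&pool->lock);
	return mr;
}

/* Models ib_mr_pool_put(): push the MR back for the next request. */
static void mr_pool_put(struct mr_pool *pool, struct mr *mr)
{
	pthread_mutex_lock(&pool->lock);
	mr->next = pool->free_list;
	pool->free_list = mr;
	pthread_mutex_unlock(&pool->lock);
}

int main(void)
{
	struct mr_pool pool;
	if (mr_pool_init(&pool, 128))	/* queue_size entries, as in the patch */
		return 1;
	struct mr *mr = mr_pool_get(&pool);	/* map path */
	mr_pool_put(&pool, mr);			/* unmap path */
	printf("got and returned one MR\n");
	return 0;	/* teardown (ib_mr_pool_destroy analogue) omitted for brevity */
}

The design payoff mirrored here is that MR allocation failures move out of the per-request fast path and into queue setup, which is also why the patch can delete the reinit_request hook and the reconnect-time tagset reinitialization.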