Diffstat (limited to 'drivers/nvme/host/tcp.c')
-rw-r--r--	drivers/nvme/host/tcp.c	144
1 file changed, 101 insertions(+), 43 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 606b13d35d16..4ffd5957637a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -13,6 +13,7 @@
 #include <net/tcp.h>
 #include <linux/blk-mq.h>
 #include <crypto/hash.h>
+#include <net/busy_poll.h>
 
 #include "nvme.h"
 #include "fabrics.h"
@@ -72,6 +73,7 @@ struct nvme_tcp_queue {
 	int			pdu_offset;
 	size_t			data_remaining;
 	size_t			ddgst_remaining;
+	unsigned int		nr_cqe;
 
 	/* send state */
 	struct nvme_tcp_request *request;
@@ -438,6 +440,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 	}
 
 	nvme_end_request(rq, cqe->status, cqe->result);
+	queue->nr_cqe++;
 	return 0;
 }
 
@@ -608,23 +611,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 
 	switch (hdr->type) {
 	case nvme_tcp_c2h_data:
-		ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
-		break;
+		return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
 	case nvme_tcp_rsp:
 		nvme_tcp_init_recv_ctx(queue);
-		ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
-		break;
+		return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
 	case nvme_tcp_r2t:
 		nvme_tcp_init_recv_ctx(queue);
-		ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
-		break;
+		return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
 	default:
 		dev_err(queue->ctrl->ctrl.device,
 			"unsupported pdu type (%d)\n", hdr->type);
 		return -EINVAL;
 	}
-
-	return ret;
 }
 
 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
@@ -701,8 +699,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 			nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
 			queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
 		} else {
-			if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS)
+			if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 				nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+				queue->nr_cqe++;
+			}
 			nvme_tcp_init_recv_ctx(queue);
 		}
 	}
@@ -742,6 +742,7 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
 						pdu->command_id);
 
 		nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+		queue->nr_cqe++;
 	}
 
 	nvme_tcp_init_recv_ctx(queue);
@@ -841,7 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
 
 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 {
-	nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR);
+	nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
 }
 
 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
@@ -1023,14 +1024,16 @@ done:
 
 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
 {
-	struct sock *sk = queue->sock->sk;
+	struct socket *sock = queue->sock;
+	struct sock *sk = sock->sk;
 	read_descriptor_t rd_desc;
 	int consumed;
 
 	rd_desc.arg.data = queue;
 	rd_desc.count = 1;
 	lock_sock(sk);
-	consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
+	queue->nr_cqe = 0;
+	consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
 	release_sock(sk);
 	return consumed;
 }
@@ -1255,7 +1258,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	queue->queue_size = queue_size;
 
 	if (qid > 0)
-		queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
+		queue->cmnd_capsule_len = nctrl->ioccsz * 16;
 	else
 		queue->cmnd_capsule_len = sizeof(struct nvme_command) +
 						NVME_TCP_ADMIN_CCSZ;
@@ -1263,7 +1266,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
 			IPPROTO_TCP, &queue->sock);
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
 			"failed to create socket: %d\n", ret);
 		return ret;
 	}
@@ -1273,7 +1276,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
 			(char *)&opt, sizeof(opt));
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
 			"failed to set TCP_SYNCNT sock opt %d\n", ret);
 		goto err_sock;
 	}
@@ -1283,7 +1286,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
 			TCP_NODELAY, (char *)&opt, sizeof(opt));
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
 			"failed to set TCP_NODELAY sock opt %d\n", ret);
 		goto err_sock;
 	}
@@ -1296,11 +1299,23 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
 			(char *)&sol, sizeof(sol));
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
 			"failed to set SO_LINGER sock opt %d\n", ret);
 		goto err_sock;
 	}
 
+	/* Set socket type of service */
+	if (nctrl->opts->tos >= 0) {
+		opt = nctrl->opts->tos;
+		ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
+				(char *)&opt, sizeof(opt));
+		if (ret) {
+			dev_err(nctrl->device,
+				"failed to set IP_TOS sock opt %d\n", ret);
+			goto err_sock;
+		}
+	}
+
 	queue->sock->sk->sk_allocation = GFP_ATOMIC;
 	if (!qid)
 		n = 0;
@@ -1314,11 +1329,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	queue->pdu_offset = 0;
 	sk_set_memalloc(queue->sock->sk);
 
-	if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
+	if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
 		ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
 			sizeof(ctrl->src_addr));
 		if (ret) {
-			dev_err(ctrl->ctrl.device,
+			dev_err(nctrl->device,
 				"failed to bind queue %d socket %d\n",
 				qid, ret);
 			goto err_sock;
@@ -1330,7 +1345,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	if (queue->hdr_digest || queue->data_digest) {
 		ret = nvme_tcp_alloc_crypto(queue);
 		if (ret) {
-			dev_err(ctrl->ctrl.device,
+			dev_err(nctrl->device,
 				"failed to allocate queue %d crypto\n", qid);
 			goto err_sock;
 		}
@@ -1344,13 +1359,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 		goto err_crypto;
 	}
 
-	dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
+	dev_dbg(nctrl->device, "connecting queue %d\n",
 			nvme_tcp_queue_id(queue));
 
 	ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
 		sizeof(ctrl->addr), 0);
 	if (ret) {
-		dev_err(ctrl->ctrl.device,
+		dev_err(nctrl->device,
 			"failed to connect socket: %d\n", ret);
 		goto err_rcv_pdu;
 	}
@@ -1371,6 +1386,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
 	queue->sock->sk->sk_state_change = nvme_tcp_state_change;
 	queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+	queue->sock->sk->sk_ll_usec = 1;
 	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
 
 	return 0;
@@ -1469,7 +1485,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->driver_data = ctrl;
 		set->nr_hw_queues = nctrl->queue_count - 1;
 		set->timeout = NVME_IO_TIMEOUT;
-		set->nr_maps = 2 /* default + read */;
+		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
 	}
 
 	ret = blk_mq_alloc_tag_set(set);
@@ -1568,6 +1584,7 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
 
 	nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
 	nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
+	nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
 
 	return nr_io_queues;
 }
@@ -1599,6 +1616,12 @@ static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
 			min(opts->nr_io_queues, nr_io_queues);
 		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
 	}
+
+	if (opts->nr_poll_queues && nr_io_queues) {
+		/* map dedicated poll queues only if we have queues left */
+		ctrl->io_queues[HCTX_TYPE_POLL] =
+			min(opts->nr_poll_queues, nr_io_queues);
+	}
 }
 
 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
@@ -1680,6 +1703,7 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
 	nvme_tcp_stop_queue(ctrl, 0);
 	if (remove) {
 		blk_cleanup_queue(ctrl->admin_q);
+		blk_cleanup_queue(ctrl->fabrics_q);
 		blk_mq_free_tag_set(ctrl->admin_tagset);
 	}
 	nvme_tcp_free_admin_queue(ctrl);
@@ -1700,10 +1724,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
 			goto out_free_queue;
 		}
 
+		ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
+		if (IS_ERR(ctrl->fabrics_q)) {
+			error = PTR_ERR(ctrl->fabrics_q);
+			goto out_free_tagset;
+		}
+
 		ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
 		if (IS_ERR(ctrl->admin_q)) {
			error = PTR_ERR(ctrl->admin_q);
-			goto out_free_tagset;
+			goto out_cleanup_fabrics_q;
 		}
 	}
 
@@ -1711,19 +1741,12 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
 	if (error)
 		goto out_cleanup_queue;
 
-	error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
-	if (error) {
-		dev_err(ctrl->device,
-			"prop_get NVME_REG_CAP failed\n");
-		goto out_stop_queue;
-	}
-
-	ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
-
-	error = nvme_enable_ctrl(ctrl, ctrl->cap);
+	error = nvme_enable_ctrl(ctrl);
 	if (error)
 		goto out_stop_queue;
 
+	blk_mq_unquiesce_queue(ctrl->admin_q);
+
 	error = nvme_init_identify(ctrl);
 	if (error)
 		goto out_stop_queue;
@@ -1735,6 +1758,9 @@ out_stop_queue:
 out_cleanup_queue:
 	if (new)
 		blk_cleanup_queue(ctrl->admin_q);
+out_cleanup_fabrics_q:
+	if (new)
+		blk_cleanup_queue(ctrl->fabrics_q);
 out_free_tagset:
 	if (new)
 		blk_mq_free_tag_set(ctrl->admin_tagset);
@@ -1748,10 +1774,13 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
 {
 	blk_mq_quiesce_queue(ctrl->admin_q);
 	nvme_tcp_stop_queue(ctrl, 0);
-	if (ctrl->admin_tagset)
+	if (ctrl->admin_tagset) {
 		blk_mq_tagset_busy_iter(ctrl->admin_tagset,
 			nvme_cancel_request, ctrl);
-	blk_mq_unquiesce_queue(ctrl->admin_q);
+		blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
+	}
+	if (remove)
+		blk_mq_unquiesce_queue(ctrl->admin_q);
 	nvme_tcp_destroy_admin_queue(ctrl, remove);
 }
 
@@ -1762,9 +1791,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
 		return;
 	nvme_stop_queues(ctrl);
 	nvme_tcp_stop_io_queues(ctrl);
-	if (ctrl->tagset)
+	if (ctrl->tagset) {
 		blk_mq_tagset_busy_iter(ctrl->tagset,
 			nvme_cancel_request, ctrl);
+		blk_mq_tagset_wait_completed_request(ctrl->tagset);
+	}
 	if (remove)
 		nvme_start_queues(ctrl);
 	nvme_tcp_destroy_io_queues(ctrl, remove);
@@ -1793,7 +1824,7 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
 {
 	struct nvmf_ctrl_options *opts = ctrl->opts;
-	int ret = -EINVAL;
+	int ret;
 
 	ret = nvme_tcp_configure_admin_queue(ctrl, new);
 	if (ret)
@@ -1876,6 +1907,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 	/* unquiesce to fail fast pending requests */
 	nvme_start_queues(ctrl);
 	nvme_tcp_teardown_admin_queue(ctrl, false);
+	blk_mq_unquiesce_queue(ctrl->admin_q);
 
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
 		/* state change failure is ok if we're in DELETING state */
@@ -1892,10 +1924,11 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
 	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
 
 	nvme_tcp_teardown_io_queues(ctrl, shutdown);
+	blk_mq_quiesce_queue(ctrl->admin_q);
 	if (shutdown)
 		nvme_shutdown_ctrl(ctrl);
 	else
-		nvme_disable_ctrl(ctrl, ctrl->cap);
+		nvme_disable_ctrl(ctrl);
 	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
 }
 
@@ -2151,14 +2184,36 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
 	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
 
+	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
+		/* map dedicated poll queues only if we have queues left */
+		set->map[HCTX_TYPE_POLL].nr_queues =
+				ctrl->io_queues[HCTX_TYPE_POLL];
+		set->map[HCTX_TYPE_POLL].queue_offset =
+			ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+			ctrl->io_queues[HCTX_TYPE_READ];
+		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
+	}
+
 	dev_info(ctrl->ctrl.device,
-		"mapped %d/%d default/read queues.\n",
+		"mapped %d/%d/%d default/read/poll queues.\n",
 		ctrl->io_queues[HCTX_TYPE_DEFAULT],
-		ctrl->io_queues[HCTX_TYPE_READ]);
+		ctrl->io_queues[HCTX_TYPE_READ],
+		ctrl->io_queues[HCTX_TYPE_POLL]);
 
 	return 0;
 }
 
+static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
+{
+	struct nvme_tcp_queue *queue = hctx->driver_data;
+	struct sock *sk = queue->sock->sk;
+
+	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue))
+		sk_busy_loop(sk, true);
+	nvme_tcp_try_recv(queue);
+	return queue->nr_cqe;
+}
+
 static struct blk_mq_ops nvme_tcp_mq_ops = {
 	.queue_rq	= nvme_tcp_queue_rq,
 	.complete	= nvme_complete_rq,
@@ -2167,6 +2222,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = {
 	.init_hctx	= nvme_tcp_init_hctx,
 	.timeout	= nvme_tcp_timeout,
 	.map_queues	= nvme_tcp_map_queues,
+	.poll		= nvme_tcp_poll,
 };
 
 static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
@@ -2220,7 +2276,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
 
 	INIT_LIST_HEAD(&ctrl->list);
 	ctrl->ctrl.opts = opts;
-	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
+	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
+				opts->nr_poll_queues + 1;
 	ctrl->ctrl.sqsize = opts->queue_size - 1;
 	ctrl->ctrl.kato = opts->kato;
@@ -2314,7 +2371,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
 			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
 			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
-			  NVMF_OPT_NR_WRITE_QUEUES,
+			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
+			  NVMF_OPT_TOS,
 	.create_ctrl	= nvme_tcp_create_ctrl,
 };