aboutsummaryrefslogtreecommitdiff
path: root/drivers/nvme/host/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r--drivers/nvme/host/core.c281
1 files changed, 174 insertions, 107 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 762125f2905f..dfd9dec0c1f6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -57,6 +57,26 @@ static bool force_apst;
module_param(force_apst, bool, 0644);
MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
+static unsigned long apst_primary_timeout_ms = 100;
+module_param(apst_primary_timeout_ms, ulong, 0644);
+MODULE_PARM_DESC(apst_primary_timeout_ms,
+ "primary APST timeout in ms");
+
+static unsigned long apst_secondary_timeout_ms = 2000;
+module_param(apst_secondary_timeout_ms, ulong, 0644);
+MODULE_PARM_DESC(apst_secondary_timeout_ms,
+ "secondary APST timeout in ms");
+
+static unsigned long apst_primary_latency_tol_us = 15000;
+module_param(apst_primary_latency_tol_us, ulong, 0644);
+MODULE_PARM_DESC(apst_primary_latency_tol_us,
+ "primary APST latency tolerance in us");
+
+static unsigned long apst_secondary_latency_tol_us = 100000;
+module_param(apst_secondary_latency_tol_us, ulong, 0644);
+MODULE_PARM_DESC(apst_secondary_latency_tol_us,
+ "secondary APST latency tolerance in us");
+
static bool streams;
module_param(streams, bool, 0644);
MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
@@ -589,6 +609,7 @@ EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
static inline void nvme_clear_nvme_request(struct request *req)
{
+ nvme_req(req)->status = 0;
nvme_req(req)->retries = 0;
nvme_req(req)->flags = 0;
req->rq_flags |= RQF_DONTPREP;
@@ -611,6 +632,8 @@ static inline void nvme_init_request(struct request *req,
cmd->common.flags &= ~NVME_CMD_SGL_ALL;
req->cmd_flags |= REQ_FAILFAST_DRIVER;
+ if (req->mq_hctx->type == HCTX_TYPE_POLL)
+ req->cmd_flags |= REQ_HIPRI;
nvme_clear_nvme_request(req);
memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
}
@@ -701,9 +724,7 @@ EXPORT_SYMBOL_GPL(__nvme_check_ready);
static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
{
- struct nvme_command c;
-
- memset(&c, 0, sizeof(c));
+ struct nvme_command c = { };
c.directive.opcode = nvme_admin_directive_send;
c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
@@ -728,9 +749,8 @@ static int nvme_enable_streams(struct nvme_ctrl *ctrl)
static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
struct streams_directive_params *s, u32 nsid)
{
- struct nvme_command c;
+ struct nvme_command c = { };
- memset(&c, 0, sizeof(c));
memset(s, 0, sizeof(*s));
c.directive.opcode = nvme_admin_directive_recv;
@@ -880,7 +900,10 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->write_zeroes.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
- cmnd->write_zeroes.control = 0;
+ if (nvme_ns_has_pi(ns))
+ cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
+ else
+ cmnd->write_zeroes.control = 0;
return BLK_STS_OK;
}
@@ -1012,29 +1035,23 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
-static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
-{
- struct completion *waiting = rq->end_io_data;
-
- rq->end_io_data = NULL;
- complete(waiting);
-}
-
-static void nvme_execute_rq_polled(struct request_queue *q,
- struct gendisk *bd_disk, struct request *rq, int at_head)
+/*
+ * Return values:
+ * 0: success
+ * >0: nvme controller's cqe status response
+ * <0: kernel error in lieu of controller response
+ */
+static int nvme_execute_rq(struct gendisk *disk, struct request *rq,
+ bool at_head)
{
- DECLARE_COMPLETION_ONSTACK(wait);
-
- WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
-
- rq->cmd_flags |= REQ_HIPRI;
- rq->end_io_data = &wait;
- blk_execute_rq_nowait(bd_disk, rq, at_head, nvme_end_sync_rq);
+ blk_status_t status;
- while (!completion_done(&wait)) {
- blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
- cond_resched();
- }
+ status = blk_execute_rq(disk, rq, at_head);
+ if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
+ return -EINTR;
+ if (nvme_req(rq)->status)
+ return nvme_req(rq)->status;
+ return blk_status_to_errno(status);
}
/*
@@ -1044,7 +1061,7 @@ static void nvme_execute_rq_polled(struct request_queue *q,
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
unsigned timeout, int qid, int at_head,
- blk_mq_req_flags_t flags, bool poll)
+ blk_mq_req_flags_t flags)
{
struct request *req;
int ret;
@@ -1065,16 +1082,9 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
goto out;
}
- if (poll)
- nvme_execute_rq_polled(req->q, NULL, req, at_head);
- else
- blk_execute_rq(NULL, req, at_head);
- if (result)
+ ret = nvme_execute_rq(NULL, req, at_head);
+ if (result && ret >= 0)
*result = nvme_req(req)->result;
- if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
- ret = -EINTR;
- else
- ret = nvme_req(req)->status;
out:
blk_mq_free_request(req);
return ret;
@@ -1085,7 +1095,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buffer, unsigned bufflen)
{
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
- NVME_QID_ANY, 0, 0, false);
+ NVME_QID_ANY, 0, 0);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
@@ -1162,18 +1172,21 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
}
}
-void nvme_execute_passthru_rq(struct request *rq)
+int nvme_execute_passthru_rq(struct request *rq)
{
struct nvme_command *cmd = nvme_req(rq)->cmd;
struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
struct nvme_ns *ns = rq->q->queuedata;
struct gendisk *disk = ns ? ns->disk : NULL;
u32 effects;
+ int ret;
effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
- blk_execute_rq(disk, rq, 0);
+ ret = nvme_execute_rq(disk, rq, false);
if (effects) /* nothing to be done for zero cmd effects */
nvme_passthru_end(ctrl, effects);
+
+ return ret;
}
EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
@@ -1440,16 +1453,15 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen, u32 *result)
{
union nvme_result res = { 0 };
- struct nvme_command c;
+ struct nvme_command c = { };
int ret;
- memset(&c, 0, sizeof(c));
c.features.opcode = op;
c.features.fid = cpu_to_le32(fid);
c.features.dword11 = cpu_to_le32(dword11);
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
- buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
+ buffer, buflen, 0, NVME_QID_ANY, 0, 0);
if (ret >= 0 && result)
*result = le32_to_cpu(res.u32);
return ret;
@@ -1522,36 +1534,6 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl)
queue_work(nvme_wq, &ctrl->async_event_work);
}
-/*
- * Issue ioctl requests on the first available path. Note that unlike normal
- * block layer requests we will not retry failed request on another controller.
- */
-struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
- struct nvme_ns_head **head, int *srcu_idx)
-{
-#ifdef CONFIG_NVME_MULTIPATH
- if (disk->fops == &nvme_ns_head_ops) {
- struct nvme_ns *ns;
-
- *head = disk->private_data;
- *srcu_idx = srcu_read_lock(&(*head)->srcu);
- ns = nvme_find_path(*head);
- if (!ns)
- srcu_read_unlock(&(*head)->srcu, *srcu_idx);
- return ns;
- }
-#endif
- *head = NULL;
- *srcu_idx = -1;
- return disk->private_data;
-}
-
-void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
-{
- if (head)
- srcu_read_unlock(&head->srcu, idx);
-}
-
static int nvme_ns_open(struct nvme_ns *ns)
{
@@ -1601,9 +1583,8 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
u32 max_integrity_segments)
{
- struct blk_integrity integrity;
+ struct blk_integrity integrity = { };
- memset(&integrity, 0, sizeof(integrity));
switch (pi_type) {
case NVME_NS_DPS_PI_TYPE3:
integrity.profile = &t10_pi_type3_crc;
@@ -1948,30 +1929,45 @@ static char nvme_pr_type(enum pr_type type)
}
};
+static int nvme_send_ns_head_pr_command(struct block_device *bdev,
+ struct nvme_command *c, u8 data[16])
+{
+ struct nvme_ns_head *head = bdev->bd_disk->private_data;
+ int srcu_idx = srcu_read_lock(&head->srcu);
+ struct nvme_ns *ns = nvme_find_path(head);
+ int ret = -EWOULDBLOCK;
+
+ if (ns) {
+ c->common.nsid = cpu_to_le32(ns->head->ns_id);
+ ret = nvme_submit_sync_cmd(ns->queue, c, data, 16);
+ }
+ srcu_read_unlock(&head->srcu, srcu_idx);
+ return ret;
+}
+
+static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c,
+ u8 data[16])
+{
+ c->common.nsid = cpu_to_le32(ns->head->ns_id);
+ return nvme_submit_sync_cmd(ns->queue, c, data, 16);
+}
+
static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
u64 key, u64 sa_key, u8 op)
{
- struct nvme_ns_head *head = NULL;
- struct nvme_ns *ns;
- struct nvme_command c;
- int srcu_idx, ret;
+ struct nvme_command c = { };
u8 data[16] = { 0, };
- ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
- if (unlikely(!ns))
- return -EWOULDBLOCK;
-
put_unaligned_le64(key, &data[0]);
put_unaligned_le64(sa_key, &data[8]);
- memset(&c, 0, sizeof(c));
c.common.opcode = op;
- c.common.nsid = cpu_to_le32(ns->head->ns_id);
c.common.cdw10 = cpu_to_le32(cdw10);
- ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
- nvme_put_ns_from_disk(head, srcu_idx);
- return ret;
+ if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
+ bdev->bd_disk->fops == &nvme_ns_head_ops)
+ return nvme_send_ns_head_pr_command(bdev, &c, data);
+ return nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c, data);
}
static int nvme_pr_register(struct block_device *bdev, u64 old,
@@ -2036,9 +2032,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
bool send)
{
struct nvme_ctrl *ctrl = data;
- struct nvme_command cmd;
+ struct nvme_command cmd = { };
- memset(&cmd, 0, sizeof(cmd));
if (send)
cmd.common.opcode = nvme_admin_security_send;
else
@@ -2048,11 +2043,22 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
cmd.common.cdw11 = cpu_to_le32(len);
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0,
- NVME_QID_ANY, 1, 0, false);
+ NVME_QID_ANY, 1, 0);
}
EXPORT_SYMBOL_GPL(nvme_sec_submit);
#endif /* CONFIG_BLK_SED_OPAL */
+#ifdef CONFIG_BLK_DEV_ZONED
+static int nvme_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+ return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
+ data);
+}
+#else
+#define nvme_report_zones NULL
+#endif /* CONFIG_BLK_DEV_ZONED */
+
static const struct block_device_operations nvme_bdev_ops = {
.owner = THIS_MODULE,
.ioctl = nvme_ioctl,
@@ -2218,13 +2224,53 @@ static int nvme_configure_acre(struct nvme_ctrl *ctrl)
}
/*
+ * The function checks whether the given total (exlat + enlat) latency of
+ * a power state allows the latter to be used as an APST transition target.
+ * It does so by comparing the latency to the primary and secondary latency
+ * tolerances defined by module params. If there's a match, the corresponding
+ * timeout value is returned and the matching tolerance index (1 or 2) is
+ * reported.
+ */
+static bool nvme_apst_get_transition_time(u64 total_latency,
+ u64 *transition_time, unsigned *last_index)
+{
+ if (total_latency <= apst_primary_latency_tol_us) {
+ if (*last_index == 1)
+ return false;
+ *last_index = 1;
+ *transition_time = apst_primary_timeout_ms;
+ return true;
+ }
+ if (apst_secondary_timeout_ms &&
+ total_latency <= apst_secondary_latency_tol_us) {
+ if (*last_index <= 2)
+ return false;
+ *last_index = 2;
+ *transition_time = apst_secondary_timeout_ms;
+ return true;
+ }
+ return false;
+}
+
+/*
* APST (Autonomous Power State Transition) lets us program a table of power
* state transitions that the controller will perform automatically.
- * We configure it with a simple heuristic: we are willing to spend at most 2%
- * of the time transitioning between power states. Therefore, when running in
- * any given state, we will enter the next lower-power non-operational state
- * after waiting 50 * (enlat + exlat) microseconds, as long as that state's exit
- * latency is under the requested maximum latency.
+ *
+ * Depending on module params, one of the two supported techniques will be used:
+ *
+ * - If the parameters provide explicit timeouts and tolerances, they will be
+ * used to build a table with up to 2 non-operational states to transition to.
+ * The default parameter values were selected based on the values used by
+ * Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
+ * regeneration of the APST table in the event of switching between external
+ * and battery power, the timeouts and tolerances reflect a compromise
+ * between values used by Microsoft for AC and battery scenarios.
+ * - If not, we'll configure the table with a simple heuristic: we are willing
+ * to spend at most 2% of the time transitioning between power states.
+ * Therefore, when running in any given state, we will enter the next
+ * lower-power non-operational state after waiting 50 * (enlat + exlat)
+ * microseconds, as long as that state's exit latency is under the requested
+ * maximum latency.
*
* We will not autonomously enter any non-operational state for which the total
* latency exceeds ps_max_latency_us.
@@ -2240,6 +2286,7 @@ static int nvme_configure_apst(struct nvme_ctrl *ctrl)
int max_ps = -1;
int state;
int ret;
+ unsigned last_lt_index = UINT_MAX;
/*
* If APST isn't supported or if we haven't been initialized yet,
@@ -2298,13 +2345,19 @@ static int nvme_configure_apst(struct nvme_ctrl *ctrl)
le32_to_cpu(ctrl->psd[state].entry_lat);
/*
- * This state is good. Use it as the APST idle target for
- * higher power states.
+ * This state is good. It can be used as the APST idle target
+ * for higher power states.
*/
- transition_ms = total_latency_us + 19;
- do_div(transition_ms, 20);
- if (transition_ms > (1 << 24) - 1)
- transition_ms = (1 << 24) - 1;
+ if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
+ if (!nvme_apst_get_transition_time(total_latency_us,
+ &transition_ms, &last_lt_index))
+ continue;
+ } else {
+ transition_ms = total_latency_us + 19;
+ do_div(transition_ms, 20);
+ if (transition_ms > (1 << 24) - 1)
+ transition_ms = (1 << 24) - 1;
+ }
target = cpu_to_le64((state << 3) | (transition_ms << 8));
if (max_ps == -1)
@@ -3485,8 +3538,10 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
cdev_init(cdev, fops);
cdev->owner = owner;
ret = cdev_device_add(cdev, cdev_device);
- if (ret)
+ if (ret) {
+ put_device(cdev_device);
ida_simple_remove(&nvme_ns_chr_minor_ida, minor);
+ }
return ret;
}
@@ -3699,7 +3754,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
disk->fops = &nvme_bdev_ops;
disk->private_data = ns;
disk->queue = ns->queue;
- disk->flags = GENHD_FL_EXT_DEVT;
/*
* Without the multipath code enabled, multiple controller per
* subsystems are visible as devices and thus we cannot use the
@@ -3756,6 +3810,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
static void nvme_ns_remove(struct nvme_ns *ns)
{
+ bool last_path = false;
+
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
@@ -3764,8 +3820,6 @@ static void nvme_ns_remove(struct nvme_ns *ns)
mutex_lock(&ns->ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
- if (list_empty(&ns->head->list))
- list_del_init(&ns->head->entry);
mutex_unlock(&ns->ctrl->subsys->lock);
synchronize_rcu(); /* guarantee not available in head->list */
@@ -3785,7 +3839,15 @@ static void nvme_ns_remove(struct nvme_ns *ns)
list_del_init(&ns->list);
up_write(&ns->ctrl->namespaces_rwsem);
- nvme_mpath_check_last_path(ns);
+ /* Synchronize with nvme_init_ns_head() */
+ mutex_lock(&ns->head->subsys->lock);
+ if (list_empty(&ns->head->list)) {
+ list_del_init(&ns->head->entry);
+ last_path = true;
+ }
+ mutex_unlock(&ns->head->subsys->lock);
+ if (last_path)
+ nvme_mpath_shutdown_disk(ns->head);
nvme_put_ns(ns);
}
@@ -4067,6 +4129,11 @@ static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
opts->host_traddr ?: "none");
+ if (ret)
+ return ret;
+
+ ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
+ opts->host_iface ?: "none");
}
return ret;
}