diff options
Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r-- | drivers/nvme/host/core.c | 281 |
1 files changed, 174 insertions, 107 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 762125f2905f..dfd9dec0c1f6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -57,6 +57,26 @@ static bool force_apst; module_param(force_apst, bool, 0644); MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off"); +static unsigned long apst_primary_timeout_ms = 100; +module_param(apst_primary_timeout_ms, ulong, 0644); +MODULE_PARM_DESC(apst_primary_timeout_ms, + "primary APST timeout in ms"); + +static unsigned long apst_secondary_timeout_ms = 2000; +module_param(apst_secondary_timeout_ms, ulong, 0644); +MODULE_PARM_DESC(apst_secondary_timeout_ms, + "secondary APST timeout in ms"); + +static unsigned long apst_primary_latency_tol_us = 15000; +module_param(apst_primary_latency_tol_us, ulong, 0644); +MODULE_PARM_DESC(apst_primary_latency_tol_us, + "primary APST latency tolerance in us"); + +static unsigned long apst_secondary_latency_tol_us = 100000; +module_param(apst_secondary_latency_tol_us, ulong, 0644); +MODULE_PARM_DESC(apst_secondary_latency_tol_us, + "secondary APST latency tolerance in us"); + static bool streams; module_param(streams, bool, 0644); MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); @@ -589,6 +609,7 @@ EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); static inline void nvme_clear_nvme_request(struct request *req) { + nvme_req(req)->status = 0; nvme_req(req)->retries = 0; nvme_req(req)->flags = 0; req->rq_flags |= RQF_DONTPREP; @@ -611,6 +632,8 @@ static inline void nvme_init_request(struct request *req, cmd->common.flags &= ~NVME_CMD_SGL_ALL; req->cmd_flags |= REQ_FAILFAST_DRIVER; + if (req->mq_hctx->type == HCTX_TYPE_POLL) + req->cmd_flags |= REQ_HIPRI; nvme_clear_nvme_request(req); memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd)); } @@ -701,9 +724,7 @@ EXPORT_SYMBOL_GPL(__nvme_check_ready); static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) { - struct nvme_command c; - - memset(&c, 0, sizeof(c)); + struct nvme_command c = { }; c.directive.opcode = nvme_admin_directive_send; c.directive.nsid = cpu_to_le32(NVME_NSID_ALL); @@ -728,9 +749,8 @@ static int nvme_enable_streams(struct nvme_ctrl *ctrl) static int nvme_get_stream_params(struct nvme_ctrl *ctrl, struct streams_directive_params *s, u32 nsid) { - struct nvme_command c; + struct nvme_command c = { }; - memset(&c, 0, sizeof(c)); memset(s, 0, sizeof(*s)); c.directive.opcode = nvme_admin_directive_recv; @@ -880,7 +900,10 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->write_zeroes.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); - cmnd->write_zeroes.control = 0; + if (nvme_ns_has_pi(ns)) + cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT); + else + cmnd->write_zeroes.control = 0; return BLK_STS_OK; } @@ -1012,29 +1035,23 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) } EXPORT_SYMBOL_GPL(nvme_setup_cmd); -static void nvme_end_sync_rq(struct request *rq, blk_status_t error) -{ - struct completion *waiting = rq->end_io_data; - - rq->end_io_data = NULL; - complete(waiting); -} - -static void nvme_execute_rq_polled(struct request_queue *q, - struct gendisk *bd_disk, struct request *rq, int at_head) +/* + * Return values: + * 0: success + * >0: nvme controller's cqe status response + * <0: kernel error in lieu of controller response + */ +static int nvme_execute_rq(struct gendisk *disk, struct request *rq, + bool at_head) { - DECLARE_COMPLETION_ONSTACK(wait); - - WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)); - - rq->cmd_flags |= REQ_HIPRI; - rq->end_io_data = &wait; - blk_execute_rq_nowait(bd_disk, rq, at_head, nvme_end_sync_rq); + blk_status_t status; - while (!completion_done(&wait)) { - blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true); - cond_resched(); - } + status = blk_execute_rq(disk, rq, at_head); + if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) + return -EINTR; + if (nvme_req(rq)->status) + return nvme_req(rq)->status; + return blk_status_to_errno(status); } /* @@ -1044,7 +1061,7 @@ static void nvme_execute_rq_polled(struct request_queue *q, int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, unsigned timeout, int qid, int at_head, - blk_mq_req_flags_t flags, bool poll) + blk_mq_req_flags_t flags) { struct request *req; int ret; @@ -1065,16 +1082,9 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, goto out; } - if (poll) - nvme_execute_rq_polled(req->q, NULL, req, at_head); - else - blk_execute_rq(NULL, req, at_head); - if (result) + ret = nvme_execute_rq(NULL, req, at_head); + if (result && ret >= 0) *result = nvme_req(req)->result; - if (nvme_req(req)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - else - ret = nvme_req(req)->status; out: blk_mq_free_request(req); return ret; @@ -1085,7 +1095,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buffer, unsigned bufflen) { return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, - NVME_QID_ANY, 0, 0, false); + NVME_QID_ANY, 0, 0); } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); @@ -1162,18 +1172,21 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) } } -void nvme_execute_passthru_rq(struct request *rq) +int nvme_execute_passthru_rq(struct request *rq) { struct nvme_command *cmd = nvme_req(rq)->cmd; struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; struct nvme_ns *ns = rq->q->queuedata; struct gendisk *disk = ns ? ns->disk : NULL; u32 effects; + int ret; effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); - blk_execute_rq(disk, rq, 0); + ret = nvme_execute_rq(disk, rq, false); if (effects) /* nothing to be done for zero cmd effects */ nvme_passthru_end(ctrl, effects); + + return ret; } EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); @@ -1440,16 +1453,15 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, u32 *result) { union nvme_result res = { 0 }; - struct nvme_command c; + struct nvme_command c = { }; int ret; - memset(&c, 0, sizeof(c)); c.features.opcode = op; c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, - buffer, buflen, 0, NVME_QID_ANY, 0, 0, false); + buffer, buflen, 0, NVME_QID_ANY, 0, 0); if (ret >= 0 && result) *result = le32_to_cpu(res.u32); return ret; @@ -1522,36 +1534,6 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) queue_work(nvme_wq, &ctrl->async_event_work); } -/* - * Issue ioctl requests on the first available path. Note that unlike normal - * block layer requests we will not retry failed request on another controller. - */ -struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, - struct nvme_ns_head **head, int *srcu_idx) -{ -#ifdef CONFIG_NVME_MULTIPATH - if (disk->fops == &nvme_ns_head_ops) { - struct nvme_ns *ns; - - *head = disk->private_data; - *srcu_idx = srcu_read_lock(&(*head)->srcu); - ns = nvme_find_path(*head); - if (!ns) - srcu_read_unlock(&(*head)->srcu, *srcu_idx); - return ns; - } -#endif - *head = NULL; - *srcu_idx = -1; - return disk->private_data; -} - -void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) -{ - if (head) - srcu_read_unlock(&head->srcu, idx); -} - static int nvme_ns_open(struct nvme_ns *ns) { @@ -1601,9 +1583,8 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, u32 max_integrity_segments) { - struct blk_integrity integrity; + struct blk_integrity integrity = { }; - memset(&integrity, 0, sizeof(integrity)); switch (pi_type) { case NVME_NS_DPS_PI_TYPE3: integrity.profile = &t10_pi_type3_crc; @@ -1948,30 +1929,45 @@ static char nvme_pr_type(enum pr_type type) } }; +static int nvme_send_ns_head_pr_command(struct block_device *bdev, + struct nvme_command *c, u8 data[16]) +{ + struct nvme_ns_head *head = bdev->bd_disk->private_data; + int srcu_idx = srcu_read_lock(&head->srcu); + struct nvme_ns *ns = nvme_find_path(head); + int ret = -EWOULDBLOCK; + + if (ns) { + c->common.nsid = cpu_to_le32(ns->head->ns_id); + ret = nvme_submit_sync_cmd(ns->queue, c, data, 16); + } + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} + +static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c, + u8 data[16]) +{ + c->common.nsid = cpu_to_le32(ns->head->ns_id); + return nvme_submit_sync_cmd(ns->queue, c, data, 16); +} + static int nvme_pr_command(struct block_device *bdev, u32 cdw10, u64 key, u64 sa_key, u8 op) { - struct nvme_ns_head *head = NULL; - struct nvme_ns *ns; - struct nvme_command c; - int srcu_idx, ret; + struct nvme_command c = { }; u8 data[16] = { 0, }; - ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); - if (unlikely(!ns)) - return -EWOULDBLOCK; - put_unaligned_le64(key, &data[0]); put_unaligned_le64(sa_key, &data[8]); - memset(&c, 0, sizeof(c)); c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->head->ns_id); c.common.cdw10 = cpu_to_le32(cdw10); - ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); - nvme_put_ns_from_disk(head, srcu_idx); - return ret; + if (IS_ENABLED(CONFIG_NVME_MULTIPATH) && + bdev->bd_disk->fops == &nvme_ns_head_ops) + return nvme_send_ns_head_pr_command(bdev, &c, data); + return nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c, data); } static int nvme_pr_register(struct block_device *bdev, u64 old, @@ -2036,9 +2032,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) { struct nvme_ctrl *ctrl = data; - struct nvme_command cmd; + struct nvme_command cmd = { }; - memset(&cmd, 0, sizeof(cmd)); if (send) cmd.common.opcode = nvme_admin_security_send; else @@ -2048,11 +2043,22 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, cmd.common.cdw11 = cpu_to_le32(len); return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0, - NVME_QID_ANY, 1, 0, false); + NVME_QID_ANY, 1, 0); } EXPORT_SYMBOL_GPL(nvme_sec_submit); #endif /* CONFIG_BLK_SED_OPAL */ +#ifdef CONFIG_BLK_DEV_ZONED +static int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb, + data); +} +#else +#define nvme_report_zones NULL +#endif /* CONFIG_BLK_DEV_ZONED */ + static const struct block_device_operations nvme_bdev_ops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, @@ -2218,13 +2224,53 @@ static int nvme_configure_acre(struct nvme_ctrl *ctrl) } /* + * The function checks whether the given total (exlat + enlat) latency of + * a power state allows the latter to be used as an APST transition target. + * It does so by comparing the latency to the primary and secondary latency + * tolerances defined by module params. If there's a match, the corresponding + * timeout value is returned and the matching tolerance index (1 or 2) is + * reported. + */ +static bool nvme_apst_get_transition_time(u64 total_latency, + u64 *transition_time, unsigned *last_index) +{ + if (total_latency <= apst_primary_latency_tol_us) { + if (*last_index == 1) + return false; + *last_index = 1; + *transition_time = apst_primary_timeout_ms; + return true; + } + if (apst_secondary_timeout_ms && + total_latency <= apst_secondary_latency_tol_us) { + if (*last_index <= 2) + return false; + *last_index = 2; + *transition_time = apst_secondary_timeout_ms; + return true; + } + return false; +} + +/* * APST (Autonomous Power State Transition) lets us program a table of power * state transitions that the controller will perform automatically. - * We configure it with a simple heuristic: we are willing to spend at most 2% - * of the time transitioning between power states. Therefore, when running in - * any given state, we will enter the next lower-power non-operational state - * after waiting 50 * (enlat + exlat) microseconds, as long as that state's exit - * latency is under the requested maximum latency. + * + * Depending on module params, one of the two supported techniques will be used: + * + * - If the parameters provide explicit timeouts and tolerances, they will be + * used to build a table with up to 2 non-operational states to transition to. + * The default parameter values were selected based on the values used by + * Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic + * regeneration of the APST table in the event of switching between external + * and battery power, the timeouts and tolerances reflect a compromise + * between values used by Microsoft for AC and battery scenarios. + * - If not, we'll configure the table with a simple heuristic: we are willing + * to spend at most 2% of the time transitioning between power states. + * Therefore, when running in any given state, we will enter the next + * lower-power non-operational state after waiting 50 * (enlat + exlat) + * microseconds, as long as that state's exit latency is under the requested + * maximum latency. * * We will not autonomously enter any non-operational state for which the total * latency exceeds ps_max_latency_us. @@ -2240,6 +2286,7 @@ static int nvme_configure_apst(struct nvme_ctrl *ctrl) int max_ps = -1; int state; int ret; + unsigned last_lt_index = UINT_MAX; /* * If APST isn't supported or if we haven't been initialized yet, @@ -2298,13 +2345,19 @@ static int nvme_configure_apst(struct nvme_ctrl *ctrl) le32_to_cpu(ctrl->psd[state].entry_lat); /* - * This state is good. Use it as the APST idle target for - * higher power states. + * This state is good. It can be used as the APST idle target + * for higher power states. */ - transition_ms = total_latency_us + 19; - do_div(transition_ms, 20); - if (transition_ms > (1 << 24) - 1) - transition_ms = (1 << 24) - 1; + if (apst_primary_timeout_ms && apst_primary_latency_tol_us) { + if (!nvme_apst_get_transition_time(total_latency_us, + &transition_ms, &last_lt_index)) + continue; + } else { + transition_ms = total_latency_us + 19; + do_div(transition_ms, 20); + if (transition_ms > (1 << 24) - 1) + transition_ms = (1 << 24) - 1; + } target = cpu_to_le64((state << 3) | (transition_ms << 8)); if (max_ps == -1) @@ -3485,8 +3538,10 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, cdev_init(cdev, fops); cdev->owner = owner; ret = cdev_device_add(cdev, cdev_device); - if (ret) + if (ret) { + put_device(cdev_device); ida_simple_remove(&nvme_ns_chr_minor_ida, minor); + } return ret; } @@ -3699,7 +3754,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, disk->fops = &nvme_bdev_ops; disk->private_data = ns; disk->queue = ns->queue; - disk->flags = GENHD_FL_EXT_DEVT; /* * Without the multipath code enabled, multiple controller per * subsystems are visible as devices and thus we cannot use the @@ -3756,6 +3810,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, static void nvme_ns_remove(struct nvme_ns *ns) { + bool last_path = false; + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; @@ -3764,8 +3820,6 @@ static void nvme_ns_remove(struct nvme_ns *ns) mutex_lock(&ns->ctrl->subsys->lock); list_del_rcu(&ns->siblings); - if (list_empty(&ns->head->list)) - list_del_init(&ns->head->entry); mutex_unlock(&ns->ctrl->subsys->lock); synchronize_rcu(); /* guarantee not available in head->list */ @@ -3785,7 +3839,15 @@ static void nvme_ns_remove(struct nvme_ns *ns) list_del_init(&ns->list); up_write(&ns->ctrl->namespaces_rwsem); - nvme_mpath_check_last_path(ns); + /* Synchronize with nvme_init_ns_head() */ + mutex_lock(&ns->head->subsys->lock); + if (list_empty(&ns->head->list)) { + list_del_init(&ns->head->entry); + last_path = true; + } + mutex_unlock(&ns->head->subsys->lock); + if (last_path) + nvme_mpath_shutdown_disk(ns->head); nvme_put_ns(ns); } @@ -4067,6 +4129,11 @@ static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", opts->host_traddr ?: "none"); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_HOST_IFACE=%s", + opts->host_iface ?: "none"); } return ret; } |