Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r--  drivers/nvme/host/core.c | 361
1 file changed, 217 insertions(+), 144 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 88b54cdcbd68..85ab0fcf9e88 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -20,6 +20,7 @@
 #include <linux/ptrace.h>
 #include <linux/nvme_ioctl.h>
 #include <linux/pm_qos.h>
+#include <linux/ratelimit.h>
 #include <asm/unaligned.h>
 
 #include "nvme.h"
@@ -131,7 +132,7 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl)
 	/*
 	 * Only new queue scan work when admin and IO queues are both alive
 	 */
-	if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
+	if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
 		queue_work(nvme_wq, &ctrl->scan_work);
 }
 
@@ -143,7 +144,7 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl)
  */
 int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
 {
-	if (ctrl->state != NVME_CTRL_RESETTING)
+	if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
 		return -EBUSY;
 	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
 		return -EBUSY;
@@ -156,7 +157,7 @@ static void nvme_failfast_work(struct work_struct *work)
 	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
 			struct nvme_ctrl, failfast_work);
 
-	if (ctrl->state != NVME_CTRL_CONNECTING)
+	if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
 		return;
 
 	set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
@@ -200,7 +201,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
 	ret = nvme_reset_ctrl(ctrl);
 	if (!ret) {
 		flush_work(&ctrl->reset_work);
-		if (ctrl->state != NVME_CTRL_LIVE)
+		if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
 			ret = -ENETRESET;
 	}
 
@@ -312,12 +313,12 @@ static void nvme_log_error(struct request *req)
 	struct nvme_request *nr = nvme_req(req);
 
 	if (ns) {
-		pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
+		pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
 		       ns->disk ? ns->disk->disk_name : "?",
 		       nvme_get_opcode_str(nr->cmd->common.opcode),
 		       nr->cmd->common.opcode,
-		       (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
-		       (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
+		       nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
+		       blk_rq_bytes(req) >> ns->head->lba_shift,
 		       nvme_get_error_status_str(nr->status),
 		       nr->status >> 8 & 7,	/* Status Code Type */
 		       nr->status & 0xff,	/* Status Code */
@@ -372,9 +373,12 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 static inline void nvme_end_req_zoned(struct request *req)
 {
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
-	    req_op(req) == REQ_OP_ZONE_APPEND)
-		req->__sector = nvme_lba_to_sect(req->q->queuedata,
+	    req_op(req) == REQ_OP_ZONE_APPEND) {
+		struct nvme_ns *ns = req->q->queuedata;
+
+		req->__sector = nvme_lba_to_sect(ns->head,
 			le64_to_cpu(nvme_req(req)->result.u64));
+	}
 }
 
 static inline void nvme_end_req(struct request *req)
@@ -482,7 +486,6 @@ EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
 
 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
 {
-	nvme_stop_keep_alive(ctrl);
 	if (ctrl->admin_tagset) {
 		blk_mq_tagset_busy_iter(ctrl->admin_tagset,
 				nvme_cancel_request, ctrl);
@@ -500,7 +503,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 
 	spin_lock_irqsave(&ctrl->lock, flags);
 
-	old_state = ctrl->state;
+	old_state = nvme_ctrl_state(ctrl);
 	switch (new_state) {
 	case NVME_CTRL_LIVE:
 		switch (old_state) {
@@ -568,7 +571,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 	}
 
 	if (changed) {
-		ctrl->state = new_state;
+		WRITE_ONCE(ctrl->state, new_state);
 		wake_up_all(&ctrl->state_wq);
 	}
 
@@ -576,11 +579,11 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 	if (!changed)
 		return false;
 
-	if (ctrl->state == NVME_CTRL_LIVE) {
+	if (new_state == NVME_CTRL_LIVE) {
 		if (old_state == NVME_CTRL_CONNECTING)
 			nvme_stop_failfast_work(ctrl);
 		nvme_kick_requeue_lists(ctrl);
-	} else if (ctrl->state == NVME_CTRL_CONNECTING &&
+	} else if (new_state == NVME_CTRL_CONNECTING &&
 		old_state == NVME_CTRL_RESETTING) {
 		nvme_start_failfast_work(ctrl);
 	}
@@ -593,7 +596,7 @@ EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
  */
 static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
 {
-	switch (ctrl->state) {
+	switch (nvme_ctrl_state(ctrl)) {
 	case NVME_CTRL_NEW:
 	case NVME_CTRL_LIVE:
 	case NVME_CTRL_RESETTING:
@@ -618,7 +621,7 @@ bool nvme_wait_reset(struct nvme_ctrl *ctrl)
 	wait_event(ctrl->state_wq,
 		   nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
 		   nvme_state_terminal(ctrl));
-	return ctrl->state == NVME_CTRL_RESETTING;
+	return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
 }
 EXPORT_SYMBOL_GPL(nvme_wait_reset);
 
@@ -705,9 +708,11 @@ EXPORT_SYMBOL_GPL(nvme_init_request);
 blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
 		struct request *rq)
 {
-	if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
-	    ctrl->state != NVME_CTRL_DELETING &&
-	    ctrl->state != NVME_CTRL_DEAD &&
+	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
+
+	if (state != NVME_CTRL_DELETING_NOIO &&
+	    state != NVME_CTRL_DELETING &&
+	    state != NVME_CTRL_DEAD &&
 	    !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
 	    !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
 		return BLK_STS_RESOURCE;
@@ -737,7 +742,7 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 	 * command, which is require to set the queue live in the
 	 * appropinquate states.
 	 */
-	switch (ctrl->state) {
+	switch (nvme_ctrl_state(ctrl)) {
 	case NVME_CTRL_CONNECTING:
 		if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
 		    (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
@@ -792,8 +797,8 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	}
 
 	if (queue_max_discard_segments(req->q) == 1) {
-		u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req));
-		u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9);
+		u64 slba = nvme_sect_to_lba(ns->head, blk_rq_pos(req));
+		u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
 
 		range[0].cattr = cpu_to_le32(0);
 		range[0].nlb = cpu_to_le32(nlb);
@@ -801,8 +806,9 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 		n = 1;
 	} else {
 		__rq_for_each_bio(bio, req) {
-			u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
-			u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
+			u64 slba = nvme_sect_to_lba(ns->head,
+						    bio->bi_iter.bi_sector);
+			u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
 
 			if (n < segments) {
 				range[n].cattr = cpu_to_le32(0);
@@ -840,7 +846,7 @@ static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
 	u64 ref48;
 
 	/* both rw and write zeroes share the same reftag format */
-	switch (ns->guard_type) {
+	switch (ns->head->guard_type) {
 	case NVME_NVM_NS_16B_GUARD:
 		cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
 		break;
@@ -868,17 +874,18 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 	cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
 	cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
 	cmnd->write_zeroes.slba =
-		cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
+		cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
 	cmnd->write_zeroes.length =
-		cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+		cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
 
-	if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
+	if (!(req->cmd_flags & REQ_NOUNMAP) &&
+	    (ns->head->features & NVME_NS_DEAC))
 		cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
 
-	if (nvme_ns_has_pi(ns)) {
+	if (nvme_ns_has_pi(ns->head)) {
 		cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
 
-		switch (ns->pi_type) {
+		switch (ns->head->pi_type) {
 		case NVME_NS_DPS_PI_TYPE1:
 		case NVME_NS_DPS_PI_TYPE2:
 			nvme_set_ref_tag(ns, cmnd, req);
@@ -910,13 +917,15 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 	cmnd->rw.cdw2 = 0;
 	cmnd->rw.cdw3 = 0;
 	cmnd->rw.metadata = 0;
-	cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
-	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+	cmnd->rw.slba =
+		cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
+	cmnd->rw.length =
+		cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
 	cmnd->rw.reftag = 0;
 	cmnd->rw.apptag = 0;
 	cmnd->rw.appmask = 0;
 
-	if (ns->ms) {
+	if (ns->head->ms) {
 		/*
 		 * If formated with metadata, the block layer always provides a
 		 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
@@ -924,12 +933,12 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 		 * namespace capacity to zero to prevent any I/O.
 		 */
 		if (!blk_integrity_rq(req)) {
-			if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
+			if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
 				return BLK_STS_NOTSUPP;
 			control |= NVME_RW_PRINFO_PRACT;
 		}
 
-		switch (ns->pi_type) {
+		switch (ns->head->pi_type) {
 		case NVME_NS_DPS_PI_TYPE3:
 			control |= NVME_RW_PRINFO_PRCHK_GUARD;
 			break;
@@ -1193,8 +1202,16 @@ static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
 
 static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
 {
-	queue_delayed_work(nvme_wq, &ctrl->ka_work,
-			   nvme_keep_alive_work_period(ctrl));
+	unsigned long now = jiffies;
+	unsigned long delay = nvme_keep_alive_work_period(ctrl);
+	unsigned long ka_next_check_tm = ctrl->ka_last_check_time + delay;
+
+	if (time_after(now, ka_next_check_tm))
+		delay = 0;
+	else
+		delay = ka_next_check_tm - now;
+
+	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
 }
 
 static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
@@ -1443,7 +1460,7 @@ free_data:
 	return status;
 }
 
-static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 		struct nvme_id_ns **id)
 {
 	struct nvme_command c = { };
@@ -1480,7 +1497,8 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
 	if (id->ncap == 0) {
 		/* namespace not allocated or attached */
 		info->is_removed = true;
-		return -ENODEV;
+		ret = -ENODEV;
+		goto error;
 	}
 
 	info->anagrpid = id->anagrpid;
@@ -1498,8 +1516,10 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
 		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
 			memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
 	}
+
+error:
 	kfree(id);
-	return 0;
+	return ret;
 }
 
 static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
@@ -1659,14 +1679,14 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 }
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
-				u32 max_integrity_segments)
+static void nvme_init_integrity(struct gendisk *disk,
+		struct nvme_ns_head *head, u32 max_integrity_segments)
 {
 	struct blk_integrity integrity = { };
 
-	switch (ns->pi_type) {
+	switch (head->pi_type) {
 	case NVME_NS_DPS_PI_TYPE3:
-		switch (ns->guard_type) {
+		switch (head->guard_type) {
 		case NVME_NVM_NS_16B_GUARD:
 			integrity.profile = &t10_pi_type3_crc;
 			integrity.tag_size = sizeof(u16) + sizeof(u32);
@@ -1684,7 +1704,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
 		break;
 	case NVME_NS_DPS_PI_TYPE1:
 	case NVME_NS_DPS_PI_TYPE2:
-		switch (ns->guard_type) {
+		switch (head->guard_type) {
 		case NVME_NVM_NS_16B_GUARD:
 			integrity.profile = &t10_pi_type1_crc;
 			integrity.tag_size = sizeof(u16);
@@ -1705,27 +1725,28 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
 		break;
 	}
 
-	integrity.tuple_size = ns->ms;
+	integrity.tuple_size = head->ms;
 	blk_integrity_register(disk, &integrity);
 	blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
 }
 #else
-static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
-				u32 max_integrity_segments)
+static void nvme_init_integrity(struct gendisk *disk,
+		struct nvme_ns_head *head, u32 max_integrity_segments)
 {
 }
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
+static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
+		struct nvme_ns_head *head)
 {
-	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct request_queue *queue = disk->queue;
-	u32 size = queue_logical_block_size(queue);
+	u32 max_discard_sectors;
 
-	if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX))
-		ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl);
-
-	if (ctrl->max_discard_sectors == 0) {
+	if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) {
+		max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl);
+	} else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
+		max_discard_sectors = UINT_MAX;
+	} else {
 		blk_queue_max_discard_sectors(queue, 0);
 		return;
 	}
@@ -1733,14 +1754,22 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
 	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
 			NVME_DSM_MAX_RANGES);
 
-	queue->limits.discard_granularity = size;
-
-	/* If discard is already enabled, don't reset queue limits */
+	/*
+	 * If discard is already enabled, don't reset queue limits.
+	 *
+	 * This works around the fact that the block layer can't cope well with
+	 * updating the hardware limits when overridden through sysfs. This is
+	 * harmless because discard limits in NVMe are purely advisory.
+	 */
 	if (queue->limits.max_discard_sectors)
 		return;
 
-	blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
-	blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
+	blk_queue_max_discard_sectors(queue, max_discard_sectors);
+	if (ctrl->dmrl)
+		blk_queue_max_discard_segments(queue, ctrl->dmrl);
+	else
+		blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
+	queue->limits.discard_granularity = queue_logical_block_size(queue);
 
 	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
 		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
@@ -1754,21 +1783,21 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
 		a->csi == b->csi;
 }
 
-static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
+static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
+		struct nvme_id_ns *id)
 {
 	bool first = id->dps & NVME_NS_DPS_PI_FIRST;
 	unsigned lbaf = nvme_lbaf_index(id->flbas);
-	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct nvme_command c = { };
 	struct nvme_id_ns_nvm *nvm;
 	int ret = 0;
 	u32 elbaf;
 
-	ns->pi_size = 0;
-	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+	head->pi_size = 0;
+	head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
 	if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
-		ns->pi_size = sizeof(struct t10_pi_tuple);
-		ns->guard_type = NVME_NVM_NS_16B_GUARD;
+		head->pi_size = sizeof(struct t10_pi_tuple);
+		head->guard_type = NVME_NVM_NS_16B_GUARD;
 		goto set_pi;
 	}
@@ -1777,11 +1806,11 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
 		return -ENOMEM;
 
 	c.identify.opcode = nvme_admin_identify;
-	c.identify.nsid = cpu_to_le32(ns->head->ns_id);
+	c.identify.nsid = cpu_to_le32(head->ns_id);
 	c.identify.cns = NVME_ID_CNS_CS_NS;
 	c.identify.csi = NVME_CSI_NVM;
 
-	ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, nvm, sizeof(*nvm));
+	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
 	if (ret)
 		goto free_data;
 
@@ -1791,13 +1820,13 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
 	if (nvme_elbaf_sts(elbaf))
 		goto free_data;
 
-	ns->guard_type = nvme_elbaf_guard_type(elbaf);
-	switch (ns->guard_type) {
+	head->guard_type = nvme_elbaf_guard_type(elbaf);
+	switch (head->guard_type) {
 	case NVME_NVM_NS_64B_GUARD:
-		ns->pi_size = sizeof(struct crc64_pi_tuple);
+		head->pi_size = sizeof(struct crc64_pi_tuple);
 		break;
 	case NVME_NVM_NS_16B_GUARD:
-		ns->pi_size = sizeof(struct t10_pi_tuple);
+		head->pi_size = sizeof(struct t10_pi_tuple);
 		break;
 	default:
 		break;
@@ -1806,24 +1835,26 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
 free_data:
 	kfree(nvm);
 set_pi:
-	if (ns->pi_size && (first || ns->ms == ns->pi_size))
-		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+	if (head->pi_size && (first || head->ms == head->pi_size))
+		head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
 	else
-		ns->pi_type = 0;
+		head->pi_type = 0;
 
 	return ret;
 }
 
-static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
+static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
+		struct nvme_ns_head *head, struct nvme_id_ns *id)
 {
-	struct nvme_ctrl *ctrl = ns->ctrl;
+	int ret;
 
-	if (nvme_init_ms(ns, id))
-		return;
+	ret = nvme_init_ms(ctrl, head, id);
+	if (ret)
+		return ret;
 
-	ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
-	if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
-		return;
+	head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+	if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+		return 0;
 
 	if (ctrl->ops->flags & NVME_F_FABRICS) {
 		/*
@@ -1832,9 +1863,9 @@ static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
 		 * remap the separate metadata buffer from the block layer.
 		 */
 		if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
-			return;
+			return 0;
 
-		ns->features |= NVME_NS_EXT_LBAS;
+		head->features |= NVME_NS_EXT_LBAS;
 
 		/*
 		 * The current fabrics transport drivers support namespace
@@ -1845,8 +1876,8 @@ static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
 		 * Note, this check will need to be modified if any drivers
 		 * gain the ability to use other metadata formats.
 		 */
-		if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
-			ns->features |= NVME_NS_METADATA_SUPPORTED;
+		if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
+			head->features |= NVME_NS_METADATA_SUPPORTED;
 	} else {
 		/*
 		 * For PCIe controllers, we can't easily remap the separate
@@ -1855,10 +1886,11 @@ static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
 		 * We allow extended LBAs for the passthrough interface, though.
 		 */
 		if (id->flbas & NVME_NS_FLBAS_META_EXT)
-			ns->features |= NVME_NS_EXT_LBAS;
+			head->features |= NVME_NS_EXT_LBAS;
 		else
-			ns->features |= NVME_NS_METADATA_SUPPORTED;
+			head->features |= NVME_NS_METADATA_SUPPORTED;
 	}
+	return 0;
 }
 
 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
@@ -1879,18 +1911,19 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 	blk_queue_write_cache(q, vwc, vwc);
 }
 
-static void nvme_update_disk_info(struct gendisk *disk,
-		struct nvme_ns *ns, struct nvme_id_ns *id)
+static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
+		struct nvme_ns_head *head, struct nvme_id_ns *id)
 {
-	sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
-	u32 bs = 1U << ns->lba_shift;
+	sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze));
+	u32 bs = 1U << head->lba_shift;
 	u32 atomic_bs, phys_bs, io_opt = 0;
 
 	/*
 	 * The block layer can't support LBA sizes larger than the page size
-	 * yet, so catch this early and don't allow block I/O.
+	 * or smaller than a sector size yet, so catch this early and don't
+	 * allow block I/O.
 	 */
-	if (ns->lba_shift > PAGE_SHIFT) {
+	if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
 		capacity = 0;
 		bs = (1 << 9);
 	}
@@ -1907,7 +1940,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
 		if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
 			atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
 		else
-			atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
+			atomic_bs = (1 + ctrl->subsys->awupf) * bs;
 	}
 
 	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
@@ -1933,20 +1966,20 @@ static void nvme_update_disk_info(struct gendisk *disk,
 	 * I/O to namespaces with metadata except when the namespace supports
 	 * PI, as it can strip/insert in that case.
 	 */
-	if (ns->ms) {
+	if (head->ms) {
 		if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
-		    (ns->features & NVME_NS_METADATA_SUPPORTED))
-			nvme_init_integrity(disk, ns,
-					    ns->ctrl->max_integrity_segments);
-		else if (!nvme_ns_has_pi(ns))
+		    (head->features & NVME_NS_METADATA_SUPPORTED))
+			nvme_init_integrity(disk, head,
+					    ctrl->max_integrity_segments);
+		else if (!nvme_ns_has_pi(head))
 			capacity = 0;
 	}
 
 	set_capacity_and_notify(disk, capacity);
 
-	nvme_config_discard(disk, ns);
+	nvme_config_discard(ctrl, disk, head);
 	blk_queue_max_write_zeroes_sectors(disk->queue,
-					   ns->ctrl->max_zeroes_sectors);
+					   ctrl->max_zeroes_sectors);
 }
 
 static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
@@ -1969,7 +2002,7 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
 	    is_power_of_2(ctrl->max_hw_sectors))
 		iob = ctrl->max_hw_sectors;
 	else
-		iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
+		iob = nvme_lba_to_sect(ns->head, le16_to_cpu(id->noiob));
 
 	if (!iob)
 		return;
@@ -2027,14 +2060,26 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 	if (ret)
 		return ret;
 
+	if (id->ncap == 0) {
+		/* namespace not allocated or attached */
+		info->is_removed = true;
+		ret = -ENODEV;
+		goto error;
+	}
+
 	blk_mq_freeze_queue(ns->disk->queue);
 	lbaf = nvme_lbaf_index(id->flbas);
-	ns->lba_shift = id->lbaf[lbaf].ds;
+	ns->head->lba_shift = id->lbaf[lbaf].ds;
+	ns->head->nuse = le64_to_cpu(id->nuse);
 	nvme_set_queue_limits(ns->ctrl, ns->queue);
 
-	nvme_configure_metadata(ns, id);
+	ret = nvme_configure_metadata(ns->ctrl, ns->head, id);
+	if (ret < 0) {
+		blk_mq_unfreeze_queue(ns->disk->queue);
+		goto out;
+	}
 	nvme_set_chunk_sectors(ns, id);
-	nvme_update_disk_info(ns->disk, ns, id);
+	nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id);
 
 	if (ns->head->ids.csi == NVME_CSI_ZNS) {
 		ret = nvme_update_zone_info(ns, lbaf);
@@ -2051,7 +2096,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 	 * do not return zeroes.
 	 */
 	if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
-		ns->features |= NVME_NS_DEAC;
+		ns->head->features |= NVME_NS_DEAC;
 	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
 	set_bit(NVME_NS_READY, &ns->flags);
 	blk_mq_unfreeze_queue(ns->disk->queue);
@@ -2064,7 +2109,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 
 	if (nvme_ns_head_multipath(ns->head)) {
 		blk_mq_freeze_queue(ns->head->disk->queue);
-		nvme_update_disk_info(ns->head->disk, ns, id);
+		nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id);
 		set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
 		nvme_mpath_revalidate_paths(ns);
 		blk_stack_limits(&ns->head->disk->queue->limits,
@@ -2084,6 +2129,8 @@ out:
 		set_bit(NVME_NS_READY, &ns->flags);
 		ret = 0;
 	}
+
+error:
 	kfree(id);
 	return ret;
 }
@@ -2523,7 +2570,7 @@ static void nvme_set_latency_tolerance(struct device *dev, s32 val)
 
 	if (ctrl->ps_max_latency_us != latency) {
 		ctrl->ps_max_latency_us = latency;
-		if (ctrl->state == NVME_CTRL_LIVE)
+		if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
 			nvme_configure_apst(ctrl);
 	}
 }
@@ -2891,14 +2938,6 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
 	struct nvme_id_ctrl_nvm *id;
 	int ret;
 
-	if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
-		ctrl->max_discard_sectors = UINT_MAX;
-		ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
-	} else {
-		ctrl->max_discard_sectors = 0;
-		ctrl->max_discard_segments = 0;
-	}
-
 	/*
 	 * Even though NVMe spec explicitly states that MDTS is not applicable
 	 * to the write-zeroes, we are cautious and limit the size to the
@@ -2928,8 +2967,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
 	if (ret)
 		goto free_data;
 
-	if (id->dmrl)
-		ctrl->max_discard_segments = id->dmrl;
+	ctrl->dmrl = id->dmrl;
 	ctrl->dmrsl = le32_to_cpu(id->dmrsl);
 	if (id->wzsl)
 		ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
@@ -2997,6 +3035,42 @@ static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	return 0;
 }
 
+static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+	/*
+	 * In fabrics we need to verify the cntlid matches the
+	 * admin connect
+	 */
+	if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
+		dev_err(ctrl->device,
+			"Mismatching cntlid: Connect %u vs Identify %u, rejecting\n",
+			ctrl->cntlid, le16_to_cpu(id->cntlid));
+		return -EINVAL;
+	}
+
+	if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
+		dev_err(ctrl->device,
+			"keep-alive support is mandatory for fabrics\n");
+		return -EINVAL;
+	}
+
+	if (!nvme_discovery_ctrl(ctrl) && ctrl->ioccsz < 4) {
+		dev_err(ctrl->device,
+			"I/O queue command capsule supported size %d < 4\n",
+			ctrl->ioccsz);
+		return -EINVAL;
+	}
+
+	if (!nvme_discovery_ctrl(ctrl) && ctrl->iorcsz < 1) {
+		dev_err(ctrl->device,
+			"I/O queue response capsule supported size %d < 1\n",
+			ctrl->iorcsz);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int nvme_init_identify(struct nvme_ctrl *ctrl)
 {
 	struct nvme_id_ctrl *id;
@@ -3109,25 +3183,9 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
 		ctrl->iorcsz = le32_to_cpu(id->iorcsz);
 		ctrl->maxcmd = le16_to_cpu(id->maxcmd);
 
-		/*
-		 * In fabrics we need to verify the cntlid matches the
-		 * admin connect
-		 */
-		if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
-			dev_err(ctrl->device,
-				"Mismatching cntlid: Connect %u vs Identify "
-				"%u, rejecting\n",
-				ctrl->cntlid, le16_to_cpu(id->cntlid));
-			ret = -EINVAL;
-			goto out_free;
-		}
-
-		if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
-			dev_err(ctrl->device,
-				"keep-alive support is mandatory for fabrics\n");
-			ret = -EINVAL;
+		ret = nvme_check_ctrl_fabric_info(ctrl, id);
+		if (ret)
 			goto out_free;
-		}
 	} else {
 		ctrl->hmpre = le32_to_cpu(id->hmpre);
 		ctrl->hmmin = le32_to_cpu(id->hmmin);
@@ -3211,7 +3269,7 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
 	struct nvme_ctrl *ctrl =
 		container_of(inode->i_cdev, struct nvme_ctrl, cdev);
 
-	switch (ctrl->state) {
+	switch (nvme_ctrl_state(ctrl)) {
 	case NVME_CTRL_LIVE:
 		break;
 	default:
@@ -3386,6 +3444,8 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 	head->ns_id = info->nsid;
 	head->ids = info->ids;
 	head->shared = info->is_shared;
+	ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
+	ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
 	kref_init(&head->ref);
 
 	if (head->ids.csi) {
@@ -3633,11 +3693,19 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 		goto out_unlink_ns;
 
 	down_write(&ctrl->namespaces_rwsem);
+	/*
+	 * Ensure that no namespaces are added to the ctrl list after the queues
+	 * are frozen, thereby avoiding a deadlock between scan and reset.
+	 */
+	if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
+		up_write(&ctrl->namespaces_rwsem);
+		goto out_unlink_ns;
+	}
 	nvme_ns_add_to_ctrl_list(ns);
 	up_write(&ctrl->namespaces_rwsem);
 	nvme_get_ctrl(ctrl);
 
-	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
+	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
 		goto out_cleanup_ns_from_list;
 
 	if (!nvme_ns_head_multipath(ns->head))
@@ -3897,7 +3965,7 @@ static void nvme_scan_work(struct work_struct *work)
 	int ret;
 
 	/* No tagset on a live ctrl means IO queues could not created */
-	if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
+	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset)
 		return;
 
 	/*
@@ -3967,7 +4035,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	 * removing the namespaces' disks; fail all the queues now to avoid
 	 * potentially having to clean up the failed sync later.
	 */
-	if (ctrl->state == NVME_CTRL_DEAD)
+	if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
 		nvme_mark_namespaces_dead(ctrl);
 
 	/* this is a no-op when called from the controller reset handler */
@@ -4049,7 +4117,7 @@ static void nvme_async_event_work(struct work_struct *work)
 	 * flushing ctrl async_event_work after changing the controller state
 	 * from LIVE and before freeing the admin queue.
 	 */
-	if (ctrl->state == NVME_CTRL_LIVE)
+	if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
 		ctrl->ops->submit_async_event(ctrl);
 }
 
@@ -4100,6 +4168,8 @@ static void nvme_fw_act_work(struct work_struct *work)
 				struct nvme_ctrl, fw_act_work);
 	unsigned long fw_act_timeout;
 
+	nvme_auth_stop(ctrl);
+
 	if (ctrl->mtfa)
 		fw_act_timeout = jiffies +
 				msecs_to_jiffies(ctrl->mtfa * 100);
@@ -4155,7 +4225,6 @@ static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 		 * firmware activation.
 		 */
 		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
-			nvme_auth_stop(ctrl);
 			requeue = false;
 			queue_work(nvme_wq, &ctrl->fw_act_work);
 		}
@@ -4348,6 +4417,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 {
 	nvme_mpath_stop(ctrl);
 	nvme_auth_stop(ctrl);
+	nvme_stop_keep_alive(ctrl);
 	nvme_stop_failfast_work(ctrl);
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fw_act_work);
@@ -4443,7 +4513,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 {
 	int ret;
 
-	ctrl->state = NVME_CTRL_NEW;
+	WRITE_ONCE(ctrl->state, NVME_CTRL_NEW);
 	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 	spin_lock_init(&ctrl->lock);
 	mutex_init(&ctrl->scan_lock);
@@ -4464,6 +4534,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
+	ctrl->ka_last_check_time = jiffies;
 
 	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
 			PAGE_SIZE);
@@ -4552,6 +4623,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl)
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 		blk_mq_unfreeze_queue(ns->queue);
 	up_read(&ctrl->namespaces_rwsem);
+	clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
 }
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
 
@@ -4585,6 +4657,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
 
+	set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 		blk_freeze_queue_start(ns->queue);
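
A recurring change in this diff is replacing direct reads of ctrl->state with the nvme_ctrl_state() helper and pairing the write side with WRITE_ONCE(). The helper itself lives in drivers/nvme/host/nvme.h, which is not part of this diff; the standalone C sketch below only illustrates the READ_ONCE/WRITE_ONCE snapshot pattern the patch adopts. The READ_ONCE/WRITE_ONCE stand-ins, the struct ctrl type, and the ctrl_state() name here are simplified assumptions for illustration, not the kernel's actual definitions.

    #include <stdio.h>

    /* Simplified stand-ins for the kernel's READ_ONCE()/WRITE_ONCE() macros. */
    #define READ_ONCE(x)        (*(const volatile __typeof__(x) *)&(x))
    #define WRITE_ONCE(x, val)  (*(volatile __typeof__(x) *)&(x) = (val))

    enum ctrl_state { CTRL_NEW, CTRL_LIVE, CTRL_RESETTING };

    struct ctrl {
            enum ctrl_state state;  /* written under a lock, read locklessly */
    };

    /* Read the state exactly once, mirroring what nvme_ctrl_state() does. */
    static inline enum ctrl_state ctrl_state(struct ctrl *c)
    {
            return READ_ONCE(c->state);
    }

    int main(void)
    {
            struct ctrl c;

            WRITE_ONCE(c.state, CTRL_LIVE);         /* publish a new state */

            /*
             * Take one snapshot and test it several times, instead of
             * re-reading c.state for every comparison as the old code did.
             */
            enum ctrl_state state = ctrl_state(&c);

            if (state != CTRL_RESETTING && state != CTRL_NEW)
                    printf("controller is live\n");
            return 0;
    }

The point of the snapshot is visible in hunks such as nvme_fail_nonready_command(), where several comparisons against the same state value now operate on one consistent read rather than multiple racy loads.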