Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--  drivers/nvme/host/apple.c        4
-rw-r--r--  drivers/nvme/host/core.c        84
-rw-r--r--  drivers/nvme/host/ioctl.c       33
-rw-r--r--  drivers/nvme/host/multipath.c   23
-rw-r--r--  drivers/nvme/host/nvme.h        16
-rw-r--r--  drivers/nvme/host/pci.c        279
-rw-r--r--  drivers/nvme/host/pr.c         122
-rw-r--r--  drivers/nvme/host/rdma.c        12
-rw-r--r--  drivers/nvme/host/tcp.c         35
-rw-r--r--  drivers/nvme/host/trace.c       58
-rw-r--r--  drivers/nvme/host/zns.c          2
11 files changed, 468 insertions, 200 deletions
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index b1387dc459a3..4319ab50c10d 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -649,7 +649,7 @@ static bool apple_nvme_handle_cq(struct apple_nvme_queue *q, bool force)
found = apple_nvme_poll_cq(q, &iob);
- if (!rq_list_empty(iob.req_list))
+ if (!rq_list_empty(&iob.req_list))
apple_nvme_complete_batch(&iob);
return found;
@@ -1618,7 +1618,7 @@ static struct platform_driver apple_nvme_driver = {
.pm = pm_sleep_ptr(&apple_nvme_pm_ops),
},
.probe = apple_nvme_probe,
- .remove_new = apple_nvme_remove,
+ .remove = apple_nvme_remove,
.shutdown = apple_nvme_shutdown,
};
module_platform_driver(apple_nvme_driver);
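Both apple.c hunks track tree-wide API changes rather than driver logic: the io_comp_batch req_list became a struct rq_list, so rq_list_empty() now takes the list's address, and the driver renames .remove_new back to .remove after struct platform_driver's .remove was converted to return void. A minimal sketch of the new rq_list draining pattern, assuming the post-conversion <linux/blk-mq.h> helpers (drain_batch is an illustrative name, not part of this patch):

	/* Illustrative only: pop requests off a struct rq_list. */
	static void drain_batch(struct io_comp_batch *iob)
	{
		struct request *req;

		if (rq_list_empty(&iob->req_list))	/* takes a pointer now */
			return;
		while ((req = rq_list_pop(&iob->req_list)))
			blk_mq_end_request(req, BLK_STS_OK);
	}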
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 855b42c92284..a970168a3014 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -42,6 +42,8 @@ struct nvme_ns_info {
bool is_readonly;
bool is_ready;
bool is_removed;
+ bool is_rotational;
+ bool no_vwc;
};
unsigned int admin_timeout = 60;
@@ -700,7 +702,7 @@ void nvme_put_ns(struct nvme_ns *ns)
{
kref_put(&ns->kref, nvme_free_ns);
}
-EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_put_ns, "NVME_TARGET_PASSTHRU");
static inline void nvme_clear_nvme_request(struct request *req)
{
@@ -1121,7 +1123,7 @@ int nvme_execute_rq(struct request *rq, bool at_head)
return nvme_req(rq)->status;
return blk_status_to_errno(status);
}
-EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, "NVME_TARGET_PASSTHRU");
/*
* Returns 0 on success. If the result is negative, it's a Linux error code;
@@ -1201,7 +1203,7 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
return effects;
}
-EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_command_effects, "NVME_TARGET_PASSTHRU");
u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
{
@@ -1221,7 +1223,7 @@ u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
}
return effects;
}
-EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, "NVME_TARGET_PASSTHRU");
void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
struct nvme_command *cmd, int status)
@@ -1266,7 +1268,7 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
break;
}
}
-EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, "NVME_TARGET_PASSTHRU");
/*
* Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
@@ -1303,9 +1305,10 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
}
-static void nvme_keep_alive_finish(struct request *rq,
- blk_status_t status, struct nvme_ctrl *ctrl)
+static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
+ blk_status_t status)
{
+ struct nvme_ctrl *ctrl = rq->end_io_data;
unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
unsigned long delay = nvme_keep_alive_work_period(ctrl);
enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
@@ -1322,17 +1325,20 @@ static void nvme_keep_alive_finish(struct request *rq,
delay = 0;
}
+ blk_mq_free_request(rq);
+
if (status) {
dev_err(ctrl->device,
"failed nvme_keep_alive_end_io error=%d\n",
status);
- return;
+ return RQ_END_IO_NONE;
}
ctrl->ka_last_check_time = jiffies;
ctrl->comp_seen = false;
if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING)
queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
+ return RQ_END_IO_NONE;
}
static void nvme_keep_alive_work(struct work_struct *work)
@@ -1341,7 +1347,6 @@ static void nvme_keep_alive_work(struct work_struct *work)
struct nvme_ctrl, ka_work);
bool comp_seen = ctrl->comp_seen;
struct request *rq;
- blk_status_t status;
ctrl->ka_last_check_time = jiffies;
@@ -1364,9 +1369,9 @@ static void nvme_keep_alive_work(struct work_struct *work)
nvme_init_request(rq, &ctrl->ka_cmd);
rq->timeout = ctrl->kato * HZ;
- status = blk_execute_rq(rq, false);
- nvme_keep_alive_finish(rq, status, ctrl);
- blk_mq_free_request(rq);
+ rq->end_io = nvme_keep_alive_end_io;
+ rq->end_io_data = ctrl;
+ blk_execute_rq_nowait(rq, false);
}
static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
@@ -1639,6 +1644,8 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
+ info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
+ info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
}
kfree(id);
return ret;
@@ -2027,7 +2034,7 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
* or smaller than a sector size yet, so catch this early and don't
* allow block I/O.
*/
- if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
+ if (blk_validate_block_size(bs)) {
bs = (1 << 9);
valid = false;
}
@@ -2064,7 +2071,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
lim->physical_block_size = min(phys_bs, atomic_bs);
lim->io_min = phys_bs;
lim->io_opt = io_opt;
- if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+ if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) &&
+ (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM))
lim->max_write_zeroes_sectors = UINT_MAX;
else
lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
@@ -2185,11 +2193,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
ns->head->ids.csi == NVME_CSI_ZNS)
nvme_update_zone_info(ns, &lim, &zi);
- if (ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+ if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc)
lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
else
lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
+ if (info->is_rotational)
+ lim.features |= BLK_FEAT_ROTATIONAL;
+
/*
* Register a metadata profile for PI, or the plain non-integrity NVMe
* metadata masquerading as Type 0 if supported, otherwise reject block
@@ -3250,8 +3261,9 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct
}
if (!ctrl->maxcmd) {
- dev_err(ctrl->device, "Maximum outstanding commands is 0\n");
- return -EINVAL;
+ dev_warn(ctrl->device,
+ "Firmware bug: maximum outstanding commands is 0\n");
+ ctrl->maxcmd = ctrl->sqsize + 1;
}
return 0;
@@ -3636,6 +3648,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->ns_id = info->nsid;
head->ids = info->ids;
head->shared = info->is_shared;
+ head->rotational = info->is_rotational;
ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
kref_init(&head->ref);
@@ -3809,7 +3822,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
srcu_read_unlock(&ctrl->srcu, srcu_idx);
return ret;
}
-EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, "NVME_TARGET_PASSTHRU");
/*
* Add the namespace to the controller list while keeping the list ordered.
@@ -4017,7 +4030,7 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns_info info = { .nsid = nsid };
struct nvme_ns *ns;
- int ret;
+ int ret = 1;
if (nvme_identify_ns_descs(ctrl, &info))
return;
@@ -4034,9 +4047,10 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
* set up a namespace. If not fall back to the legacy version.
*/
if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
- (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS))
+ (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS) ||
+ ctrl->vs >= NVME_VS(2, 0, 0))
ret = nvme_ns_info_from_id_cs_indep(ctrl, &info);
- else
+ if (ret > 0)
ret = nvme_ns_info_from_identify(ctrl, &info);
if (info.is_removed)
@@ -4591,6 +4605,11 @@ EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
{
+ /*
+ * As we're about to destroy the queue and free tagset
+ * we can not have keep-alive work running.
+ */
+ nvme_stop_keep_alive(ctrl);
blk_mq_destroy_queue(ctrl->admin_q);
blk_put_queue(ctrl->admin_q);
if (ctrl->ops->flags & NVME_F_FABRICS) {
@@ -4895,7 +4914,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl)
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
srcu_read_lock_held(&ctrl->srcu))
- blk_mq_unfreeze_queue(ns->queue);
+ blk_mq_unfreeze_queue_non_owner(ns->queue);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
}
@@ -4940,7 +4959,12 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
srcu_read_lock_held(&ctrl->srcu))
- blk_freeze_queue_start(ns->queue);
+ /*
+ * Typical non_owner use case is from pci driver, in which
+ * start_freeze is called from timeout work function, but
+ * unfreeze is done in reset work context
+ */
+ blk_freeze_queue_start_non_owner(ns->queue);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
@@ -5008,7 +5032,7 @@ struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
return NULL;
return file->private_data;
}
-EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, "NVME_TARGET_PASSTHRU");
/*
* Check we didn't inadvertently grow the command structure sizes:
@@ -5036,6 +5060,8 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_rotational_media_log) != 512);
BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
@@ -5044,22 +5070,20 @@ static inline void _nvme_check_size(void)
static int __init nvme_core_init(void)
{
+ unsigned int wq_flags = WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS;
int result = -ENOMEM;
_nvme_check_size();
- nvme_wq = alloc_workqueue("nvme-wq",
- WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ nvme_wq = alloc_workqueue("nvme-wq", wq_flags, 0);
if (!nvme_wq)
goto out;
- nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
- WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ nvme_reset_wq = alloc_workqueue("nvme-reset-wq", wq_flags, 0);
if (!nvme_reset_wq)
goto destroy_wq;
- nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
- WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ nvme_delete_wq = alloc_workqueue("nvme-delete-wq", wq_flags, 0);
if (!nvme_delete_wq)
goto destroy_reset_wq;
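The headline core.c change converts keep-alive from a blocking blk_execute_rq() call to an asynchronous submission with an end_io callback, so the nvme_wq worker no longer sleeps while the command is in flight; the callback now also owns freeing the request. The general shape of that pattern, sketched under the current block-layer end_io signature (my_done and struct my_ctx are illustrative names, not part of the patch):

	struct my_ctx {
		bool failed;
	};

	static enum rq_end_io_ret my_done(struct request *rq, blk_status_t status)
	{
		struct my_ctx *ctx = rq->end_io_data;

		blk_mq_free_request(rq);	/* callback owns the request */
		if (status)
			ctx->failed = true;
		return RQ_END_IO_NONE;		/* nothing left for blk-mq to do */
	}

	static void submit_async(struct request *rq, struct my_ctx *ctx)
	{
		rq->end_io = my_done;
		rq->end_io_data = ctx;
		blk_execute_rq_nowait(rq, false);	/* false: not at head */
	}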
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a96976b22fa7..e8930146847a 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -114,18 +114,26 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
static int nvme_map_user_request(struct request *req, u64 ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
- u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags)
+ struct io_uring_cmd *ioucmd, unsigned int flags)
{
struct request_queue *q = req->q;
struct nvme_ns *ns = q->queuedata;
struct block_device *bdev = ns ? ns->disk->part0 : NULL;
bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk);
+ struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
bool has_metadata = meta_buffer && meta_len;
struct bio *bio = NULL;
int ret;
- if (has_metadata && !supports_metadata)
- return -EINVAL;
+ if (!nvme_ctrl_sgl_supported(ctrl))
+ dev_warn_once(ctrl->device, "using unchecked data buffer\n");
+ if (has_metadata) {
+ if (!supports_metadata)
+ return -EINVAL;
+ if (!nvme_ctrl_meta_sgl_supported(ctrl))
+ dev_warn_once(ctrl->device,
+ "using unchecked metadata buffer\n");
+ }
if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
struct iov_iter iter;
@@ -152,8 +160,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
bio_set_dev(bio, bdev);
if (has_metadata) {
- ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len,
- meta_seed);
+ ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len);
if (ret)
goto out_unmap;
}
@@ -170,7 +177,7 @@ out:
static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, u64 ubuffer, unsigned bufflen,
- void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
+ void __user *meta_buffer, unsigned meta_len,
u64 *result, unsigned timeout, unsigned int flags)
{
struct nvme_ns *ns = q->queuedata;
@@ -187,7 +194,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
req->timeout = timeout;
if (ubuffer && bufflen) {
ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
- meta_len, meta_seed, NULL, flags);
+ meta_len, NULL, flags);
if (ret)
return ret;
}
@@ -268,7 +275,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
c.rw.lbatm = cpu_to_le16(io.appmask);
return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata,
- meta_len, lower_32_bits(io.slba), NULL, 0, 0);
+ meta_len, NULL, 0, 0);
}
static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
@@ -323,7 +330,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
- cmd.metadata_len, 0, &result, timeout, 0);
+ cmd.metadata_len, &result, timeout, 0);
if (status >= 0) {
if (put_user(result, &ucmd->result))
@@ -370,7 +377,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
- cmd.metadata_len, 0, &cmd.result, timeout, flags);
+ cmd.metadata_len, &cmd.result, timeout, flags);
if (status >= 0) {
if (put_user(cmd.result, &ucmd->result))
@@ -402,7 +409,7 @@ struct nvme_uring_cmd_pdu {
static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
struct io_uring_cmd *ioucmd)
{
- return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
+ return io_uring_cmd_to_pdu(ioucmd, struct nvme_uring_cmd_pdu);
}
static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
@@ -507,7 +514,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (d.addr && d.data_len) {
ret = nvme_map_user_request(req, d.addr,
d.data_len, nvme_to_user_ptr(d.metadata),
- d.metadata_len, 0, ioucmd, vec);
+ d.metadata_len, ioucmd, vec);
if (ret)
return ret;
}
@@ -635,8 +642,6 @@ static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
struct nvme_ctrl *ctrl = ns->ctrl;
int ret;
- BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
-
ret = nvme_uring_cmd_checks(issue_flags);
if (ret)
return ret;
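Two independent cleanups run through ioctl.c: the meta_seed parameter is dropped all the way down now that blk_rq_integrity_map_user() no longer takes a seed, and the open-coded pdu cast plus its BUILD_BUG_ON give way to io_uring_cmd_to_pdu(), which performs the size check itself. The hunks also add one-time warnings when user passthrough buffers cannot be fully validated via SGLs. A small sketch of the pdu helper's use (struct my_cmd_pdu is an illustrative type):

	struct my_cmd_pdu {
		struct request *req;
		int status;
	};

	static inline struct my_cmd_pdu *my_pdu(struct io_uring_cmd *ioucmd)
	{
		/* io_uring_cmd_to_pdu() compile-time checks that the type
		 * fits in ioucmd->pdu, replacing the explicit BUILD_BUG_ON. */
		return io_uring_cmd_to_pdu(ioucmd, struct my_cmd_pdu);
	}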
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 6a15873055b9..a85d190942bd 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -165,7 +165,8 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
int srcu_idx;
srcu_idx = srcu_read_lock(&ctrl->srcu);
- list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
+ list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
+ srcu_read_lock_held(&ctrl->srcu)) {
if (!ns->head->disk)
continue;
kblockd_schedule_work(&ns->head->requeue_work);
@@ -209,7 +210,8 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
int srcu_idx;
srcu_idx = srcu_read_lock(&ctrl->srcu);
- list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
+ list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
+ srcu_read_lock_held(&ctrl->srcu)) {
nvme_mpath_clear_current_path(ns);
kblockd_schedule_work(&ns->head->requeue_work);
}
@@ -224,7 +226,8 @@ void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
int srcu_idx;
srcu_idx = srcu_read_lock(&head->srcu);
- list_for_each_entry_rcu(ns, &head->list, siblings) {
+ list_for_each_entry_srcu(ns, &head->list, siblings,
+ srcu_read_lock_held(&head->srcu)) {
if (capacity != get_capacity(ns->disk))
clear_bit(NVME_NS_READY, &ns->flags);
}
@@ -257,7 +260,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
struct nvme_ns *found = NULL, *fallback = NULL, *ns;
- list_for_each_entry_rcu(ns, &head->list, siblings) {
+ list_for_each_entry_srcu(ns, &head->list, siblings,
+ srcu_read_lock_held(&head->srcu)) {
if (nvme_path_is_disabled(ns))
continue;
@@ -356,7 +360,8 @@ static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
unsigned int depth;
- list_for_each_entry_rcu(ns, &head->list, siblings) {
+ list_for_each_entry_srcu(ns, &head->list, siblings,
+ srcu_read_lock_held(&head->srcu)) {
if (nvme_path_is_disabled(ns))
continue;
@@ -424,7 +429,8 @@ static bool nvme_available_path(struct nvme_ns_head *head)
if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
return NULL;
- list_for_each_entry_rcu(ns, &head->list, siblings) {
+ list_for_each_entry_srcu(ns, &head->list, siblings,
+ srcu_read_lock_held(&head->srcu)) {
if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
continue;
switch (nvme_ctrl_state(ns->ctrl)) {
@@ -635,8 +641,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL;
if (head->ids.csi == NVME_CSI_ZNS)
lim.features |= BLK_FEAT_ZONED;
- else
- lim.max_zone_append_sectors = 0;
head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
if (IS_ERR(head->disk))
@@ -785,7 +789,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
return 0;
srcu_idx = srcu_read_lock(&ctrl->srcu);
- list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
+ list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
+ srcu_read_lock_held(&ctrl->srcu)) {
unsigned nsid;
again:
nsid = le32_to_cpu(desc->nsids[n]);
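Every multipath.c list walk switches from list_for_each_entry_rcu() to list_for_each_entry_srcu(): these lists are protected by SRCU, not plain RCU, and the extra argument lets lockdep verify that the traversal really runs inside the matching read section. The pattern, using fields these hunks already reference:

	int idx;

	idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		/* ns stays valid for the whole SRCU read section */
	}
	srcu_read_unlock(&ctrl->srcu, idx);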
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 093cb423f536..c4bb8dfe1a45 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -173,6 +173,11 @@ enum nvme_quirks {
* MSI (but not MSI-X) interrupts are broken and never fire.
*/
NVME_QUIRK_BROKEN_MSI = (1 << 21),
+
+ /*
+ * Align dma pool segment size to 512 bytes
+ */
+ NVME_QUIRK_DMAPOOL_ALIGN_512 = (1 << 22),
};
/*
@@ -474,6 +479,7 @@ struct nvme_ns_head {
struct list_head entry;
struct kref ref;
bool shared;
+ bool rotational;
bool passthru_err_log_enabled;
struct nvme_effects_log *effects;
u64 nuse;
@@ -1122,7 +1128,15 @@ static inline void nvme_start_request(struct request *rq)
static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
{
- return ctrl->sgls & ((1 << 0) | (1 << 1));
+ return ctrl->sgls & (NVME_CTRL_SGLS_BYTE_ALIGNED |
+ NVME_CTRL_SGLS_DWORD_ALIGNED);
+}
+
+static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl)
+{
+ if (ctrl->ops->flags & NVME_F_FABRICS)
+ return true;
+ return ctrl->sgls & NVME_CTRL_SGLS_MSDS;
}
#ifdef CONFIG_NVME_HOST_AUTH
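nvme.h replaces the magic SGLS bit tests with named constants and adds nvme_ctrl_meta_sgl_supported(); fabrics controllers report true unconditionally since fabrics data transfer is SGL-based by construction. For reference, the assumed values behind the names, matching the previously open-coded tests here and in rdma.c below and the NVMe Identify Controller SGLS field layout:

	enum {
		NVME_CTRL_SGLS_BYTE_ALIGNED	= 1 << 0,
		NVME_CTRL_SGLS_DWORD_ALIGNED	= 1 << 1,
		NVME_CTRL_SGLS_KSDBDS		= 1 << 2,	/* keyed SGL data block */
		NVME_CTRL_SGLS_MSDS		= 1 << 19,	/* SGL metadata support */
		NVME_CTRL_SGLS_SAOS		= 1 << 20,	/* SGL address as offset */
	};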
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4b9fda0b1d9a..e2634f437f33 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -43,6 +43,7 @@
*/
#define NVME_MAX_KB_SZ 8192
#define NVME_MAX_SEGS 128
+#define NVME_MAX_META_SEGS 15
#define NVME_MAX_NR_ALLOCATIONS 5
static int use_threaded_interrupts;
@@ -141,8 +142,10 @@ struct nvme_dev {
struct nvme_ctrl ctrl;
u32 last_ps;
bool hmb;
+ struct sg_table *hmb_sgt;
mempool_t *iod_mempool;
+ mempool_t *iod_meta_mempool;
/* shadow doorbell buffer support: */
__le32 *dbbuf_dbs;
@@ -153,6 +156,7 @@ struct nvme_dev {
/* host memory buffer support: */
u64 host_mem_size;
u32 nr_host_mem_descs;
+ u32 host_mem_descs_size;
dma_addr_t host_mem_descs_dma;
struct nvme_host_mem_buf_desc *host_mem_descs;
void **host_mem_desc_bufs;
@@ -237,6 +241,8 @@ struct nvme_iod {
dma_addr_t first_dma;
dma_addr_t meta_dma;
struct sg_table sgt;
+ struct sg_table meta_sgt;
+ union nvme_descriptor meta_list;
union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
};
@@ -504,6 +510,15 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
spin_unlock(&nvmeq->sq_lock);
}
+static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev,
+ struct request *req)
+{
+ if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+ return false;
+ return req->nr_integrity_segments > 1 ||
+ nvme_req(req)->flags & NVME_REQ_USERCMD;
+}
+
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
int nseg)
{
@@ -516,8 +531,10 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
return false;
if (!nvmeq->qid)
return false;
+ if (nvme_pci_metadata_use_sgls(dev, req))
+ return true;
if (!sgl_threshold || avg_seg_size < sgl_threshold)
- return false;
+ return nvme_req(req)->flags & NVME_REQ_USERCMD;
return true;
}
@@ -778,7 +795,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
- if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
+ if (!nvme_pci_metadata_use_sgls(dev, req) &&
+ (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
return nvme_setup_prp_simple(dev, req,
&cmnd->rw, &bv);
@@ -822,11 +840,69 @@ out_free_sg:
return ret;
}
-static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
- struct nvme_command *cmnd)
+static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
+ struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_rw_command *cmnd = &iod->cmd.rw;
+ struct nvme_sgl_desc *sg_list;
+ struct scatterlist *sgl, *sg;
+ unsigned int entries;
+ dma_addr_t sgl_dma;
+ int rc, i;
+
+ iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
+ if (!iod->meta_sgt.sgl)
+ return BLK_STS_RESOURCE;
+
+ sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
+ iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
+ iod->meta_sgt.sgl);
+ if (!iod->meta_sgt.orig_nents)
+ goto out_free_sg;
+
+ rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
+ DMA_ATTR_NO_WARN);
+ if (rc)
+ goto out_free_sg;
+
+ sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
+ if (!sg_list)
+ goto out_unmap_sg;
+
+ entries = iod->meta_sgt.nents;
+ iod->meta_list.sg_list = sg_list;
+ iod->meta_dma = sgl_dma;
+
+ cmnd->flags = NVME_CMD_SGL_METASEG;
+ cmnd->metadata = cpu_to_le64(sgl_dma);
+
+ sgl = iod->meta_sgt.sgl;
+ if (entries == 1) {
+ nvme_pci_sgl_set_data(sg_list, sgl);
+ return BLK_STS_OK;
+ }
+
+ sgl_dma += sizeof(*sg_list);
+ nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
+ for_each_sg(sgl, sg, entries, i)
+ nvme_pci_sgl_set_data(&sg_list[i + 1], sg);
+
+ return BLK_STS_OK;
+
+out_unmap_sg:
+ dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
+out_free_sg:
+ mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
+ return BLK_STS_RESOURCE;
+}
+
+static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
+ struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct bio_vec bv = rq_integrity_vec(req);
+ struct nvme_command *cmnd = &iod->cmd;
iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
if (dma_mapping_error(dev->dev, iod->meta_dma))
@@ -835,6 +911,13 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
return BLK_STS_OK;
}
+static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
+{
+ if (nvme_pci_metadata_use_sgls(dev, req))
+ return nvme_pci_setup_meta_sgls(dev, req);
+ return nvme_pci_setup_meta_mptr(dev, req);
+}
+
static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -843,6 +926,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
iod->aborted = false;
iod->nr_allocations = -1;
iod->sgt.nents = 0;
+ iod->meta_sgt.nents = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
if (ret)
@@ -855,7 +939,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
}
if (blk_integrity_rq(req)) {
- ret = nvme_map_metadata(dev, req, &iod->cmd);
+ ret = nvme_map_metadata(dev, req);
if (ret)
goto out_unmap_data;
}
@@ -902,11 +986,12 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
-static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
+static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct rq_list *rqlist)
{
+ struct request *req;
+
spin_lock(&nvmeq->sq_lock);
- while (!rq_list_empty(*rqlist)) {
- struct request *req = rq_list_pop(rqlist);
+ while ((req = rq_list_pop(rqlist))) {
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
nvme_sq_copy_cmd(nvmeq, &iod->cmd);
@@ -929,34 +1014,45 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
}
-static void nvme_queue_rqs(struct request **rqlist)
+static void nvme_queue_rqs(struct rq_list *rqlist)
{
- struct request *req, *next, *prev = NULL;
- struct request *requeue_list = NULL;
+ struct rq_list submit_list = { };
+ struct rq_list requeue_list = { };
+ struct nvme_queue *nvmeq = NULL;
+ struct request *req;
- rq_list_for_each_safe(rqlist, req, next) {
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ while ((req = rq_list_pop(rqlist))) {
+ if (nvmeq && nvmeq != req->mq_hctx->driver_data)
+ nvme_submit_cmds(nvmeq, &submit_list);
+ nvmeq = req->mq_hctx->driver_data;
+
+ if (nvme_prep_rq_batch(nvmeq, req))
+ rq_list_add_tail(&submit_list, req);
+ else
+ rq_list_add_tail(&requeue_list, req);
+ }
- if (!nvme_prep_rq_batch(nvmeq, req)) {
- /* detach 'req' and add to remainder list */
- rq_list_move(rqlist, &requeue_list, req, prev);
+ if (nvmeq)
+ nvme_submit_cmds(nvmeq, &submit_list);
+ *rqlist = requeue_list;
+}
- req = prev;
- if (!req)
- continue;
- }
+static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
+ struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- if (!next || req->mq_hctx != next->mq_hctx) {
- /* detach rest of list, and submit */
- req->rq_next = NULL;
- nvme_submit_cmds(nvmeq, rqlist);
- *rqlist = next;
- prev = NULL;
- } else
- prev = req;
+ if (!iod->meta_sgt.nents) {
+ dma_unmap_page(dev->dev, iod->meta_dma,
+ rq_integrity_vec(req).bv_len,
+ rq_dma_dir(req));
+ return;
}
- *rqlist = requeue_list;
+ dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
+ iod->meta_dma);
+ dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
+ mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
}
static __always_inline void nvme_pci_unmap_rq(struct request *req)
@@ -964,12 +1060,8 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req)
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_dev *dev = nvmeq->dev;
- if (blk_integrity_rq(req)) {
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
- dma_unmap_page(dev->dev, iod->meta_dma,
- rq_integrity_vec(req).bv_len, rq_dma_dir(req));
- }
+ if (blk_integrity_rq(req))
+ nvme_unmap_metadata(dev, req);
if (blk_rq_nr_phys_segments(req))
nvme_unmap_data(dev, req);
@@ -1083,7 +1175,7 @@ static irqreturn_t nvme_irq(int irq, void *data)
DEFINE_IO_COMP_BATCH(iob);
if (nvme_poll_cq(nvmeq, &iob)) {
- if (!rq_list_empty(iob.req_list))
+ if (!rq_list_empty(&iob.req_list))
nvme_pci_complete_batch(&iob);
return IRQ_HANDLED;
}
@@ -1951,7 +2043,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
return ret;
}
-static void nvme_free_host_mem(struct nvme_dev *dev)
+static void nvme_free_host_mem_multi(struct nvme_dev *dev)
{
int i;
@@ -1966,18 +2058,54 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
kfree(dev->host_mem_desc_bufs);
dev->host_mem_desc_bufs = NULL;
- dma_free_coherent(dev->dev,
- dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
+}
+
+static void nvme_free_host_mem(struct nvme_dev *dev)
+{
+ if (dev->hmb_sgt)
+ dma_free_noncontiguous(dev->dev, dev->host_mem_size,
+ dev->hmb_sgt, DMA_BIDIRECTIONAL);
+ else
+ nvme_free_host_mem_multi(dev);
+
+ dma_free_coherent(dev->dev, dev->host_mem_descs_size,
dev->host_mem_descs, dev->host_mem_descs_dma);
dev->host_mem_descs = NULL;
+ dev->host_mem_descs_size = 0;
dev->nr_host_mem_descs = 0;
}
-static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
+static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
+{
+ dev->hmb_sgt = dma_alloc_noncontiguous(dev->dev, size,
+ DMA_BIDIRECTIONAL, GFP_KERNEL, 0);
+ if (!dev->hmb_sgt)
+ return -ENOMEM;
+
+ dev->host_mem_descs = dma_alloc_coherent(dev->dev,
+ sizeof(*dev->host_mem_descs), &dev->host_mem_descs_dma,
+ GFP_KERNEL);
+ if (!dev->host_mem_descs) {
+ dma_free_noncontiguous(dev->dev, dev->host_mem_size,
+ dev->hmb_sgt, DMA_BIDIRECTIONAL);
+ dev->hmb_sgt = NULL;
+ return -ENOMEM;
+ }
+ dev->host_mem_size = size;
+ dev->host_mem_descs_size = sizeof(*dev->host_mem_descs);
+ dev->nr_host_mem_descs = 1;
+
+ dev->host_mem_descs[0].addr =
+ cpu_to_le64(dev->hmb_sgt->sgl->dma_address);
+ dev->host_mem_descs[0].size = cpu_to_le32(size / NVME_CTRL_PAGE_SIZE);
+ return 0;
+}
+
+static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
u32 chunk_size)
{
struct nvme_host_mem_buf_desc *descs;
- u32 max_entries, len;
+ u32 max_entries, len, descs_size;
dma_addr_t descs_dma;
int i = 0;
void **bufs;
@@ -1990,8 +2118,9 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
max_entries = dev->ctrl.hmmaxd;
- descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
- &descs_dma, GFP_KERNEL);
+ descs_size = max_entries * sizeof(*descs);
+ descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma,
+ GFP_KERNEL);
if (!descs)
goto out;
@@ -2020,6 +2149,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
dev->host_mem_size = size;
dev->host_mem_descs = descs;
dev->host_mem_descs_dma = descs_dma;
+ dev->host_mem_descs_size = descs_size;
dev->host_mem_desc_bufs = bufs;
return 0;
@@ -2034,8 +2164,7 @@ out_free_bufs:
kfree(bufs);
out_free_descs:
- dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
- descs_dma);
+ dma_free_coherent(dev->dev, descs_size, descs, descs_dma);
out:
dev->host_mem_descs = NULL;
return -ENOMEM;
@@ -2043,13 +2172,23 @@ out:
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
+ unsigned long dma_merge_boundary = dma_get_merge_boundary(dev->dev);
u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
u64 chunk_size;
+ /*
+ * If there is an IOMMU that can merge pages, try a virtually
+ * non-contiguous allocation for a single segment first.
+ */
+ if (dma_merge_boundary && (PAGE_SIZE & dma_merge_boundary) == 0) {
+ if (!nvme_alloc_host_mem_single(dev, preferred))
+ return 0;
+ }
+
/* start big and work our way down */
for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
- if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
+ if (!nvme_alloc_host_mem_multi(dev, preferred, chunk_size)) {
if (!min || dev->host_mem_size >= min)
return 0;
nvme_free_host_mem(dev);
@@ -2097,8 +2236,10 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
}
dev_info(dev->ctrl.device,
- "allocated %lld MiB host memory buffer.\n",
- dev->host_mem_size >> ilog2(SZ_1M));
+ "allocated %lld MiB host memory buffer (%u segment%s).\n",
+ dev->host_mem_size >> ilog2(SZ_1M),
+ dev->nr_host_mem_descs,
+ str_plural(dev->nr_host_mem_descs));
}
ret = nvme_set_host_mem(dev, enable_bits);
@@ -2693,15 +2834,20 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
+ size_t small_align = 256;
+
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
NVME_CTRL_PAGE_SIZE,
NVME_CTRL_PAGE_SIZE, 0);
if (!dev->prp_page_pool)
return -ENOMEM;
+ if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
+ small_align = 512;
+
/* Optimisation for I/Os between 4k and 128k */
dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
- 256, 256, 0);
+ 256, small_align, 0);
if (!dev->prp_small_pool) {
dma_pool_destroy(dev->prp_page_pool);
return -ENOMEM;
@@ -2717,6 +2863,7 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
+ size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
dev->iod_mempool = mempool_create_node(1,
@@ -2725,7 +2872,18 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
dev_to_node(dev->dev));
if (!dev->iod_mempool)
return -ENOMEM;
+
+ dev->iod_meta_mempool = mempool_create_node(1,
+ mempool_kmalloc, mempool_kfree,
+ (void *)meta_size, GFP_KERNEL,
+ dev_to_node(dev->dev));
+ if (!dev->iod_meta_mempool)
+ goto free;
+
return 0;
+free:
+ mempool_destroy(dev->iod_mempool);
+ return -ENOMEM;
}
static void nvme_free_tagset(struct nvme_dev *dev)
@@ -2790,6 +2948,11 @@ static void nvme_reset_work(struct work_struct *work)
if (result)
goto out;
+ if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+ dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
+ else
+ dev->ctrl.max_integrity_segments = 1;
+
nvme_dbbuf_dma_alloc(dev);
result = nvme_setup_host_mem(dev);
@@ -3057,11 +3220,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
dev->ctrl.max_hw_sectors = min_t(u32,
NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9);
dev->ctrl.max_segments = NVME_MAX_SEGS;
-
- /*
- * There is no support for SGLs for metadata (yet), so we are limited to
- * a single integrity segment for the separate metadata pointer.
- */
dev->ctrl.max_integrity_segments = 1;
return dev;
@@ -3124,6 +3282,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto out_disable;
+ if (nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+ dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
+ else
+ dev->ctrl.max_integrity_segments = 1;
+
nvme_dbbuf_dma_alloc(dev);
result = nvme_setup_host_mem(dev);
@@ -3166,6 +3329,7 @@ out_disable:
nvme_free_queues(dev, 0);
out_release_iod_mempool:
mempool_destroy(dev->iod_mempool);
+ mempool_destroy(dev->iod_meta_mempool);
out_release_prp_pools:
nvme_release_prp_pools(dev);
out_dev_unmap:
@@ -3231,6 +3395,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
mempool_destroy(dev->iod_mempool);
+ mempool_destroy(dev->iod_meta_mempool);
nvme_release_prp_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
@@ -3429,12 +3594,10 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DEALLOCATE_ZEROES |
NVME_QUIRK_IGNORE_DEV_SUBNQN |
NVME_QUIRK_BOGUS_NID, },
{ PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */
- .driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DEALLOCATE_ZEROES, },
+ .driver_data = NVME_QUIRK_STRIPE_SIZE, },
{ PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_MEDIUM_PRIO_SQ |
@@ -3449,7 +3612,7 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */
- .driver_data = NVME_QUIRK_QDEPTH_ONE },
+ .driver_data = NVME_QUIRK_DMAPOOL_ALIGN_512, },
{ PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_BOGUS_NID, },
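The pci.c diff carries three related features: metadata SGL support (up to NVME_MAX_META_SEGS integrity segments, mapped through a new iod_meta_mempool), the struct rq_list rewrite of nvme_queue_rqs(), and a host-memory-buffer fast path that tries one virtually non-contiguous allocation before falling back to chunked descriptors. A sketch of that single-segment HMB idea, assuming dma_get_merge_boundary() reported an IOMMU able to merge page-aligned sglist entries (hmb_single is an illustrative name):

	#include <linux/dma-mapping.h>

	static int hmb_single(struct device *dma_dev, size_t size,
			      struct nvme_host_mem_buf_desc *desc,
			      struct sg_table **sgtp)
	{
		struct sg_table *sgt;

		sgt = dma_alloc_noncontiguous(dma_dev, size, DMA_BIDIRECTIONAL,
					      GFP_KERNEL, 0);
		if (!sgt)
			return -ENOMEM;
		/* with IOMMU merging the whole buffer has one IOVA */
		desc->addr = cpu_to_le64(sgt->sgl->dma_address);
		desc->size = cpu_to_le32(size / NVME_CTRL_PAGE_SIZE);
		*sgtp = sgt;
		return 0;
	}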
diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c
index dc7922f22600..cf2d2c5039dd 100644
--- a/drivers/nvme/host/pr.c
+++ b/drivers/nvme/host/pr.c
@@ -94,109 +94,137 @@ static int nvme_status_to_pr_err(int status)
}
}
-static int nvme_send_pr_command(struct block_device *bdev,
- struct nvme_command *c, void *data, unsigned int data_len)
+static int __nvme_send_pr_command(struct block_device *bdev, u32 cdw10,
+ u32 cdw11, u8 op, void *data, unsigned int data_len)
{
- if (nvme_disk_is_ns_head(bdev->bd_disk))
- return nvme_send_ns_head_pr_command(bdev, c, data, data_len);
+ struct nvme_command c = { 0 };
- return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c, data,
- data_len);
+ c.common.opcode = op;
+ c.common.cdw10 = cpu_to_le32(cdw10);
+ c.common.cdw11 = cpu_to_le32(cdw11);
+
+ if (nvme_disk_is_ns_head(bdev->bd_disk))
+ return nvme_send_ns_head_pr_command(bdev, &c, data, data_len);
+ return nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c,
+ data, data_len);
}
-static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
- u64 key, u64 sa_key, u8 op)
+static int nvme_send_pr_command(struct block_device *bdev, u32 cdw10, u32 cdw11,
+ u8 op, void *data, unsigned int data_len)
{
- struct nvme_command c = { };
- u8 data[16] = { 0, };
int ret;
- put_unaligned_le64(key, &data[0]);
- put_unaligned_le64(sa_key, &data[8]);
-
- c.common.opcode = op;
- c.common.cdw10 = cpu_to_le32(cdw10);
-
- ret = nvme_send_pr_command(bdev, &c, data, sizeof(data));
- if (ret < 0)
- return ret;
-
- return nvme_status_to_pr_err(ret);
+ ret = __nvme_send_pr_command(bdev, cdw10, cdw11, op, data, data_len);
+ return ret < 0 ? ret : nvme_status_to_pr_err(ret);
}
-static int nvme_pr_register(struct block_device *bdev, u64 old,
- u64 new, unsigned flags)
+static int nvme_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
+ unsigned int flags)
{
+ struct nvmet_pr_register_data data = { 0 };
u32 cdw10;
if (flags & ~PR_FL_IGNORE_KEY)
return -EOPNOTSUPP;
- cdw10 = old ? 2 : 0;
- cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
- cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
- return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
+ data.crkey = cpu_to_le64(old_key);
+ data.nrkey = cpu_to_le64(new_key);
+
+ cdw10 = old_key ? NVME_PR_REGISTER_ACT_REPLACE :
+ NVME_PR_REGISTER_ACT_REG;
+ cdw10 |= (flags & PR_FL_IGNORE_KEY) ? NVME_PR_IGNORE_KEY : 0;
+ cdw10 |= NVME_PR_CPTPL_PERSIST;
+
+ return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_register,
+ &data, sizeof(data));
}
static int nvme_pr_reserve(struct block_device *bdev, u64 key,
enum pr_type type, unsigned flags)
{
+ struct nvmet_pr_acquire_data data = { 0 };
u32 cdw10;
if (flags & ~PR_FL_IGNORE_KEY)
return -EOPNOTSUPP;
- cdw10 = nvme_pr_type_from_blk(type) << 8;
- cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
- return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
+ data.crkey = cpu_to_le64(key);
+
+ cdw10 = NVME_PR_ACQUIRE_ACT_ACQUIRE;
+ cdw10 |= nvme_pr_type_from_blk(type) << 8;
+ cdw10 |= (flags & PR_FL_IGNORE_KEY) ? NVME_PR_IGNORE_KEY : 0;
+
+ return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_acquire,
+ &data, sizeof(data));
}
static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
enum pr_type type, bool abort)
{
- u32 cdw10 = nvme_pr_type_from_blk(type) << 8 | (abort ? 2 : 1);
+ struct nvmet_pr_acquire_data data = { 0 };
+ u32 cdw10;
+
+ data.crkey = cpu_to_le64(old);
+ data.prkey = cpu_to_le64(new);
- return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
+ cdw10 = abort ? NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT :
+ NVME_PR_ACQUIRE_ACT_PREEMPT;
+ cdw10 |= nvme_pr_type_from_blk(type) << 8;
+
+ return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_acquire,
+ &data, sizeof(data));
}
static int nvme_pr_clear(struct block_device *bdev, u64 key)
{
- u32 cdw10 = 1 | (key ? 0 : 1 << 3);
+ struct nvmet_pr_release_data data = { 0 };
+ u32 cdw10;
+
+ data.crkey = cpu_to_le64(key);
- return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
+ cdw10 = NVME_PR_RELEASE_ACT_CLEAR;
+ cdw10 |= key ? 0 : NVME_PR_IGNORE_KEY;
+
+ return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_release,
+ &data, sizeof(data));
}
static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
- u32 cdw10 = nvme_pr_type_from_blk(type) << 8 | (key ? 0 : 1 << 3);
+ struct nvmet_pr_release_data data = { 0 };
+ u32 cdw10;
+
+ data.crkey = cpu_to_le64(key);
- return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
+ cdw10 = NVME_PR_RELEASE_ACT_RELEASE;
+ cdw10 |= nvme_pr_type_from_blk(type) << 8;
+ cdw10 |= key ? 0 : NVME_PR_IGNORE_KEY;
+
+ return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_release,
+ &data, sizeof(data));
}
static int nvme_pr_resv_report(struct block_device *bdev, void *data,
u32 data_len, bool *eds)
{
- struct nvme_command c = { };
+ u32 cdw10, cdw11;
int ret;
- c.common.opcode = nvme_cmd_resv_report;
- c.common.cdw10 = cpu_to_le32(nvme_bytes_to_numd(data_len));
- c.common.cdw11 = cpu_to_le32(NVME_EXTENDED_DATA_STRUCT);
+ cdw10 = nvme_bytes_to_numd(data_len);
+ cdw11 = NVME_EXTENDED_DATA_STRUCT;
*eds = true;
retry:
- ret = nvme_send_pr_command(bdev, &c, data, data_len);
+ ret = __nvme_send_pr_command(bdev, cdw10, cdw11, nvme_cmd_resv_report,
+ data, data_len);
if (ret == NVME_SC_HOST_ID_INCONSIST &&
- c.common.cdw11 == cpu_to_le32(NVME_EXTENDED_DATA_STRUCT)) {
- c.common.cdw11 = 0;
+ cdw11 == NVME_EXTENDED_DATA_STRUCT) {
+ cdw11 = 0;
*eds = false;
goto retry;
}
- if (ret < 0)
- return ret;
-
- return nvme_status_to_pr_err(ret);
+ return ret < 0 ? ret : nvme_status_to_pr_err(ret);
}
static int nvme_pr_read_keys(struct block_device *bdev,
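pr.c stops hand-packing reservation keys into a raw u8[16] with put_unaligned_le64() and instead sends the typed payloads shared with the target code (nvmet_pr_register_data and friends), with named constants for the cdw10 action fields. The wire format is unchanged; the register payload, per the structure used in these hunks:

	/* 16-byte Reservation Register data, little-endian on the wire:
	 *   bytes 0..7   CRKEY (current reservation key)
	 *   bytes 8..15  NRKEY (new reservation key)
	 */
	struct nvmet_pr_register_data data = { 0 };

	data.crkey = cpu_to_le64(old_key);  /* was put_unaligned_le64(key, &data[0]) */
	data.nrkey = cpu_to_le64(new_key);  /* was put_unaligned_le64(sa_key, &data[8]) */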
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 24a2759798d0..86a2891d9bcc 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1019,7 +1019,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
goto destroy_admin;
}
- if (!(ctrl->ctrl.sgls & (1 << 2))) {
+ if (!(ctrl->ctrl.sgls & NVME_CTRL_SGLS_KSDBDS)) {
ret = -EOPNOTSUPP;
dev_err(ctrl->ctrl.device,
"Mandatory keyed sgls are not supported!\n");
@@ -1051,7 +1051,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
}
- if (ctrl->ctrl.sgls & (1 << 20))
+ if (ctrl->ctrl.sgls & NVME_CTRL_SGLS_SAOS)
ctrl->use_inline_data = true;
if (ctrl->ctrl.queue_count > 1) {
@@ -1091,13 +1091,7 @@ destroy_io:
}
destroy_admin:
nvme_stop_keep_alive(&ctrl->ctrl);
- nvme_quiesce_admin_queue(&ctrl->ctrl);
- blk_sync_queue(ctrl->ctrl.admin_q);
- nvme_rdma_stop_queue(&ctrl->queues[0]);
- nvme_cancel_admin_tagset(&ctrl->ctrl);
- if (new)
- nvme_remove_admin_tag_set(&ctrl->ctrl);
- nvme_rdma_destroy_admin_queue(ctrl);
+ nvme_rdma_teardown_admin_queue(ctrl, new);
return ret;
}
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 3e416af2659f..b127d41dbbfe 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2024,14 +2024,6 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
return __nvme_tcp_alloc_io_queues(ctrl);
}
-static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
-{
- nvme_tcp_stop_io_queues(ctrl);
- if (remove)
- nvme_remove_io_tag_set(ctrl);
- nvme_tcp_free_io_queues(ctrl);
-}
-
static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
{
int ret, nr_queues;
@@ -2101,14 +2093,6 @@ out_free_io_queues:
return ret;
}
-static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
-{
- nvme_tcp_stop_queue(ctrl, 0);
- if (remove)
- nvme_remove_admin_tag_set(ctrl);
- nvme_tcp_free_admin_queue(ctrl);
-}
-
static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
{
int error;
@@ -2163,9 +2147,11 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
blk_sync_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
nvme_cancel_admin_tagset(ctrl);
- if (remove)
+ if (remove) {
nvme_unquiesce_admin_queue(ctrl);
- nvme_tcp_destroy_admin_queue(ctrl, remove);
+ nvme_remove_admin_tag_set(ctrl);
+ }
+ nvme_tcp_free_admin_queue(ctrl);
if (ctrl->tls_pskid) {
dev_dbg(ctrl->device, "Wipe negotiated TLS_PSK %08x\n",
ctrl->tls_pskid);
@@ -2178,14 +2164,15 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
{
if (ctrl->queue_count <= 1)
return;
- nvme_quiesce_admin_queue(ctrl);
nvme_quiesce_io_queues(ctrl);
nvme_sync_io_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
nvme_cancel_tagset(ctrl);
- if (remove)
+ if (remove) {
nvme_unquiesce_io_queues(ctrl);
- nvme_tcp_destroy_io_queues(ctrl, remove);
+ nvme_remove_io_tag_set(ctrl);
+ }
+ nvme_tcp_free_io_queues(ctrl);
}
static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
@@ -2274,11 +2261,13 @@ destroy_io:
nvme_sync_io_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
nvme_cancel_tagset(ctrl);
- nvme_tcp_destroy_io_queues(ctrl, new);
+ if (new)
+ nvme_remove_io_tag_set(ctrl);
+ nvme_tcp_free_io_queues(ctrl);
}
destroy_admin:
nvme_stop_keep_alive(ctrl);
- nvme_tcp_teardown_admin_queue(ctrl, false);
+ nvme_tcp_teardown_admin_queue(ctrl, new);
return ret;
}
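Together with the rdma.c hunk above, these tcp.c changes fold the separate destroy helpers into the teardown and error-unwind paths, so every exit runs one shared sequence. Sketched order for the I/O queues (the admin side mirrors it; not a verbatim quote of the resulting code):

	/* quiesce -> sync -> stop -> cancel, then conditionally drop tags:
	 *	nvme_quiesce_io_queues(ctrl);
	 *	nvme_sync_io_queues(ctrl);
	 *	nvme_tcp_stop_io_queues(ctrl);
	 *	nvme_cancel_tagset(ctrl);
	 *	if (remove)		// 'new' on the connect error path
	 *		nvme_remove_io_tag_set(ctrl);
	 *	nvme_tcp_free_io_queues(ctrl);
	 */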
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 87c437fc070d..ad25ad1e4041 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -228,27 +228,61 @@ static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10)
static const char *nvme_trace_resv_reg(struct trace_seq *p, u8 *cdw10)
{
+ static const char * const rrega_strs[] = {
+ [0x00] = "register",
+ [0x01] = "unregister",
+ [0x02] = "replace",
+ };
const char *ret = trace_seq_buffer_ptr(p);
u8 rrega = cdw10[0] & 0x7;
u8 iekey = (cdw10[0] >> 3) & 0x1;
u8 ptpl = (cdw10[3] >> 6) & 0x3;
+ const char *rrega_str;
+
+ if (rrega < ARRAY_SIZE(rrega_strs) && rrega_strs[rrega])
+ rrega_str = rrega_strs[rrega];
+ else
+ rrega_str = "reserved";
- trace_seq_printf(p, "rrega=%u, iekey=%u, ptpl=%u",
- rrega, iekey, ptpl);
+ trace_seq_printf(p, "rrega=%u:%s, iekey=%u, ptpl=%u",
+ rrega, rrega_str, iekey, ptpl);
trace_seq_putc(p, 0);
return ret;
}
+static const char * const rtype_strs[] = {
+ [0x00] = "reserved",
+ [0x01] = "write exclusive",
+ [0x02] = "exclusive access",
+ [0x03] = "write exclusive registrants only",
+ [0x04] = "exclusive access registrants only",
+ [0x05] = "write exclusive all registrants",
+ [0x06] = "exclusive access all registrants",
+};
+
static const char *nvme_trace_resv_acq(struct trace_seq *p, u8 *cdw10)
{
+ static const char * const racqa_strs[] = {
+ [0x00] = "acquire",
+ [0x01] = "preempt",
+ [0x02] = "preempt and abort",
+ };
const char *ret = trace_seq_buffer_ptr(p);
u8 racqa = cdw10[0] & 0x7;
u8 iekey = (cdw10[0] >> 3) & 0x1;
u8 rtype = cdw10[1];
+ const char *racqa_str = "reserved";
+ const char *rtype_str = "reserved";
- trace_seq_printf(p, "racqa=%u, iekey=%u, rtype=%u",
- racqa, iekey, rtype);
+ if (racqa < ARRAY_SIZE(racqa_strs) && racqa_strs[racqa])
+ racqa_str = racqa_strs[racqa];
+
+ if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype])
+ rtype_str = rtype_strs[rtype];
+
+ trace_seq_printf(p, "racqa=%u:%s, iekey=%u, rtype=%u:%s",
+ racqa, racqa_str, iekey, rtype, rtype_str);
trace_seq_putc(p, 0);
return ret;
@@ -256,13 +290,25 @@ static const char *nvme_trace_resv_acq(struct trace_seq *p, u8 *cdw10)
static const char *nvme_trace_resv_rel(struct trace_seq *p, u8 *cdw10)
{
+ static const char * const rrela_strs[] = {
+ [0x00] = "release",
+ [0x01] = "clear",
+ };
const char *ret = trace_seq_buffer_ptr(p);
u8 rrela = cdw10[0] & 0x7;
u8 iekey = (cdw10[0] >> 3) & 0x1;
u8 rtype = cdw10[1];
+ const char *rrela_str = "reserved";
+ const char *rtype_str = "reserved";
+
+ if (rrela < ARRAY_SIZE(rrela_strs) && rrela_strs[rrela])
+ rrela_str = rrela_strs[rrela];
+
+ if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype])
+ rtype_str = rtype_strs[rtype];
- trace_seq_printf(p, "rrela=%u, iekey=%u, rtype=%u",
- rrela, iekey, rtype);
+ trace_seq_printf(p, "rrela=%u:%s, iekey=%u, rtype=%u:%s",
+ rrela, rrela_str, iekey, rtype, rtype_str);
trace_seq_putc(p, 0);
return ret;
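The trace decoders gain human-readable action and reservation-type names next to the raw field values. The pattern is a sparse designated-initializer table indexed by the protocol value, with a bounds-plus-NULL test so undefined codes print as "reserved"; condensed:

	static const char * const acts[] = {
		[0x00] = "acquire",
		[0x01] = "preempt",
		[0x02] = "preempt and abort",
	};
	const char *name = "reserved";

	if (val < ARRAY_SIZE(acts) && acts[val])
		name = acts[val];
	trace_seq_printf(p, "racqa=%u:%s", val, name);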
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 9a06f9d98cd6..382949e18c6a 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -111,7 +111,7 @@ void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
lim->features |= BLK_FEAT_ZONED;
lim->max_open_zones = zi->max_open_zones;
lim->max_active_zones = zi->max_active_zones;
- lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
+ lim->max_hw_zone_append_sectors = ns->ctrl->max_zone_append;
lim->chunk_sectors = ns->head->zsze =
nvme_lba_to_sect(ns->head, zi->zone_size);
}
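The zns.c change (and the matching multipath.c hunk) follows the block layer's split of the zone-append limit: drivers now report only the hardware limit via max_hw_zone_append_sectors, and the core computes the effective value, which is why multipath no longer needs to zero the old field for non-zoned heads. Roughly what the block layer derives during limit validation (a sketch, not the verbatim implementation):

	/* effective limit = hw limit, capped by zone size and max_hw_sectors */
	lim->max_zone_append_sectors =
		min_not_zero(lim->max_hw_zone_append_sectors,
			     min(lim->chunk_sectors, lim->max_hw_sectors));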