diff options
41 files changed, 9875 insertions, 477 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index ed42cb65a19b..b2190b166e4b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8153,6 +8153,13 @@ S: Supported F: drivers/nvme/host/ F: include/linux/nvme.h +NVM EXPRESS TARGET DRIVER +M: Christoph Hellwig <[email protected]> +M: Sagi Grimberg <[email protected]> +S: Supported +F: drivers/nvme/target/ + NVMEM FRAMEWORK M: Srinivas Kandagatla <[email protected]> M: Maxime Ripard <[email protected]> diff --git a/block/blk-core.c b/block/blk-core.c index db31a2981223..dd325638e102 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3385,6 +3385,7 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie) return false; } +EXPORT_SYMBOL_GPL(blk_poll); #ifdef CONFIG_PM /** diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 56a0c37a3d06..729bac3a673b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -485,6 +485,32 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) +{ + int i, j, ret = 0; + + if (!set->ops->reinit_request) + goto out; + + for (i = 0; i < set->nr_hw_queues; i++) { + struct blk_mq_tags *tags = set->tags[i]; + + for (j = 0; j < tags->nr_tags; j++) { + if (!tags->rqs[j]) + continue; + + ret = set->ops->reinit_request(set->driver_data, + tags->rqs[j]); + if (ret) + goto out; + } + } + +out: + return ret; +} +EXPORT_SYMBOL_GPL(blk_mq_reinit_tagset); + void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 13f460368759..7aa60c4f56fd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -267,6 +267,45 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, } EXPORT_SYMBOL(blk_mq_alloc_request); +struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, + unsigned int flags, unsigned int hctx_idx) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + struct request *rq; + struct blk_mq_alloc_data alloc_data; + int ret; + + /* + * If the tag allocator sleeps we could get an allocation for a + * different hardware context. No need to complicate the low level + * allocator for this for the rare use case of a command tied to + * a specific queue. + */ + if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT))) + return ERR_PTR(-EINVAL); + + if (hctx_idx >= q->nr_hw_queues) + return ERR_PTR(-EIO); + + ret = blk_queue_enter(q, true); + if (ret) + return ERR_PTR(ret); + + hctx = q->queue_hw_ctx[hctx_idx]; + ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); + + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); + rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + if (!rq) { + blk_queue_exit(q); + return ERR_PTR(-EWOULDBLOCK); + } + + return rq; +} +EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); + static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq) { diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 1ee391846e0b..e937fcf71769 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -1015,7 +1015,7 @@ probe_err_7: probe_err_6: blk_cleanup_queue(host->breq); probe_err_5: - unregister_blkdev(MG_DISK_MAJ, MG_DISK_NAME); + unregister_blkdev(host->major, MG_DISK_NAME); probe_err_4: if (!prv_data->use_polling) free_irq(host->irq, host); diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 85a339030e4b..61c68a1f054a 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -27,11 +27,13 @@ config NVM_DEBUG It is required to create/remove targets without IOCTLs. config NVM_GENNVM - tristate "Generic NVM manager for Open-Channel SSDs" + tristate "General Non-Volatile Memory Manager for Open-Channel SSDs" ---help--- - NVM media manager for Open-Channel SSDs that offload management - functionality to device, while keeping data placement and garbage - collection decisions on the host. + Non-volatile memory media manager for Open-Channel SSDs that implements + physical media metadata management and block provisioning API. + + This is the standard media manager for using Open-Channel SSDs, and + required for targets to be instantiated. config NVM_RRPC tristate "Round-robin Hybrid Open-Channel SSD target" diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 160c1a6838e1..9ebd2cfbd849 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -18,8 +18,6 @@ * */ -#include <linux/blkdev.h> -#include <linux/blk-mq.h> #include <linux/list.h> #include <linux/types.h> #include <linux/sem.h> @@ -28,46 +26,42 @@ #include <linux/miscdevice.h> #include <linux/lightnvm.h> #include <linux/sched/sysctl.h> -#include <uapi/linux/lightnvm.h> static LIST_HEAD(nvm_tgt_types); +static DECLARE_RWSEM(nvm_tgtt_lock); static LIST_HEAD(nvm_mgrs); static LIST_HEAD(nvm_devices); -static LIST_HEAD(nvm_targets); static DECLARE_RWSEM(nvm_lock); -static struct nvm_target *nvm_find_target(const char *name) +struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) { - struct nvm_target *tgt; + struct nvm_tgt_type *tmp, *tt = NULL; - list_for_each_entry(tgt, &nvm_targets, list) - if (!strcmp(name, tgt->disk->disk_name)) - return tgt; + if (lock) + down_write(&nvm_tgtt_lock); - return NULL; -} - -static struct nvm_tgt_type *nvm_find_target_type(const char *name) -{ - struct nvm_tgt_type *tt; - - list_for_each_entry(tt, &nvm_tgt_types, list) - if (!strcmp(name, tt->name)) - return tt; + list_for_each_entry(tmp, &nvm_tgt_types, list) + if (!strcmp(name, tmp->name)) { + tt = tmp; + break; + } - return NULL; + if (lock) + up_write(&nvm_tgtt_lock); + return tt; } +EXPORT_SYMBOL(nvm_find_target_type); int nvm_register_tgt_type(struct nvm_tgt_type *tt) { int ret = 0; - down_write(&nvm_lock); - if (nvm_find_target_type(tt->name)) + down_write(&nvm_tgtt_lock); + if (nvm_find_target_type(tt->name, 0)) ret = -EEXIST; else list_add(&tt->list, &nvm_tgt_types); - up_write(&nvm_lock); + up_write(&nvm_tgtt_lock); return ret; } @@ -110,7 +104,7 @@ static struct nvmm_type *nvm_find_mgr_type(const char *name) return NULL; } -struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev) +static struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev) { struct nvmm_type *mt; int ret; @@ -182,20 +176,6 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) return NULL; } -struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *dev, struct nvm_lun *lun, - unsigned long flags) -{ - return dev->mt->get_blk_unlocked(dev, lun, flags); -} -EXPORT_SYMBOL(nvm_get_blk_unlocked); - -/* Assumes that all valid pages have already been moved on release to bm */ -void nvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk) -{ - return dev->mt->put_blk_unlocked(dev, blk); -} -EXPORT_SYMBOL(nvm_put_blk_unlocked); - struct nvm_block *nvm_get_blk(struct nvm_dev *dev, struct nvm_lun *lun, unsigned long flags) { @@ -210,6 +190,12 @@ void nvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) } EXPORT_SYMBOL(nvm_put_blk); +void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type) +{ + return dev->mt->mark_blk(dev, ppa, type); +} +EXPORT_SYMBOL(nvm_mark_blk); + int nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { return dev->mt->submit_io(dev, rqd); @@ -251,9 +237,10 @@ void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) EXPORT_SYMBOL(nvm_generic_to_addr_mode); int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, - struct ppa_addr *ppas, int nr_ppas, int vblk) + const struct ppa_addr *ppas, int nr_ppas, int vblk) { int i, plane_cnt, pl_idx; + struct ppa_addr ppa; if ((!vblk || dev->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) { rqd->nr_ppas = nr_ppas; @@ -278,8 +265,9 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, for (i = 0; i < nr_ppas; i++) { for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppas[i].g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i]; + ppa = ppas[i]; + ppa.g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; } } } @@ -337,7 +325,7 @@ static void nvm_end_io_sync(struct nvm_rq *rqd) complete(waiting); } -int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode, +static int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode, int flags, void *buf, int len) { DECLARE_COMPLETION_ONSTACK(wait); @@ -367,7 +355,9 @@ int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode, /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; if (hang_check) - while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); + while (!wait_for_completion_io_timeout(&wait, + hang_check * (HZ/2))) + ; else wait_for_completion_io(&wait); @@ -510,7 +500,8 @@ static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) /* The lower page table encoding consists of a list of bytes, where each * has a lower and an upper half. The first half byte maintains the * increment value and every value after is an offset added to the - * previous incrementation value */ + * previous incrementation value + */ dev->lptbl[0] = mlc->pairs[0] & 0xF; for (i = 1; i < dev->lps_per_blk; i++) { p = mlc->pairs[i >> 1]; @@ -596,42 +587,11 @@ err_fmtype: return ret; } -static void nvm_remove_target(struct nvm_target *t) -{ - struct nvm_tgt_type *tt = t->type; - struct gendisk *tdisk = t->disk; - struct request_queue *q = tdisk->queue; - - lockdep_assert_held(&nvm_lock); - - del_gendisk(tdisk); - blk_cleanup_queue(q); - - if (tt->exit) - tt->exit(tdisk->private_data); - - put_disk(tdisk); - - list_del(&t->list); - kfree(t); -} - static void nvm_free_mgr(struct nvm_dev *dev) { - struct nvm_target *tgt, *tmp; - if (!dev->mt) return; - down_write(&nvm_lock); - list_for_each_entry_safe(tgt, tmp, &nvm_targets, list) { - if (tgt->dev != dev) - continue; - - nvm_remove_target(tgt); - } - up_write(&nvm_lock); - dev->mt->unregister_mgr(dev); dev->mt = NULL; } @@ -778,91 +738,6 @@ void nvm_unregister(char *disk_name) } EXPORT_SYMBOL(nvm_unregister); -static const struct block_device_operations nvm_fops = { - .owner = THIS_MODULE, -}; - -static int nvm_create_target(struct nvm_dev *dev, - struct nvm_ioctl_create *create) -{ - struct nvm_ioctl_create_simple *s = &create->conf.s; - struct request_queue *tqueue; - struct gendisk *tdisk; - struct nvm_tgt_type *tt; - struct nvm_target *t; - void *targetdata; - - if (!dev->mt) { - pr_info("nvm: device has no media manager registered.\n"); - return -ENODEV; - } - - down_write(&nvm_lock); - tt = nvm_find_target_type(create->tgttype); - if (!tt) { - pr_err("nvm: target type %s not found\n", create->tgttype); - up_write(&nvm_lock); - return -EINVAL; - } - - t = nvm_find_target(create->tgtname); - if (t) { - pr_err("nvm: target name already exists.\n"); - up_write(&nvm_lock); - return -EINVAL; - } - up_write(&nvm_lock); - - t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); - if (!t) - return -ENOMEM; - - tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); - if (!tqueue) - goto err_t; - blk_queue_make_request(tqueue, tt->make_rq); - - tdisk = alloc_disk(0); - if (!tdisk) - goto err_queue; - - sprintf(tdisk->disk_name, "%s", create->tgtname); - tdisk->flags = GENHD_FL_EXT_DEVT; - tdisk->major = 0; - tdisk->first_minor = 0; - tdisk->fops = &nvm_fops; - tdisk->queue = tqueue; - - targetdata = tt->init(dev, tdisk, s->lun_begin, s->lun_end); - if (IS_ERR(targetdata)) - goto err_init; - - tdisk->private_data = targetdata; - tqueue->queuedata = targetdata; - - blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect); - - set_capacity(tdisk, tt->capacity(targetdata)); - add_disk(tdisk); - - t->type = tt; - t->disk = tdisk; - t->dev = dev; - - down_write(&nvm_lock); - list_add_tail(&t->list, &nvm_targets); - up_write(&nvm_lock); - - return 0; -err_init: - put_disk(tdisk); -err_queue: - blk_cleanup_queue(tqueue); -err_t: - kfree(t); - return -ENOMEM; -} - static int __nvm_configure_create(struct nvm_ioctl_create *create) { struct nvm_dev *dev; @@ -871,11 +746,17 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) down_write(&nvm_lock); dev = nvm_find_nvm_dev(create->dev); up_write(&nvm_lock); + if (!dev) { pr_err("nvm: device not found\n"); return -EINVAL; } + if (!dev->mt) { + pr_info("nvm: device has no media manager registered.\n"); + return -ENODEV; + } + if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) { pr_err("nvm: config type not valid\n"); return -EINVAL; @@ -888,25 +769,7 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create) return -EINVAL; } - return nvm_create_target(dev, create); -} - -static int __nvm_configure_remove(struct nvm_ioctl_remove *remove) -{ - struct nvm_target *t; - - down_write(&nvm_lock); - t = nvm_find_target(remove->tgtname); - if (!t) { - pr_err("nvm: target \"%s\" doesn't exist.\n", remove->tgtname); - up_write(&nvm_lock); - return -EINVAL; - } - - nvm_remove_target(t); - up_write(&nvm_lock); - - return 0; + return dev->mt->create_tgt(dev, create); } #ifdef CONFIG_NVM_DEBUG @@ -941,8 +804,9 @@ static int nvm_configure_show(const char *val) static int nvm_configure_remove(const char *val) { struct nvm_ioctl_remove remove; + struct nvm_dev *dev; char opcode; - int ret; + int ret = 0; ret = sscanf(val, "%c %256s", &opcode, remove.tgtname); if (ret != 2) { @@ -952,7 +816,13 @@ static int nvm_configure_remove(const char *val) remove.flags = 0; - return __nvm_configure_remove(&remove); + list_for_each_entry(dev, &nvm_devices, devices) { + ret = dev->mt->remove_tgt(dev, &remove); + if (!ret) + break; + } + + return ret; } static int nvm_configure_create(const char *val) @@ -1149,6 +1019,8 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg) static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) { struct nvm_ioctl_remove remove; + struct nvm_dev *dev; + int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1163,7 +1035,13 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) return -EINVAL; } - return __nvm_configure_remove(&remove); + list_for_each_entry(dev, &nvm_devices, devices) { + ret = dev->mt->remove_tgt(dev, &remove); + if (!ret) + break; + } + + return ret; } static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info) diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index ec9fb6876e38..b74174c6d021 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -15,22 +15,160 @@ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, * USA. * - * Implementation of a generic nvm manager for Open-Channel SSDs. + * Implementation of a general nvm manager for Open-Channel SSDs. */ #include "gennvm.h" -static int gennvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len) +static struct nvm_target *gen_find_target(struct gen_dev *gn, const char *name) { - struct gen_nvm *gn = dev->mp; - struct gennvm_area *area, *prev, *next; + struct nvm_target *tgt; + + list_for_each_entry(tgt, &gn->targets, list) + if (!strcmp(name, tgt->disk->disk_name)) + return tgt; + + return NULL; +} + +static const struct block_device_operations gen_fops = { + .owner = THIS_MODULE, +}; + +static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) +{ + struct gen_dev *gn = dev->mp; + struct nvm_ioctl_create_simple *s = &create->conf.s; + struct request_queue *tqueue; + struct gendisk *tdisk; + struct nvm_tgt_type *tt; + struct nvm_target *t; + void *targetdata; + + tt = nvm_find_target_type(create->tgttype, 1); + if (!tt) { + pr_err("nvm: target type %s not found\n", create->tgttype); + return -EINVAL; + } + + mutex_lock(&gn->lock); + t = gen_find_target(gn, create->tgtname); + if (t) { + pr_err("nvm: target name already exists.\n"); + mutex_unlock(&gn->lock); + return -EINVAL; + } + mutex_unlock(&gn->lock); + + t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); + if (!t) + return -ENOMEM; + + tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node); + if (!tqueue) + goto err_t; + blk_queue_make_request(tqueue, tt->make_rq); + + tdisk = alloc_disk(0); + if (!tdisk) + goto err_queue; + + sprintf(tdisk->disk_name, "%s", create->tgtname); + tdisk->flags = GENHD_FL_EXT_DEVT; + tdisk->major = 0; + tdisk->first_minor = 0; + tdisk->fops = &gen_fops; + tdisk->queue = tqueue; + + targetdata = tt->init(dev, tdisk, s->lun_begin, s->lun_end); + if (IS_ERR(targetdata)) + goto err_init; + + tdisk->private_data = targetdata; + tqueue->queuedata = targetdata; + + blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect); + + set_capacity(tdisk, tt->capacity(targetdata)); + add_disk(tdisk); + + t->type = tt; + t->disk = tdisk; + t->dev = dev; + + mutex_lock(&gn->lock); + list_add_tail(&t->list, &gn->targets); + mutex_unlock(&gn->lock); + + return 0; +err_init: + put_disk(tdisk); +err_queue: + blk_cleanup_queue(tqueue); +err_t: + kfree(t); + return -ENOMEM; +} + +static void __gen_remove_target(struct nvm_target *t) +{ + struct nvm_tgt_type *tt = t->type; + struct gendisk *tdisk = t->disk; + struct request_queue *q = tdisk->queue; + + del_gendisk(tdisk); + blk_cleanup_queue(q); + + if (tt->exit) + tt->exit(tdisk->private_data); + + put_disk(tdisk); + + list_del(&t->list); + kfree(t); +} + +/** + * gen_remove_tgt - Removes a target from the media manager + * @dev: device + * @remove: ioctl structure with target name to remove. + * + * Returns: + * 0: on success + * 1: on not found + * <0: on error + */ +static int gen_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) +{ + struct gen_dev *gn = dev->mp; + struct nvm_target *t; + + if (!gn) + return 1; + + mutex_lock(&gn->lock); + t = gen_find_target(gn, remove->tgtname); + if (!t) { + mutex_unlock(&gn->lock); + return 1; + } + __gen_remove_target(t); + mutex_unlock(&gn->lock); + + return 0; +} + +static int gen_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len) +{ + struct gen_dev *gn = dev->mp; + struct gen_area *area, *prev, *next; sector_t begin = 0; sector_t max_sectors = (dev->sec_size * dev->total_secs) >> 9; if (len > max_sectors) return -EINVAL; - area = kmalloc(sizeof(struct gennvm_area), GFP_KERNEL); + area = kmalloc(sizeof(struct gen_area), GFP_KERNEL); if (!area) return -ENOMEM; @@ -64,10 +202,10 @@ static int gennvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len) return 0; } -static void gennvm_put_area(struct nvm_dev *dev, sector_t begin) +static void gen_put_area(struct nvm_dev *dev, sector_t begin) { - struct gen_nvm *gn = dev->mp; - struct gennvm_area *area; + struct gen_dev *gn = dev->mp; + struct gen_area *area; spin_lock(&dev->lock); list_for_each_entry(area, &gn->area_list, list) { @@ -82,27 +220,27 @@ static void gennvm_put_area(struct nvm_dev *dev, sector_t begin) spin_unlock(&dev->lock); } -static void gennvm_blocks_free(struct nvm_dev *dev) +static void gen_blocks_free(struct nvm_dev *dev) { - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; struct gen_lun *lun; int i; - gennvm_for_each_lun(gn, lun, i) { + gen_for_each_lun(gn, lun, i) { if (!lun->vlun.blocks) break; vfree(lun->vlun.blocks); } } -static void gennvm_luns_free(struct nvm_dev *dev) +static void gen_luns_free(struct nvm_dev *dev) { - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; kfree(gn->luns); } -static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) +static int gen_luns_init(struct nvm_dev *dev, struct gen_dev *gn) { struct gen_lun *lun; int i; @@ -111,7 +249,7 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) if (!gn->luns) return -ENOMEM; - gennvm_for_each_lun(gn, lun, i) { + gen_for_each_lun(gn, lun, i) { spin_lock_init(&lun->vlun.lock); INIT_LIST_HEAD(&lun->free_list); INIT_LIST_HEAD(&lun->used_list); @@ -122,14 +260,11 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) lun->vlun.lun_id = i % dev->luns_per_chnl; lun->vlun.chnl_id = i / dev->luns_per_chnl; lun->vlun.nr_free_blocks = dev->blks_per_lun; - lun->vlun.nr_open_blocks = 0; - lun->vlun.nr_closed_blocks = 0; - lun->vlun.nr_bad_blocks = 0; } return 0; } -static int gennvm_block_bb(struct gen_nvm *gn, struct ppa_addr ppa, +static int gen_block_bb(struct gen_dev *gn, struct ppa_addr ppa, u8 *blks, int nr_blks) { struct nvm_dev *dev = gn->dev; @@ -149,17 +284,16 @@ static int gennvm_block_bb(struct gen_nvm *gn, struct ppa_addr ppa, blk = &lun->vlun.blocks[i]; list_move_tail(&blk->list, &lun->bb_list); - lun->vlun.nr_bad_blocks++; lun->vlun.nr_free_blocks--; } return 0; } -static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) +static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) { struct nvm_dev *dev = private; - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; u64 elba = slba + nlb; struct gen_lun *lun; struct nvm_block *blk; @@ -167,7 +301,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) int lun_id; if (unlikely(elba > dev->total_secs)) { - pr_err("gennvm: L2P data from device is out of bounds!\n"); + pr_err("gen: L2P data from device is out of bounds!\n"); return -EINVAL; } @@ -175,7 +309,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) u64 pba = le64_to_cpu(entries[i]); if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) { - pr_err("gennvm: L2P data entry is out of bounds!\n"); + pr_err("gen: L2P data entry is out of bounds!\n"); return -EINVAL; } @@ -200,16 +334,15 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) * block state. The block is assumed to be open. */ list_move_tail(&blk->list, &lun->used_list); - blk->state = NVM_BLK_ST_OPEN; + blk->state = NVM_BLK_ST_TGT; lun->vlun.nr_free_blocks--; - lun->vlun.nr_open_blocks++; } } return 0; } -static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) +static int gen_blocks_init(struct nvm_dev *dev, struct gen_dev *gn) { struct gen_lun *lun; struct nvm_block *block; @@ -222,7 +355,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) if (!blks) return -ENOMEM; - gennvm_for_each_lun(gn, lun, lun_iter) { + gen_for_each_lun(gn, lun, lun_iter) { lun->vlun.blocks = vzalloc(sizeof(struct nvm_block) * dev->blks_per_lun); if (!lun->vlun.blocks) { @@ -256,20 +389,20 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) ret = nvm_get_bb_tbl(dev, ppa, blks); if (ret) - pr_err("gennvm: could not get BB table\n"); + pr_err("gen: could not get BB table\n"); - ret = gennvm_block_bb(gn, ppa, blks, nr_blks); + ret = gen_block_bb(gn, ppa, blks, nr_blks); if (ret) - pr_err("gennvm: BB table map failed\n"); + pr_err("gen: BB table map failed\n"); } } if ((dev->identity.dom & NVM_RSP_L2P) && dev->ops->get_l2p_tbl) { ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs, - gennvm_block_map, dev); + gen_block_map, dev); if (ret) { - pr_err("gennvm: could not read L2P table.\n"); - pr_warn("gennvm: default block initialization"); + pr_err("gen: could not read L2P table.\n"); + pr_warn("gen: default block initialization"); } } @@ -277,67 +410,79 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) return 0; } -static void gennvm_free(struct nvm_dev *dev) +static void gen_free(struct nvm_dev *dev) { - gennvm_blocks_free(dev); - gennvm_luns_free(dev); + gen_blocks_free(dev); + gen_luns_free(dev); kfree(dev->mp); dev->mp = NULL; } -static int gennvm_register(struct nvm_dev *dev) +static int gen_register(struct nvm_dev *dev) { - struct gen_nvm *gn; + struct gen_dev *gn; int ret; if (!try_module_get(THIS_MODULE)) return -ENODEV; - gn = kzalloc(sizeof(struct gen_nvm), GFP_KERNEL); + gn = kzalloc(sizeof(struct gen_dev), GFP_KERNEL); if (!gn) return -ENOMEM; gn->dev = dev; gn->nr_luns = dev->nr_luns; INIT_LIST_HEAD(&gn->area_list); + mutex_init(&gn->lock); + INIT_LIST_HEAD(&gn->targets); dev->mp = gn; - ret = gennvm_luns_init(dev, gn); + ret = gen_luns_init(dev, gn); if (ret) { - pr_err("gennvm: could not initialize luns\n"); + pr_err("gen: could not initialize luns\n"); goto err; } - ret = gennvm_blocks_init(dev, gn); + ret = gen_blocks_init(dev, gn); if (ret) { - pr_err("gennvm: could not initialize blocks\n"); + pr_err("gen: could not initialize blocks\n"); goto err; } return 1; err: - gennvm_free(dev); + gen_free(dev); module_put(THIS_MODULE); return ret; } -static void gennvm_unregister(struct nvm_dev *dev) +static void gen_unregister(struct nvm_dev *dev) { - gennvm_free(dev); + struct gen_dev *gn = dev->mp; + struct nvm_target *t, *tmp; + + mutex_lock(&gn->lock); + list_for_each_entry_safe(t, tmp, &gn->targets, list) { + if (t->dev != dev) + continue; + __gen_remove_target(t); + } + mutex_unlock(&gn->lock); + + gen_free(dev); module_put(THIS_MODULE); } -static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev, +static struct nvm_block *gen_get_blk(struct nvm_dev *dev, struct nvm_lun *vlun, unsigned long flags) { struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); struct nvm_block *blk = NULL; int is_gc = flags & NVM_IOTYPE_GC; - assert_spin_locked(&vlun->lock); - + spin_lock(&vlun->lock); if (list_empty(&lun->free_list)) { - pr_err_ratelimited("gennvm: lun %u have no free pages available", + pr_err_ratelimited("gen: lun %u have no free pages available", lun->vlun.id); goto out; } @@ -346,88 +491,58 @@ static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev, goto out; blk = list_first_entry(&lun->free_list, struct nvm_block, list); - list_move_tail(&blk->list, &lun->used_list); - blk->state = NVM_BLK_ST_OPEN; + list_move_tail(&blk->list, &lun->used_list); + blk->state = NVM_BLK_ST_TGT; lun->vlun.nr_free_blocks--; - lun->vlun.nr_open_blocks++; - out: - return blk; -} - -static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, - struct nvm_lun *vlun, unsigned long flags) -{ - struct nvm_block *blk; - - spin_lock(&vlun->lock); - blk = gennvm_get_blk_unlocked(dev, vlun, flags); spin_unlock(&vlun->lock); return blk; } -static void gennvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk) +static void gen_put_blk(struct nvm_dev *dev, struct nvm_block *blk) { struct nvm_lun *vlun = blk->lun; struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); - assert_spin_locked(&vlun->lock); - - if (blk->state & NVM_BLK_ST_OPEN) { - list_move_tail(&blk->list, &lun->free_list); - lun->vlun.nr_open_blocks--; - lun->vlun.nr_free_blocks++; - blk->state = NVM_BLK_ST_FREE; - } else if (blk->state & NVM_BLK_ST_CLOSED) { + spin_lock(&vlun->lock); + if (blk->state & NVM_BLK_ST_TGT) { list_move_tail(&blk->list, &lun->free_list); - lun->vlun.nr_closed_blocks--; lun->vlun.nr_free_blocks++; blk->state = NVM_BLK_ST_FREE; } else if (blk->state & NVM_BLK_ST_BAD) { list_move_tail(&blk->list, &lun->bb_list); - lun->vlun.nr_bad_blocks++; blk->state = NVM_BLK_ST_BAD; } else { WARN_ON_ONCE(1); - pr_err("gennvm: erroneous block type (%lu -> %u)\n", + pr_err("gen: erroneous block type (%lu -> %u)\n", blk->id, blk->state); list_move_tail(&blk->list, &lun->bb_list); - lun->vlun.nr_bad_blocks++; - blk->state = NVM_BLK_ST_BAD; } -} - -static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) -{ - struct nvm_lun *vlun = blk->lun; - - spin_lock(&vlun->lock); - gennvm_put_blk_unlocked(dev, blk); spin_unlock(&vlun->lock); } -static void gennvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type) +static void gen_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type) { - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; struct gen_lun *lun; struct nvm_block *blk; - pr_debug("gennvm: ppa (ch: %u lun: %u blk: %u pg: %u) -> %u\n", + pr_debug("gen: ppa (ch: %u lun: %u blk: %u pg: %u) -> %u\n", ppa.g.ch, ppa.g.lun, ppa.g.blk, ppa.g.pg, type); if (unlikely(ppa.g.ch > dev->nr_chnls || ppa.g.lun > dev->luns_per_chnl || ppa.g.blk > dev->blks_per_lun)) { WARN_ON_ONCE(1); - pr_err("gennvm: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u", + pr_err("gen: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u", ppa.g.ch, dev->nr_chnls, ppa.g.lun, dev->luns_per_chnl, ppa.g.blk, dev->blks_per_lun); return; } - lun = &gn->luns[ppa.g.lun * ppa.g.ch]; + lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun]; blk = &lun->vlun.blocks[ppa.g.blk]; /* will be moved to bb list on put_blk from target */ @@ -435,9 +550,9 @@ static void gennvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type) } /* - * mark block bad in gennvm. It is expected that the target recovers separately + * mark block bad in gen. It is expected that the target recovers separately */ -static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) +static void gen_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) { int bit = -1; int max_secs = dev->ops->max_phys_sect; @@ -447,25 +562,25 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) /* look up blocks and mark them as bad */ if (rqd->nr_ppas == 1) { - gennvm_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD); + gen_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD); return; } while ((bit = find_next_bit(comp_bits, max_secs, bit + 1)) < max_secs) - gennvm_mark_blk(dev, rqd->ppa_list[bit], NVM_BLK_ST_BAD); + gen_mark_blk(dev, rqd->ppa_list[bit], NVM_BLK_ST_BAD); } -static void gennvm_end_io(struct nvm_rq *rqd) +static void gen_end_io(struct nvm_rq *rqd) { struct nvm_tgt_instance *ins = rqd->ins; if (rqd->error == NVM_RSP_ERR_FAILWRITE) - gennvm_mark_blk_bad(rqd->dev, rqd); + gen_mark_blk_bad(rqd->dev, rqd); ins->tt->end_io(rqd); } -static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static int gen_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { if (!dev->ops->submit_io) return -ENODEV; @@ -474,11 +589,11 @@ static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) nvm_generic_to_addr_mode(dev, rqd); rqd->dev = dev; - rqd->end_io = gennvm_end_io; + rqd->end_io = gen_end_io; return dev->ops->submit_io(dev, rqd); } -static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, +static int gen_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, unsigned long flags) { struct ppa_addr addr = block_to_ppa(dev, blk); @@ -486,19 +601,19 @@ static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, return nvm_erase_ppa(dev, &addr, 1); } -static int gennvm_reserve_lun(struct nvm_dev *dev, int lunid) +static int gen_reserve_lun(struct nvm_dev *dev, int lunid) { return test_and_set_bit(lunid, dev->lun_map); } -static void gennvm_release_lun(struct nvm_dev *dev, int lunid) +static void gen_release_lun(struct nvm_dev *dev, int lunid) { WARN_ON(!test_and_clear_bit(lunid, dev->lun_map)); } -static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) +static struct nvm_lun *gen_get_lun(struct nvm_dev *dev, int lunid) { - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; if (unlikely(lunid >= dev->nr_luns)) return NULL; @@ -506,66 +621,62 @@ static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) return &gn->luns[lunid].vlun; } -static void gennvm_lun_info_print(struct nvm_dev *dev) +static void gen_lun_info_print(struct nvm_dev *dev) { - struct gen_nvm *gn = dev->mp; + struct gen_dev *gn = dev->mp; struct gen_lun *lun; unsigned int i; - gennvm_for_each_lun(gn, lun, i) { + gen_for_each_lun(gn, lun, i) { spin_lock(&lun->vlun.lock); - pr_info("%s: lun%8u\t%u\t%u\t%u\t%u\n", - dev->name, i, - lun->vlun.nr_free_blocks, - lun->vlun.nr_open_blocks, - lun->vlun.nr_closed_blocks, - lun->vlun.nr_bad_blocks); + pr_info("%s: lun%8u\t%u\n", dev->name, i, + lun->vlun.nr_free_blocks); spin_unlock(&lun->vlun.lock); } } -static struct nvmm_type gennvm = { +static struct nvmm_type gen = { .name = "gennvm", .version = {0, 1, 0}, - .register_mgr = gennvm_register, - .unregister_mgr = gennvm_unregister, + .register_mgr = gen_register, + .unregister_mgr = gen_unregister, - .get_blk_unlocked = gennvm_get_blk_unlocked, - .put_blk_unlocked = gennvm_put_blk_unlocked, + .create_tgt = gen_create_tgt, + .remove_tgt = gen_remove_tgt, - .get_blk = gennvm_get_blk, - .put_blk = gennvm_put_blk, + .get_blk = gen_get_blk, + .put_blk = gen_put_blk, - .submit_io = gennvm_submit_io, - .erase_blk = gennvm_erase_blk, + .submit_io = gen_submit_io, + .erase_blk = gen_erase_blk, - .mark_blk = gennvm_mark_blk, + .mark_blk = gen_mark_blk, - .get_lun = gennvm_get_lun, - .reserve_lun = gennvm_reserve_lun, - .release_lun = gennvm_release_lun, - .lun_info_print = gennvm_lun_info_print, + .get_lun = gen_get_lun, + .reserve_lun = gen_reserve_lun, + .release_lun = gen_release_lun, + .lun_info_print = gen_lun_info_print, - .get_area = gennvm_get_area, - .put_area = gennvm_put_area, + .get_area = gen_get_area, + .put_area = gen_put_area, }; -static int __init gennvm_module_init(void) +static int __init gen_module_init(void) { - return nvm_register_mgr(&gennvm); + return nvm_register_mgr(&gen); } -static void gennvm_module_exit(void) +static void gen_module_exit(void) { - nvm_unregister_mgr(&gennvm); + nvm_unregister_mgr(&gen); } -module_init(gennvm_module_init); -module_exit(gennvm_module_exit); +module_init(gen_module_init); +module_exit(gen_module_exit); MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Generic media manager for Open-Channel SSDs"); +MODULE_DESCRIPTION("General media manager for Open-Channel SSDs"); diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h index 04d7c23cfc61..8ecfa817d21d 100644 --- a/drivers/lightnvm/gennvm.h +++ b/drivers/lightnvm/gennvm.h @@ -34,20 +34,24 @@ struct gen_lun { */ }; -struct gen_nvm { +struct gen_dev { struct nvm_dev *dev; int nr_luns; struct gen_lun *luns; struct list_head area_list; + + struct mutex lock; + struct list_head targets; }; -struct gennvm_area { +struct gen_area { struct list_head list; sector_t begin; sector_t end; /* end is excluded */ }; -#define gennvm_for_each_lun(bm, lun, i) \ + +#define gen_for_each_lun(bm, lun, i) \ for ((i) = 0, lun = &(bm)->luns[0]; \ (i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)]) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index de86d72dcdf0..fa1ab0421489 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -48,7 +48,7 @@ static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a) } static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba, - unsigned len) + unsigned int len) { sector_t i; @@ -96,10 +96,13 @@ static void rrpc_discard(struct rrpc *rrpc, struct bio *bio) sector_t len = bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE; struct nvm_rq *rqd; - do { + while (1) { rqd = rrpc_inflight_laddr_acquire(rrpc, slba, len); + if (rqd) + break; + schedule(); - } while (!rqd); + } if (IS_ERR(rqd)) { pr_err("rrpc: unable to acquire inflight IO\n"); @@ -172,39 +175,32 @@ static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_dev *dev, u64 addr) } /* requires lun->lock taken */ -static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *rblk) +static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *new_rblk, + struct rrpc_block **cur_rblk) { struct rrpc *rrpc = rlun->rrpc; - BUG_ON(!rblk); - - if (rlun->cur) { - spin_lock(&rlun->cur->lock); - WARN_ON(!block_is_full(rrpc, rlun->cur)); - spin_unlock(&rlun->cur->lock); + if (*cur_rblk) { + spin_lock(&(*cur_rblk)->lock); + WARN_ON(!block_is_full(rrpc, *cur_rblk)); + spin_unlock(&(*cur_rblk)->lock); } - rlun->cur = rblk; + *cur_rblk = new_rblk; } static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, unsigned long flags) { - struct nvm_lun *lun = rlun->parent; struct nvm_block *blk; struct rrpc_block *rblk; - spin_lock(&lun->lock); - blk = nvm_get_blk_unlocked(rrpc->dev, rlun->parent, flags); + blk = nvm_get_blk(rrpc->dev, rlun->parent, flags); if (!blk) { pr_err("nvm: rrpc: cannot get new block from media manager\n"); - spin_unlock(&lun->lock); return NULL; } rblk = rrpc_get_rblk(rlun, blk->id); - list_add_tail(&rblk->list, &rlun->open_list); - spin_unlock(&lun->lock); - blk->priv = rblk; bitmap_zero(rblk->invalid_pages, rrpc->dev->sec_per_blk); rblk->next_page = 0; @@ -216,13 +212,7 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk) { - struct rrpc_lun *rlun = rblk->rlun; - struct nvm_lun *lun = rlun->parent; - - spin_lock(&lun->lock); - nvm_put_blk_unlocked(rrpc->dev, rblk->parent); - list_del(&rblk->list); - spin_unlock(&lun->lock); + nvm_put_blk(rrpc->dev, rblk->parent); } static void rrpc_put_blks(struct rrpc *rrpc) @@ -508,21 +498,11 @@ static void rrpc_gc_queue(struct work_struct *work) struct rrpc *rrpc = gcb->rrpc; struct rrpc_block *rblk = gcb->rblk; struct rrpc_lun *rlun = rblk->rlun; - struct nvm_lun *lun = rblk->parent->lun; - struct nvm_block *blk = rblk->parent; spin_lock(&rlun->lock); list_add_tail(&rblk->prio, &rlun->prio_list); spin_unlock(&rlun->lock); - spin_lock(&lun->lock); - lun->nr_open_blocks--; - lun->nr_closed_blocks++; - blk->state &= ~NVM_BLK_ST_OPEN; - blk->state |= NVM_BLK_ST_CLOSED; - list_move_tail(&rblk->list, &rlun->closed_list); - spin_unlock(&lun->lock); - mempool_free(gcb, rrpc->gcb_pool); pr_debug("nvm: block '%lu' is full, allow GC (sched)\n", rblk->parent->id); @@ -596,21 +576,20 @@ out: return addr; } -/* Simple round-robin Logical to physical address translation. - * - * Retrieve the mapping using the active append point. Then update the ap for - * the next write to the disk. +/* Map logical address to a physical page. The mapping implements a round robin + * approach and allocates a page from the next lun available. * - * Returns rrpc_addr with the physical address and block. Remember to return to - * rrpc->addr_cache when request is finished. + * Returns rrpc_addr with the physical address and block. Returns NULL if no + * blocks in the next rlun are available. */ static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr, int is_gc) { struct rrpc_lun *rlun; - struct rrpc_block *rblk; + struct rrpc_block *rblk, **cur_rblk; struct nvm_lun *lun; u64 paddr; + int gc_force = 0; rlun = rrpc_get_lun_rr(rrpc, is_gc); lun = rlun->parent; @@ -618,41 +597,65 @@ static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr, if (!is_gc && lun->nr_free_blocks < rrpc->nr_luns * 4) return NULL; - spin_lock(&rlun->lock); + /* + * page allocation steps: + * 1. Try to allocate new page from current rblk + * 2a. If succeed, proceed to map it in and return + * 2b. If fail, first try to allocate a new block from media manger, + * and then retry step 1. Retry until the normal block pool is + * exhausted. + * 3. If exhausted, and garbage collector is requesting the block, + * go to the reserved block and retry step 1. + * In the case that this fails as well, or it is not GC + * requesting, report not able to retrieve a block and let the + * caller handle further processing. + */ + spin_lock(&rlun->lock); + cur_rblk = &rlun->cur; rblk = rlun->cur; retry: paddr = rrpc_alloc_addr(rrpc, rblk); - if (paddr == ADDR_EMPTY) { - rblk = rrpc_get_blk(rrpc, rlun, 0); - if (rblk) { - rrpc_set_lun_cur(rlun, rblk); - goto retry; - } + if (paddr != ADDR_EMPTY) + goto done; - if (is_gc) { - /* retry from emergency gc block */ - paddr = rrpc_alloc_addr(rrpc, rlun->gc_cur); - if (paddr == ADDR_EMPTY) { - rblk = rrpc_get_blk(rrpc, rlun, 1); - if (!rblk) { - pr_err("rrpc: no more blocks"); - goto err; - } - - rlun->gc_cur = rblk; - paddr = rrpc_alloc_addr(rrpc, rlun->gc_cur); - } - rblk = rlun->gc_cur; - } + if (!list_empty(&rlun->wblk_list)) { +new_blk: + rblk = list_first_entry(&rlun->wblk_list, struct rrpc_block, + prio); + rrpc_set_lun_cur(rlun, rblk, cur_rblk); + list_del(&rblk->prio); + goto retry; + } + spin_unlock(&rlun->lock); + + rblk = rrpc_get_blk(rrpc, rlun, gc_force); + if (rblk) { + spin_lock(&rlun->lock); + list_add_tail(&rblk->prio, &rlun->wblk_list); + /* + * another thread might already have added a new block, + * Therefore, make sure that one is used, instead of the + * one just added. + */ + goto new_blk; } + if (unlikely(is_gc) && !gc_force) { + /* retry from emergency gc block */ + cur_rblk = &rlun->gc_cur; + rblk = rlun->gc_cur; + gc_force = 1; + spin_lock(&rlun->lock); + goto retry; + } + + pr_err("rrpc: failed to allocate new block\n"); + return NULL; +done: spin_unlock(&rlun->lock); return rrpc_update_map(rrpc, laddr, rblk, paddr); -err: - spin_unlock(&rlun->lock); - return NULL; } static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk) @@ -1196,8 +1199,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) rlun->rrpc = rrpc; INIT_LIST_HEAD(&rlun->prio_list); - INIT_LIST_HEAD(&rlun->open_list); - INIT_LIST_HEAD(&rlun->closed_list); + INIT_LIST_HEAD(&rlun->wblk_list); INIT_WORK(&rlun->ws_gc, rrpc_lun_gc); spin_lock_init(&rlun->lock); @@ -1338,14 +1340,13 @@ static int rrpc_luns_configure(struct rrpc *rrpc) rblk = rrpc_get_blk(rrpc, rlun, 0); if (!rblk) goto err; - - rrpc_set_lun_cur(rlun, rblk); + rrpc_set_lun_cur(rlun, rblk, &rlun->cur); /* Emergency gc block */ rblk = rrpc_get_blk(rrpc, rlun, 1); if (!rblk) goto err; - rlun->gc_cur = rblk; + rrpc_set_lun_cur(rlun, rblk, &rlun->gc_cur); } return 0; diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index 87e84b5fc1cc..5e87d52cb983 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -56,7 +56,6 @@ struct rrpc_block { struct nvm_block *parent; struct rrpc_lun *rlun; struct list_head prio; - struct list_head list; #define MAX_INVALID_PAGES_STORAGE 8 /* Bitmap for invalid page intries */ @@ -77,13 +76,7 @@ struct rrpc_lun { struct rrpc_block *blocks; /* Reference to block allocation */ struct list_head prio_list; /* Blocks that may be GC'ed */ - struct list_head open_list; /* In-use open blocks. These are blocks - * that can be both written to and read - * from - */ - struct list_head closed_list; /* In-use closed blocks. These are - * blocks that can _only_ be read from - */ + struct list_head wblk_list; /* Queued blocks to be written to */ struct work_struct ws_gc; @@ -188,7 +181,7 @@ static inline int request_intersects(struct rrpc_inflight_rq *r, } static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, - unsigned pages, struct rrpc_inflight_rq *r) + unsigned int pages, struct rrpc_inflight_rq *r) { sector_t laddr_end = laddr + pages - 1; struct rrpc_inflight_rq *rtmp; @@ -213,7 +206,7 @@ static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, } static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, - unsigned pages, + unsigned int pages, struct rrpc_inflight_rq *r) { BUG_ON((laddr + pages) > rrpc->nr_sects); diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c index 994697ac786e..a75bd28aaca3 100644 --- a/drivers/lightnvm/sysblk.c +++ b/drivers/lightnvm/sysblk.c @@ -39,7 +39,8 @@ static inline int scan_ppa_idx(int row, int blkid) return (row * MAX_BLKS_PR_SYSBLK) + blkid; } -void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb) +static void nvm_sysblk_to_cpu(struct nvm_sb_info *info, + struct nvm_system_block *sb) { info->seqnr = be32_to_cpu(sb->seqnr); info->erase_cnt = be32_to_cpu(sb->erase_cnt); @@ -48,7 +49,8 @@ void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb) info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa); } -void nvm_cpu_to_sysblk(struct nvm_system_block *sb, struct nvm_sb_info *info) +static void nvm_cpu_to_sysblk(struct nvm_system_block *sb, + struct nvm_sb_info *info) { sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC); sb->seqnr = cpu_to_be32(info->seqnr); @@ -86,7 +88,7 @@ static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas) return nr_rows; } -void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, +static void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, struct ppa_addr *sysblk_ppas) { memset(s, 0, sizeof(struct sysblk_scan)); diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 9eaf1d6e8302..864e673aec39 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -112,7 +112,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) EXPORT_SYMBOL(closure_wait); /** - * closure_sync - sleep until a closure a closure has nothing left to wait on + * closure_sync - sleep until a closure has nothing left to wait on * * Sleeps until the refcount hits 1 - the thread that's running the closure owns * the last refcount. diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 782cc2c8a185..9b2fe2d3e3a9 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -31,7 +31,8 @@ * passing it, as you might expect, the function to run when nothing is pending * and the workqueue to run that function out of. * - * continue_at() also, critically, is a macro that returns the calling function. + * continue_at() also, critically, requires a 'return' immediately following the + * location where this macro is referenced, to return to the calling function. * There's good reason for this. * * To use safely closures asynchronously, they must always have a refcount while diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d542438b18df..88ef6d14cce3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -134,7 +134,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, case BCACHE_SB_VERSION_CDEV: case BCACHE_SB_VERSION_CDEV_WITH_UUID: sb->nbuckets = le64_to_cpu(s->nbuckets); - sb->block_size = le16_to_cpu(s->block_size); sb->bucket_size = le16_to_cpu(s->bucket_size); sb->nr_in_set = le16_to_cpu(s->nr_in_set); @@ -1806,7 +1805,7 @@ void bch_cache_release(struct kobject *kobj) module_put(THIS_MODULE); } -static int cache_alloc(struct cache_sb *sb, struct cache *ca) +static int cache_alloc(struct cache *ca) { size_t free; struct bucket *b; @@ -1861,7 +1860,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, if (blk_queue_discard(bdev_get_queue(ca->bdev))) ca->discard = CACHE_DISCARD(&ca->sb); - ret = cache_alloc(sb, ca); + ret = cache_alloc(ca); if (ret != 0) goto err; diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig index a39d9431eaec..b7c78a5b1f7a 100644 --- a/drivers/nvme/Kconfig +++ b/drivers/nvme/Kconfig @@ -1 +1,2 @@ source "drivers/nvme/host/Kconfig" +source "drivers/nvme/target/Kconfig" diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile index 9421e829d2a9..0096a7fd1431 100644 --- a/drivers/nvme/Makefile +++ b/drivers/nvme/Makefile @@ -1,2 +1,3 @@ obj-y += host/ +obj-y += target/ diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index d296fc3ae06e..db39d53cdfb9 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -24,3 +24,22 @@ config BLK_DEV_NVME_SCSI to say N here, unless you run a distro that abuses the SCSI emulation to provide stable device names for mount by id, like some OpenSuSE and SLES versions. + +config NVME_FABRICS + tristate + +config NVME_RDMA + tristate "NVM Express over Fabrics RDMA host driver" + depends on INFINIBAND + depends on BLK_DEV_NVME + select NVME_FABRICS + select SG_POOL + help + This provides support for the NVMe over Fabrics protocol using + the RDMA (Infiniband, RoCE, iWarp) transport. This allows you + to use remote block devices exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index 9a3ca892b4a7..47abcec23514 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,8 +1,14 @@ obj-$(CONFIG_NVME_CORE) += nvme-core.o obj-$(CONFIG_BLK_DEV_NVME) += nvme.o +obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o +obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o nvme-core-y := core.o nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-y += pci.o + +nvme-fabrics-y += fabrics.o + +nvme-rdma-y += rdma.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 959173bfe9f0..500b790442c9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -30,6 +30,7 @@ #include <asm/unaligned.h> #include "nvme.h" +#include "fabrics.h" #define NVME_MINORS (1U << MINORBITS) @@ -85,7 +86,9 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (new_state) { case NVME_CTRL_LIVE: switch (old_state) { + case NVME_CTRL_NEW: case NVME_CTRL_RESETTING: + case NVME_CTRL_RECONNECTING: changed = true; /* FALLTHRU */ default: @@ -96,6 +99,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_LIVE: + case NVME_CTRL_RECONNECTING: + changed = true; + /* FALLTHRU */ + default: + break; + } + break; + case NVME_CTRL_RECONNECTING: + switch (old_state) { + case NVME_CTRL_LIVE: changed = true; /* FALLTHRU */ default: @@ -106,6 +119,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: + case NVME_CTRL_RECONNECTING: changed = true; /* FALLTHRU */ default: @@ -191,11 +205,16 @@ void nvme_requeue_req(struct request *req) EXPORT_SYMBOL_GPL(nvme_requeue_req); struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags) + struct nvme_command *cmd, unsigned int flags, int qid) { struct request *req; - req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags); + if (qid == NVME_QID_ANY) { + req = blk_mq_alloc_request(q, nvme_is_write(cmd), flags); + } else { + req = blk_mq_alloc_request_hctx(q, nvme_is_write(cmd), flags, + qid ? qid - 1 : 0); + } if (IS_ERR(req)) return req; @@ -323,12 +342,12 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); */ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, struct nvme_completion *cqe, void *buffer, unsigned bufflen, - unsigned timeout) + unsigned timeout, int qid, int at_head, int flags) { struct request *req; int ret; - req = nvme_alloc_request(q, cmd, 0); + req = nvme_alloc_request(q, cmd, flags, qid); if (IS_ERR(req)) return PTR_ERR(req); @@ -341,17 +360,19 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, goto out; } - blk_execute_rq(req->q, NULL, req, 0); + blk_execute_rq(req->q, NULL, req, at_head); ret = req->errors; out: blk_mq_free_request(req); return ret; } +EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buffer, unsigned bufflen) { - return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0); + return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, + NVME_QID_ANY, 0, 0); } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); @@ -369,7 +390,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void *meta = NULL; int ret; - req = nvme_alloc_request(q, cmd, 0); + req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY); if (IS_ERR(req)) return PTR_ERR(req); @@ -455,6 +476,74 @@ int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, result, timeout); } +static void nvme_keep_alive_end_io(struct request *rq, int error) +{ + struct nvme_ctrl *ctrl = rq->end_io_data; + + blk_mq_free_request(rq); + + if (error) { + dev_err(ctrl->device, + "failed nvme_keep_alive_end_io error=%d\n", error); + return; + } + + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} + +static int nvme_keep_alive(struct nvme_ctrl *ctrl) +{ + struct nvme_command c; + struct request *rq; + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_keep_alive; + + rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED, + NVME_QID_ANY); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + rq->timeout = ctrl->kato * HZ; + rq->end_io_data = ctrl; + + blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); + + return 0; +} + +static void nvme_keep_alive_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_ctrl, ka_work); + + if (nvme_keep_alive(ctrl)) { + /* allocation failure, reset the controller */ + dev_err(ctrl->device, "keep-alive failed\n"); + ctrl->ops->reset_ctrl(ctrl); + return; + } +} + +void nvme_start_keep_alive(struct nvme_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} +EXPORT_SYMBOL_GPL(nvme_start_keep_alive); + +void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) +{ + if (unlikely(ctrl->kato == 0)) + return; + + cancel_delayed_work_sync(&ctrl->ka_work); +} +EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); + int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) { struct nvme_command c = { }; @@ -516,10 +605,11 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; c.features.nsid = cpu_to_le32(nsid); - c.features.prp1 = cpu_to_le64(dma_addr); + c.features.dptr.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); if (ret >= 0) *result = le32_to_cpu(cqe.result); return ret; @@ -534,11 +624,12 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_set_features; - c.features.prp1 = cpu_to_le64(dma_addr); + c.features.dptr.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); if (ret >= 0) *result = le32_to_cpu(cqe.result); return ret; @@ -1168,9 +1259,33 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } nvme_set_queue_limits(ctrl, ctrl->admin_q); + ctrl->sgls = le32_to_cpu(id->sgls); + ctrl->kas = le16_to_cpu(id->kas); + + if (ctrl->ops->is_fabrics) { + ctrl->icdoff = le16_to_cpu(id->icdoff); + ctrl->ioccsz = le32_to_cpu(id->ioccsz); + ctrl->iorcsz = le32_to_cpu(id->iorcsz); + ctrl->maxcmd = le16_to_cpu(id->maxcmd); + + /* + * In fabrics we need to verify the cntlid matches the + * admin connect + */ + if (ctrl->cntlid != le16_to_cpu(id->cntlid)) + ret = -EINVAL; + + if (!ctrl->opts->discovery_nqn && !ctrl->kas) { + dev_err(ctrl->dev, + "keep-alive support is mandatory for fabrics\n"); + ret = -EINVAL; + } + } else { + ctrl->cntlid = le16_to_cpu(id->cntlid); + } kfree(id); - return 0; + return ret; } EXPORT_SYMBOL_GPL(nvme_init_identify); @@ -1352,7 +1467,7 @@ static struct attribute *nvme_ns_attrs[] = { NULL, }; -static umode_t nvme_attrs_are_visible(struct kobject *kobj, +static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); @@ -1371,7 +1486,7 @@ static umode_t nvme_attrs_are_visible(struct kobject *kobj, static const struct attribute_group nvme_ns_attr_group = { .attrs = nvme_ns_attrs, - .is_visible = nvme_attrs_are_visible, + .is_visible = nvme_ns_attrs_are_visible, }; #define nvme_show_str_function(field) \ @@ -1397,6 +1512,49 @@ nvme_show_str_function(serial); nvme_show_str_function(firmware_rev); nvme_show_int_function(cntlid); +static ssize_t nvme_sysfs_delete(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (device_remove_file_self(dev, attr)) + ctrl->ops->delete_ctrl(ctrl); + return count; +} +static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); + +static ssize_t nvme_sysfs_show_transport(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name); +} +static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); + +static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", + ctrl->ops->get_subsysnqn(ctrl)); +} +static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); + +static ssize_t nvme_sysfs_show_address(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); +} +static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -1404,11 +1562,38 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_serial.attr, &dev_attr_firmware_rev.attr, &dev_attr_cntlid.attr, + &dev_attr_delete_controller.attr, + &dev_attr_transport.attr, + &dev_attr_subsysnqn.attr, + &dev_attr_address.attr, NULL }; +#define CHECK_ATTR(ctrl, a, name) \ + if ((a) == &dev_attr_##name.attr && \ + !(ctrl)->ops->get_##name) \ + return 0 + +static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (a == &dev_attr_delete_controller.attr) { + if (!ctrl->ops->delete_ctrl) + return 0; + } + + CHECK_ATTR(ctrl, a, subsysnqn); + CHECK_ATTR(ctrl, a, address); + + return a->mode; +} + static struct attribute_group nvme_dev_attrs_group = { - .attrs = nvme_dev_attrs, + .attrs = nvme_dev_attrs, + .is_visible = nvme_dev_attrs_are_visible, }; static const struct attribute_group *nvme_dev_attr_groups[] = { diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c new file mode 100644 index 000000000000..b86b6379ef0c --- /dev/null +++ b/drivers/nvme/host/fabrics.c @@ -0,0 +1,962 @@ +/* + * NVMe over Fabrics common host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include "nvme.h" +#include "fabrics.h" + +static LIST_HEAD(nvmf_transports); +static DEFINE_MUTEX(nvmf_transports_mutex); + +static LIST_HEAD(nvmf_hosts); +static DEFINE_MUTEX(nvmf_hosts_mutex); + +static struct nvmf_host *nvmf_default_host; + +static struct nvmf_host *__nvmf_host_find(const char *hostnqn) +{ + struct nvmf_host *host; + + list_for_each_entry(host, &nvmf_hosts, list) { + if (!strcmp(host->nqn, hostnqn)) + return host; + } + + return NULL; +} + +static struct nvmf_host *nvmf_host_add(const char *hostnqn) +{ + struct nvmf_host *host; + + mutex_lock(&nvmf_hosts_mutex); + host = __nvmf_host_find(hostnqn); + if (host) + goto out_unlock; + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (!host) + goto out_unlock; + + kref_init(&host->ref); + memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); + uuid_le_gen(&host->id); + + list_add_tail(&host->list, &nvmf_hosts); +out_unlock: + mutex_unlock(&nvmf_hosts_mutex); + return host; +} + +static struct nvmf_host *nvmf_host_default(void) +{ + struct nvmf_host *host; + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return NULL; + + kref_init(&host->ref); + uuid_le_gen(&host->id); + snprintf(host->nqn, NVMF_NQN_SIZE, + "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUl", &host->id); + + mutex_lock(&nvmf_hosts_mutex); + list_add_tail(&host->list, &nvmf_hosts); + mutex_unlock(&nvmf_hosts_mutex); + + return host; +} + +static void nvmf_host_destroy(struct kref *ref) +{ + struct nvmf_host *host = container_of(ref, struct nvmf_host, ref); + + kfree(host); +} + +static void nvmf_host_put(struct nvmf_host *host) +{ + if (host) + kref_put(&host->ref, nvmf_host_destroy); +} + +/** + * nvmf_get_address() - Get address/port + * @ctrl: Host NVMe controller instance which we got the address + * @buf: OUTPUT parameter that will contain the address/port + * @size: buffer size + */ +int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) +{ + return snprintf(buf, size, "traddr=%s,trsvcid=%s\n", + ctrl->opts->traddr, ctrl->opts->trsvcid); +} +EXPORT_SYMBOL_GPL(nvmf_get_address); + +/** + * nvmf_get_subsysnqn() - Get subsystem NQN + * @ctrl: Host NVMe controller instance which we got the NQN + */ +const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->subsysnqn; +} +EXPORT_SYMBOL_GPL(nvmf_get_subsysnqn); + +/** + * nvmf_reg_read32() - NVMe Fabrics "Property Get" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated NVMe controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: OUTPUT parameter that will contain the value of + * the property after a successful read. + * + * Used by the host system to retrieve a 32-bit capsule property value + * from an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful read + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) +{ + struct nvme_command cmd; + struct nvme_completion cqe; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_get.opcode = nvme_fabrics_command; + cmd.prop_get.fctype = nvme_fabrics_type_property_get; + cmd.prop_get.offset = cpu_to_le32(off); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + + if (ret >= 0) + *val = le64_to_cpu(cqe.result64); + if (unlikely(ret != 0)) + dev_err(ctrl->device, + "Property Get error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_read32); + +/** + * nvmf_reg_read64() - NVMe Fabrics "Property Get" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: OUTPUT parameter that will contain the value of + * the property after a successful read. + * + * Used by the host system to retrieve a 64-bit capsule property value + * from an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful read + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + struct nvme_command cmd; + struct nvme_completion cqe; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_get.opcode = nvme_fabrics_command; + cmd.prop_get.fctype = nvme_fabrics_type_property_get; + cmd.prop_get.attrib = 1; + cmd.prop_get.offset = cpu_to_le32(off); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + + if (ret >= 0) + *val = le64_to_cpu(cqe.result64); + if (unlikely(ret != 0)) + dev_err(ctrl->device, + "Property Get error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_read64); + +/** + * nvmf_reg_write32() - NVMe Fabrics "Property Write" API function. + * @ctrl: Host NVMe controller instance maintaining the admin + * queue used to submit the property read command to + * the allocated NVMe controller resource on the target system. + * @off: Starting offset value of the targeted property + * register (see the fabrics section of the NVMe standard). + * @val: Input parameter that contains the value to be + * written to the property. + * + * Used by the NVMe host system to write a 32-bit capsule property value + * to an NVMe controller on the target system. + * + * ("Capsule property" is an "PCIe register concept" applied to the + * NVMe fabrics space.) + * + * Return: + * 0: successful write + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) +{ + struct nvme_command cmd; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.prop_set.opcode = nvme_fabrics_command; + cmd.prop_set.fctype = nvme_fabrics_type_property_set; + cmd.prop_set.attrib = 0; + cmd.prop_set.offset = cpu_to_le32(off); + cmd.prop_set.value = cpu_to_le64(val); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, + NVME_QID_ANY, 0, 0); + if (unlikely(ret)) + dev_err(ctrl->device, + "Property Set error: %d, offset %#x\n", + ret > 0 ? ret & ~NVME_SC_DNR : ret, off); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_reg_write32); + +/** + * nvmf_log_connect_error() - Error-parsing-diagnostic print + * out function for connect() errors. + * + * @ctrl: the specific /dev/nvmeX device that had the error. + * + * @errval: Error code to be decoded in a more human-friendly + * printout. + * + * @offset: For use with the NVMe error code NVME_SC_CONNECT_INVALID_PARAM. + * + * @cmd: This is the SQE portion of a submission capsule. + * + * @data: This is the "Data" portion of a submission capsule. + */ +static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, + int errval, int offset, struct nvme_command *cmd, + struct nvmf_connect_data *data) +{ + int err_sctype = errval & (~NVME_SC_DNR); + + switch (err_sctype) { + + case (NVME_SC_CONNECT_INVALID_PARAM): + if (offset >> 16) { + char *inv_data = "Connect Invalid Data Parameter"; + + switch (offset & 0xffff) { + case (offsetof(struct nvmf_connect_data, cntlid)): + dev_err(ctrl->device, + "%s, cntlid: %d\n", + inv_data, data->cntlid); + break; + case (offsetof(struct nvmf_connect_data, hostnqn)): + dev_err(ctrl->device, + "%s, hostnqn \"%s\"\n", + inv_data, data->hostnqn); + break; + case (offsetof(struct nvmf_connect_data, subsysnqn)): + dev_err(ctrl->device, + "%s, subsysnqn \"%s\"\n", + inv_data, data->subsysnqn); + break; + default: + dev_err(ctrl->device, + "%s, starting byte offset: %d\n", + inv_data, offset & 0xffff); + break; + } + } else { + char *inv_sqe = "Connect Invalid SQE Parameter"; + + switch (offset) { + case (offsetof(struct nvmf_connect_command, qid)): + dev_err(ctrl->device, + "%s, qid %d\n", + inv_sqe, cmd->connect.qid); + break; + default: + dev_err(ctrl->device, + "%s, starting byte offset: %d\n", + inv_sqe, offset); + } + } + break; + default: + dev_err(ctrl->device, + "Connect command failed, error wo/DNR bit: %d\n", + err_sctype); + break; + } /* switch (err_sctype) */ +} + +/** + * nvmf_connect_admin_queue() - NVMe Fabrics Admin Queue "Connect" + * API function. + * @ctrl: Host nvme controller instance used to request + * a new NVMe controller allocation on the target + * system and establish an NVMe Admin connection to + * that controller. + * + * This function enables an NVMe host device to request a new allocation of + * an NVMe controller resource on a target system as well establish a + * fabrics-protocol connection of the NVMe Admin queue between the + * host system device and the allocated NVMe controller on the + * target system via a NVMe Fabrics "Connect" command. + * + * Return: + * 0: success + * > 0: NVMe error status code + * < 0: Linux errno error code + * + */ +int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) +{ + struct nvme_command cmd; + struct nvme_completion cqe; + struct nvmf_connect_data *data; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.connect.opcode = nvme_fabrics_command; + cmd.connect.fctype = nvme_fabrics_type_connect; + cmd.connect.qid = 0; + cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + /* + * Set keep-alive timeout in seconds granularity (ms * 1000) + * and add a grace period for controller kato enforcement + */ + cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 : + cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000); + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le)); + data->cntlid = cpu_to_le16(0xffff); + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, + data, sizeof(*data), 0, NVME_QID_ANY, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result), + &cmd, data); + goto out_free_data; + } + + ctrl->cntlid = le16_to_cpu(cqe.result16); + +out_free_data: + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue); + +/** + * nvmf_connect_io_queue() - NVMe Fabrics I/O Queue "Connect" + * API function. + * @ctrl: Host nvme controller instance used to establish an + * NVMe I/O queue connection to the already allocated NVMe + * controller on the target system. + * @qid: NVMe I/O queue number for the new I/O connection between + * host and target (note qid == 0 is illegal as this is + * the Admin queue, per NVMe standard). + * + * This function issues a fabrics-protocol connection + * of a NVMe I/O queue (via NVMe Fabrics "Connect" command) + * between the host system device and the allocated NVMe controller + * on the target system. + * + * Return: + * 0: success + * > 0: NVMe error status code + * < 0: Linux errno error code + */ +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) +{ + struct nvme_command cmd; + struct nvmf_connect_data *data; + struct nvme_completion cqe; + int ret; + + memset(&cmd, 0, sizeof(cmd)); + cmd.connect.opcode = nvme_fabrics_command; + cmd.connect.fctype = nvme_fabrics_type_connect; + cmd.connect.qid = cpu_to_le16(qid); + cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_le)); + data->cntlid = cpu_to_le16(ctrl->cntlid); + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + + ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &cqe, + data, sizeof(*data), 0, qid, 1, + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + if (ret) { + nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result), + &cmd, data); + } + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); + +/** + * nvmf_register_transport() - NVMe Fabrics Library registration function. + * @ops: Transport ops instance to be registered to the + * common fabrics library. + * + * API function that registers the type of specific transport fabric + * being implemented to the common NVMe fabrics library. Part of + * the overall init sequence of starting up a fabrics driver. + */ +void nvmf_register_transport(struct nvmf_transport_ops *ops) +{ + mutex_lock(&nvmf_transports_mutex); + list_add_tail(&ops->entry, &nvmf_transports); + mutex_unlock(&nvmf_transports_mutex); +} +EXPORT_SYMBOL_GPL(nvmf_register_transport); + +/** + * nvmf_unregister_transport() - NVMe Fabrics Library unregistration function. + * @ops: Transport ops instance to be unregistered from the + * common fabrics library. + * + * Fabrics API function that unregisters the type of specific transport + * fabric being implemented from the common NVMe fabrics library. + * Part of the overall exit sequence of unloading the implemented driver. + */ +void nvmf_unregister_transport(struct nvmf_transport_ops *ops) +{ + mutex_lock(&nvmf_transports_mutex); + list_del(&ops->entry); + mutex_unlock(&nvmf_transports_mutex); +} +EXPORT_SYMBOL_GPL(nvmf_unregister_transport); + +static struct nvmf_transport_ops *nvmf_lookup_transport( + struct nvmf_ctrl_options *opts) +{ + struct nvmf_transport_ops *ops; + + lockdep_assert_held(&nvmf_transports_mutex); + + list_for_each_entry(ops, &nvmf_transports, entry) { + if (strcmp(ops->name, opts->transport) == 0) + return ops; + } + + return NULL; +} + +static const match_table_t opt_tokens = { + { NVMF_OPT_TRANSPORT, "transport=%s" }, + { NVMF_OPT_TRADDR, "traddr=%s" }, + { NVMF_OPT_TRSVCID, "trsvcid=%s" }, + { NVMF_OPT_NQN, "nqn=%s" }, + { NVMF_OPT_QUEUE_SIZE, "queue_size=%d" }, + { NVMF_OPT_NR_IO_QUEUES, "nr_io_queues=%d" }, + { NVMF_OPT_TL_RETRY_COUNT, "tl_retry_count=%d" }, + { NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" }, + { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, + { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, + { NVMF_OPT_ERR, NULL } +}; + +static int nvmf_parse_options(struct nvmf_ctrl_options *opts, + const char *buf) +{ + substring_t args[MAX_OPT_ARGS]; + char *options, *o, *p; + int token, ret = 0; + size_t nqnlen = 0; + + /* Set defaults */ + opts->queue_size = NVMF_DEF_QUEUE_SIZE; + opts->nr_io_queues = num_online_cpus(); + opts->tl_retry_count = 2; + opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; + + options = o = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + while ((p = strsep(&o, ",\n")) != NULL) { + if (!*p) + continue; + + token = match_token(p, opt_tokens, args); + opts->mask |= token; + switch (token) { + case NVMF_OPT_TRANSPORT: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->transport = p; + break; + case NVMF_OPT_NQN: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->subsysnqn = p; + nqnlen = strlen(opts->subsysnqn); + if (nqnlen >= NVMF_NQN_SIZE) { + pr_err("%s needs to be < %d bytes\n", + opts->subsysnqn, NVMF_NQN_SIZE); + ret = -EINVAL; + goto out; + } + opts->discovery_nqn = + !(strcmp(opts->subsysnqn, + NVME_DISC_SUBSYS_NAME)); + if (opts->discovery_nqn) + opts->nr_io_queues = 0; + break; + case NVMF_OPT_TRADDR: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->traddr = p; + break; + case NVMF_OPT_TRSVCID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->trsvcid = p; + break; + case NVMF_OPT_QUEUE_SIZE: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token < NVMF_MIN_QUEUE_SIZE || + token > NVMF_MAX_QUEUE_SIZE) { + pr_err("Invalid queue_size %d\n", token); + ret = -EINVAL; + goto out; + } + opts->queue_size = token; + break; + case NVMF_OPT_NR_IO_QUEUES: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid number of IOQs %d\n", token); + ret = -EINVAL; + goto out; + } + opts->nr_io_queues = min_t(unsigned int, + num_online_cpus(), token); + break; + case NVMF_OPT_TL_RETRY_COUNT: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token < 0) { + pr_err("Invalid tl_retry_count %d\n", token); + ret = -EINVAL; + goto out; + } + opts->tl_retry_count = token; + break; + case NVMF_OPT_KATO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (opts->discovery_nqn) { + pr_err("Discovery controllers cannot accept keep_alive_tmo != 0\n"); + ret = -EINVAL; + goto out; + } + + if (token < 0) { + pr_err("Invalid keep_alive_tmo %d\n", token); + ret = -EINVAL; + goto out; + } else if (token == 0) { + /* Allowed for debug */ + pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n"); + } + opts->kato = token; + break; + case NVMF_OPT_HOSTNQN: + if (opts->host) { + pr_err("hostnqn already user-assigned: %s\n", + opts->host->nqn); + ret = -EADDRINUSE; + goto out; + } + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + nqnlen = strlen(p); + if (nqnlen >= NVMF_NQN_SIZE) { + pr_err("%s needs to be < %d bytes\n", + p, NVMF_NQN_SIZE); + ret = -EINVAL; + goto out; + } + opts->host = nvmf_host_add(p); + if (!opts->host) { + ret = -ENOMEM; + goto out; + } + break; + case NVMF_OPT_RECONNECT_DELAY: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid reconnect_delay %d\n", token); + ret = -EINVAL; + goto out; + } + opts->reconnect_delay = token; + break; + default: + pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", + p); + ret = -EINVAL; + goto out; + } + } + + if (!opts->host) { + kref_get(&nvmf_default_host->ref); + opts->host = nvmf_default_host; + } + +out: + if (!opts->discovery_nqn && !opts->kato) + opts->kato = NVME_DEFAULT_KATO; + kfree(options); + return ret; +} + +static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts, + unsigned int required_opts) +{ + if ((opts->mask & required_opts) != required_opts) { + int i; + + for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { + if ((opt_tokens[i].token & required_opts) && + !(opt_tokens[i].token & opts->mask)) { + pr_warn("missing parameter '%s'\n", + opt_tokens[i].pattern); + } + } + + return -EINVAL; + } + + return 0; +} + +static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts, + unsigned int allowed_opts) +{ + if (opts->mask & ~allowed_opts) { + int i; + + for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { + if (opt_tokens[i].token & ~allowed_opts) { + pr_warn("invalid parameter '%s'\n", + opt_tokens[i].pattern); + } + } + + return -EINVAL; + } + + return 0; +} + +void nvmf_free_options(struct nvmf_ctrl_options *opts) +{ + nvmf_host_put(opts->host); + kfree(opts->transport); + kfree(opts->traddr); + kfree(opts->trsvcid); + kfree(opts->subsysnqn); + kfree(opts); +} +EXPORT_SYMBOL_GPL(nvmf_free_options); + +#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) +#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ + NVMF_OPT_KATO | NVMF_OPT_HOSTNQN) + +static struct nvme_ctrl * +nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) +{ + struct nvmf_ctrl_options *opts; + struct nvmf_transport_ops *ops; + struct nvme_ctrl *ctrl; + int ret; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return ERR_PTR(-ENOMEM); + + ret = nvmf_parse_options(opts, buf); + if (ret) + goto out_free_opts; + + /* + * Check the generic options first as we need a valid transport for + * the lookup below. Then clear the generic flags so that transport + * drivers don't have to care about them. + */ + ret = nvmf_check_required_opts(opts, NVMF_REQUIRED_OPTS); + if (ret) + goto out_free_opts; + opts->mask &= ~NVMF_REQUIRED_OPTS; + + mutex_lock(&nvmf_transports_mutex); + ops = nvmf_lookup_transport(opts); + if (!ops) { + pr_info("no handler found for transport %s.\n", + opts->transport); + ret = -EINVAL; + goto out_unlock; + } + + ret = nvmf_check_required_opts(opts, ops->required_opts); + if (ret) + goto out_unlock; + ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS | + ops->allowed_opts | ops->required_opts); + if (ret) + goto out_unlock; + + ctrl = ops->create_ctrl(dev, opts); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + goto out_unlock; + } + + mutex_unlock(&nvmf_transports_mutex); + return ctrl; + +out_unlock: + mutex_unlock(&nvmf_transports_mutex); +out_free_opts: + nvmf_host_put(opts->host); + kfree(opts); + return ERR_PTR(ret); +} + +static struct class *nvmf_class; +static struct device *nvmf_device; +static DEFINE_MUTEX(nvmf_dev_mutex); + +static ssize_t nvmf_dev_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *pos) +{ + struct seq_file *seq_file = file->private_data; + struct nvme_ctrl *ctrl; + const char *buf; + int ret = 0; + + if (count > PAGE_SIZE) + return -ENOMEM; + + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + mutex_lock(&nvmf_dev_mutex); + if (seq_file->private) { + ret = -EINVAL; + goto out_unlock; + } + + ctrl = nvmf_create_ctrl(nvmf_device, buf, count); + if (IS_ERR(ctrl)) { + ret = PTR_ERR(ctrl); + goto out_unlock; + } + + seq_file->private = ctrl; + +out_unlock: + mutex_unlock(&nvmf_dev_mutex); + kfree(buf); + return ret ? ret : count; +} + +static int nvmf_dev_show(struct seq_file *seq_file, void *private) +{ + struct nvme_ctrl *ctrl; + int ret = 0; + + mutex_lock(&nvmf_dev_mutex); + ctrl = seq_file->private; + if (!ctrl) { + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(seq_file, "instance=%d,cntlid=%d\n", + ctrl->instance, ctrl->cntlid); + +out_unlock: + mutex_unlock(&nvmf_dev_mutex); + return ret; +} + +static int nvmf_dev_open(struct inode *inode, struct file *file) +{ + /* + * The miscdevice code initializes file->private_data, but doesn't + * make use of it later. + */ + file->private_data = NULL; + return single_open(file, nvmf_dev_show, NULL); +} + +static int nvmf_dev_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq_file = file->private_data; + struct nvme_ctrl *ctrl = seq_file->private; + + if (ctrl) + nvme_put_ctrl(ctrl); + return single_release(inode, file); +} + +static const struct file_operations nvmf_dev_fops = { + .owner = THIS_MODULE, + .write = nvmf_dev_write, + .read = seq_read, + .open = nvmf_dev_open, + .release = nvmf_dev_release, +}; + +static struct miscdevice nvmf_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "nvme-fabrics", + .fops = &nvmf_dev_fops, +}; + +static int __init nvmf_init(void) +{ + int ret; + + nvmf_default_host = nvmf_host_default(); + if (!nvmf_default_host) + return -ENOMEM; + + nvmf_class = class_create(THIS_MODULE, "nvme-fabrics"); + if (IS_ERR(nvmf_class)) { + pr_err("couldn't register class nvme-fabrics\n"); + ret = PTR_ERR(nvmf_class); + goto out_free_host; + } + + nvmf_device = + device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl"); + if (IS_ERR(nvmf_device)) { + pr_err("couldn't create nvme-fabris device!\n"); + ret = PTR_ERR(nvmf_device); + goto out_destroy_class; + } + + ret = misc_register(&nvmf_misc); + if (ret) { + pr_err("couldn't register misc device: %d\n", ret); + goto out_destroy_device; + } + + return 0; + +out_destroy_device: + device_destroy(nvmf_class, MKDEV(0, 0)); +out_destroy_class: + class_destroy(nvmf_class); +out_free_host: + nvmf_host_put(nvmf_default_host); + return ret; +} + +static void __exit nvmf_exit(void) +{ + misc_deregister(&nvmf_misc); + device_destroy(nvmf_class, MKDEV(0, 0)); + class_destroy(nvmf_class); + nvmf_host_put(nvmf_default_host); + + BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024); +} + +MODULE_LICENSE("GPL v2"); + +module_init(nvmf_init); +module_exit(nvmf_exit); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h new file mode 100644 index 000000000000..b54067404963 --- /dev/null +++ b/drivers/nvme/host/fabrics.h @@ -0,0 +1,135 @@ +/* + * NVMe over Fabrics common host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#ifndef _NVME_FABRICS_H +#define _NVME_FABRICS_H 1 + +#include <linux/in.h> +#include <linux/inet.h> + +#define NVMF_MIN_QUEUE_SIZE 16 +#define NVMF_MAX_QUEUE_SIZE 1024 +#define NVMF_DEF_QUEUE_SIZE 128 +#define NVMF_DEF_RECONNECT_DELAY 10 + +/* + * Define a host as seen by the target. We allocate one at boot, but also + * allow the override it when creating controllers. This is both to provide + * persistence of the Host NQN over multiple boots, and to allow using + * multiple ones, for example in a container scenario. Because we must not + * use different Host NQNs with the same Host ID we generate a Host ID and + * use this structure to keep track of the relation between the two. + */ +struct nvmf_host { + struct kref ref; + struct list_head list; + char nqn[NVMF_NQN_SIZE]; + uuid_le id; +}; + +/** + * enum nvmf_parsing_opts - used to define the sysfs parsing options used. + */ +enum { + NVMF_OPT_ERR = 0, + NVMF_OPT_TRANSPORT = 1 << 0, + NVMF_OPT_NQN = 1 << 1, + NVMF_OPT_TRADDR = 1 << 2, + NVMF_OPT_TRSVCID = 1 << 3, + NVMF_OPT_QUEUE_SIZE = 1 << 4, + NVMF_OPT_NR_IO_QUEUES = 1 << 5, + NVMF_OPT_TL_RETRY_COUNT = 1 << 6, + NVMF_OPT_KATO = 1 << 7, + NVMF_OPT_HOSTNQN = 1 << 8, + NVMF_OPT_RECONNECT_DELAY = 1 << 9, +}; + +/** + * struct nvmf_ctrl_options - Used to hold the options specified + * with the parsing opts enum. + * @mask: Used by the fabrics library to parse through sysfs options + * on adding a NVMe controller. + * @transport: Holds the fabric transport "technology name" (for a lack of + * better description) that will be used by an NVMe controller + * being added. + * @subsysnqn: Hold the fully qualified NQN subystem name (format defined + * in the NVMe specification, "NVMe Qualified Names"). + * @traddr: network address that will be used by the host to communicate + * to the added NVMe controller. + * @trsvcid: network port used for host-controller communication. + * @queue_size: Number of IO queue elements. + * @nr_io_queues: Number of controller IO queues that will be established. + * @tl_retry_count: Number of transport layer retries for a fabric queue before + * kicking upper layer(s) error recovery. + * @reconnect_delay: Time between two consecutive reconnect attempts. + * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. + * @kato: Keep-alive timeout. + * @host: Virtual NVMe host, contains the NQN and Host ID. + */ +struct nvmf_ctrl_options { + unsigned mask; + char *transport; + char *subsysnqn; + char *traddr; + char *trsvcid; + size_t queue_size; + unsigned int nr_io_queues; + unsigned short tl_retry_count; + unsigned int reconnect_delay; + bool discovery_nqn; + unsigned int kato; + struct nvmf_host *host; +}; + +/* + * struct nvmf_transport_ops - used to register a specific + * fabric implementation of NVMe fabrics. + * @entry: Used by the fabrics library to add the new + * registration entry to its linked-list internal tree. + * @name: Name of the NVMe fabric driver implementation. + * @required_opts: sysfs command-line options that must be specified + * when adding a new NVMe controller. + * @allowed_opts: sysfs command-line options that can be specified + * when adding a new NVMe controller. + * @create_ctrl(): function pointer that points to a non-NVMe + * implementation-specific fabric technology + * that would go into starting up that fabric + * for the purpose of conneciton to an NVMe controller + * using that fabric technology. + * + * Notes: + * 1. At minimum, 'required_opts' and 'allowed_opts' should + * be set to the same enum parsing options defined earlier. + * 2. create_ctrl() must be defined (even if it does nothing) + */ +struct nvmf_transport_ops { + struct list_head entry; + const char *name; + int required_opts; + int allowed_opts; + struct nvme_ctrl *(*create_ctrl)(struct device *dev, + struct nvmf_ctrl_options *opts); +}; + +int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); +int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); +int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); +int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); +int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); +void nvmf_register_transport(struct nvmf_transport_ops *ops); +void nvmf_unregister_transport(struct nvmf_transport_ops *ops); +void nvmf_free_options(struct nvmf_ctrl_options *opts); +const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl); +int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); + +#endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index a0af0558354c..97fe6109c98f 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -156,7 +156,7 @@ struct nvme_nvm_completion { #define NVME_NVM_LP_MLC_PAIRS 886 struct nvme_nvm_lp_mlc { - __u16 num_pairs; + __le16 num_pairs; __u8 pairs[NVME_NVM_LP_MLC_PAIRS]; }; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 282421fec27c..abe83b43a71a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -38,6 +38,9 @@ extern unsigned char admin_timeout; extern unsigned char shutdown_timeout; #define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) +#define NVME_DEFAULT_KATO 5 +#define NVME_KATO_GRACE 10 + enum { NVME_NS_LBA = 0, NVME_NS_LIGHTNVM = 1, @@ -71,6 +74,7 @@ enum nvme_ctrl_state { NVME_CTRL_NEW, NVME_CTRL_LIVE, NVME_CTRL_RESETTING, + NVME_CTRL_RECONNECTING, NVME_CTRL_DELETING, NVME_CTRL_DEAD, }; @@ -80,6 +84,7 @@ struct nvme_ctrl { spinlock_t lock; const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; + struct request_queue *connect_q; struct device *dev; struct kref kref; int instance; @@ -107,10 +112,22 @@ struct nvme_ctrl { u8 event_limit; u8 vwc; u32 vs; + u32 sgls; + u16 kas; + unsigned int kato; bool subsystem; unsigned long quirks; struct work_struct scan_work; struct work_struct async_event_work; + struct delayed_work ka_work; + + /* Fabrics only */ + u16 sqsize; + u32 ioccsz; + u32 iorcsz; + u16 icdoff; + u16 maxcmd; + struct nvmf_ctrl_options *opts; }; /* @@ -144,7 +161,9 @@ struct nvme_ns { }; struct nvme_ctrl_ops { + const char *name; struct module *module; + bool is_fabrics; int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); @@ -152,6 +171,9 @@ struct nvme_ctrl_ops { void (*free_ctrl)(struct nvme_ctrl *ctrl); void (*post_scan)(struct nvme_ctrl *ctrl); void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); + int (*delete_ctrl)(struct nvme_ctrl *ctrl); + const char *(*get_subsysnqn)(struct nvme_ctrl *ctrl); + int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); }; static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) @@ -231,8 +253,9 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); void nvme_kill_queues(struct nvme_ctrl *ctrl); +#define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags); + struct nvme_command *cmd, unsigned int flags, int qid); void nvme_requeue_req(struct request *req); int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); @@ -240,7 +263,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, struct nvme_completion *cqe, void *buffer, unsigned bufflen, - unsigned timeout); + unsigned timeout, int qid, int at_head, int flags); int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, u32 *result, unsigned timeout); @@ -257,6 +280,8 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); +void nvme_start_keep_alive(struct nvme_ctrl *ctrl); +void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); struct sg_io_hdr; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index dc39924362a3..79a4f56c06cd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -520,8 +520,8 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, goto out_unmap; } - cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); + cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma); if (blk_integrity_rq(req)) cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); return BLK_MQ_RQ_QUEUE_OK; @@ -901,7 +901,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) req->tag, nvmeq->qid); abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, - BLK_MQ_REQ_NOWAIT); + BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); if (IS_ERR(abort_req)) { atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; @@ -1512,7 +1512,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) cmd.delete_queue.opcode = opcode; cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); - req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); if (IS_ERR(req)) return PTR_ERR(req); @@ -1873,6 +1873,7 @@ static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) } static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .name = "pcie", .module = THIS_MODULE, .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c new file mode 100644 index 000000000000..278551bcd5c7 --- /dev/null +++ b/drivers/nvme/host/rdma.c @@ -0,0 +1,2021 @@ +/* + * NVMe over Fabrics RDMA host code. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/jiffies.h> +#include <linux/atomic.h> +#include <linux/blk-mq.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/scatterlist.h> +#include <linux/nvme.h> +#include <linux/t10-pi.h> +#include <asm/unaligned.h> + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <rdma/ib_cm.h> +#include <linux/nvme-rdma.h> + +#include "nvme.h" +#include "fabrics.h" + + +#define NVME_RDMA_CONNECT_TIMEOUT_MS 1000 /* 1 second */ + +#define NVME_RDMA_MAX_SEGMENT_SIZE 0xffffff /* 24-bit SGL field */ + +#define NVME_RDMA_MAX_SEGMENTS 256 + +#define NVME_RDMA_MAX_INLINE_SEGMENTS 1 + +#define NVME_RDMA_MAX_PAGES_PER_MR 512 + +#define NVME_RDMA_DEF_RECONNECT_DELAY 20 + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_RDMA_NR_AEN_COMMANDS 1 +#define NVME_RDMA_AQ_BLKMQ_DEPTH \ + (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) + +struct nvme_rdma_device { + struct ib_device *dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct kref ref; + struct list_head entry; +}; + +struct nvme_rdma_qe { + struct ib_cqe cqe; + void *data; + u64 dma; +}; + +struct nvme_rdma_queue; +struct nvme_rdma_request { + struct ib_mr *mr; + struct nvme_rdma_qe sqe; + struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS]; + u32 num_sge; + int nents; + bool inline_data; + bool need_inval; + struct ib_reg_wr reg_wr; + struct ib_cqe reg_cqe; + struct nvme_rdma_queue *queue; + struct sg_table sg_table; + struct scatterlist first_sgl[]; +}; + +enum nvme_rdma_queue_flags { + NVME_RDMA_Q_CONNECTED = (1 << 0), +}; + +struct nvme_rdma_queue { + struct nvme_rdma_qe *rsp_ring; + u8 sig_count; + int queue_size; + size_t cmnd_capsule_len; + struct nvme_rdma_ctrl *ctrl; + struct nvme_rdma_device *device; + struct ib_cq *ib_cq; + struct ib_qp *qp; + + unsigned long flags; + struct rdma_cm_id *cm_id; + int cm_error; + struct completion cm_done; +}; + +struct nvme_rdma_ctrl { + /* read and written in the hot path */ + spinlock_t lock; + + /* read only in the hot path */ + struct nvme_rdma_queue *queues; + u32 queue_count; + + /* other member variables */ + unsigned short tl_retry_count; + struct blk_mq_tag_set tag_set; + struct work_struct delete_work; + struct work_struct reset_work; + struct work_struct err_work; + + struct nvme_rdma_qe async_event_sqe; + + int reconnect_delay; + struct delayed_work reconnect_work; + + struct list_head list; + + struct blk_mq_tag_set admin_tag_set; + struct nvme_rdma_device *device; + + u64 cap; + u32 max_fr_pages; + + union { + struct sockaddr addr; + struct sockaddr_in addr_in; + }; + + struct nvme_ctrl ctrl; +}; + +static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_rdma_ctrl, ctrl); +} + +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_list_mutex); + +static LIST_HEAD(nvme_rdma_ctrl_list); +static DEFINE_MUTEX(nvme_rdma_ctrl_mutex); + +static struct workqueue_struct *nvme_rdma_wq; + +/* + * Disabling this option makes small I/O goes faster, but is fundamentally + * unsafe. With it turned off we will have to register a global rkey that + * allows read and write access to all physical memory. + */ +static bool register_always = true; +module_param(register_always, bool, 0444); +MODULE_PARM_DESC(register_always, + "Use memory registration even for contiguous memory regions"); + +static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl); + +/* XXX: really should move to a generic header sooner or later.. */ +static inline void put_unaligned_le24(u32 val, u8 *p) +{ + *p++ = val; + *p++ = val >> 8; + *p++ = val >> 16; +} + +static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) +{ + return queue->cmnd_capsule_len - sizeof(struct nvme_command); +} + +static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, + size_t capsule_size, enum dma_data_direction dir) +{ + ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir); + kfree(qe->data); +} + +static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, + size_t capsule_size, enum dma_data_direction dir) +{ + qe->data = kzalloc(capsule_size, GFP_KERNEL); + if (!qe->data) + return -ENOMEM; + + qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir); + if (ib_dma_mapping_error(ibdev, qe->dma)) { + kfree(qe->data); + return -ENOMEM; + } + + return 0; +} + +static void nvme_rdma_free_ring(struct ib_device *ibdev, + struct nvme_rdma_qe *ring, size_t ib_queue_size, + size_t capsule_size, enum dma_data_direction dir) +{ + int i; + + for (i = 0; i < ib_queue_size; i++) + nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir); + kfree(ring); +} + +static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev, + size_t ib_queue_size, size_t capsule_size, + enum dma_data_direction dir) +{ + struct nvme_rdma_qe *ring; + int i; + + ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL); + if (!ring) + return NULL; + + for (i = 0; i < ib_queue_size; i++) { + if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir)) + goto out_free_ring; + } + + return ring; + +out_free_ring: + nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir); + return NULL; +} + +static void nvme_rdma_qp_event(struct ib_event *event, void *context) +{ + pr_debug("QP event %d\n", event->event); +} + +static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue) +{ + wait_for_completion_interruptible_timeout(&queue->cm_done, + msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1); + return queue->cm_error; +} + +static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) +{ + struct nvme_rdma_device *dev = queue->device; + struct ib_qp_init_attr init_attr; + int ret; + + memset(&init_attr, 0, sizeof(init_attr)); + init_attr.event_handler = nvme_rdma_qp_event; + /* +1 for drain */ + init_attr.cap.max_send_wr = factor * queue->queue_size + 1; + /* +1 for drain */ + init_attr.cap.max_recv_wr = queue->queue_size + 1; + init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_RC; + init_attr.send_cq = queue->ib_cq; + init_attr.recv_cq = queue->ib_cq; + + ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr); + + queue->qp = queue->cm_id->qp; + return ret; +} + +static int nvme_rdma_reinit_request(void *data, struct request *rq) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_device *dev = ctrl->device; + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int ret = 0; + + if (!req->need_inval) + goto out; + + ib_dereg_mr(req->mr); + + req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, + ctrl->max_fr_pages); + if (IS_ERR(req->mr)) { + req->mr = NULL; + ret = PTR_ERR(req->mr); + } + + req->need_inval = false; + +out: + return ret; +} + +static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl, + struct request *rq, unsigned int queue_idx) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; + struct nvme_rdma_device *dev = queue->device; + + if (req->mr) + ib_dereg_mr(req->mr); + + nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); +} + +static void nvme_rdma_exit_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx) +{ + return __nvme_rdma_exit_request(data, rq, hctx_idx + 1); +} + +static void nvme_rdma_exit_admin_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx) +{ + return __nvme_rdma_exit_request(data, rq, 0); +} + +static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl, + struct request *rq, unsigned int queue_idx) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int ret; + + BUG_ON(queue_idx >= ctrl->queue_count); + + ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + if (ret) + return ret; + + req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, + ctrl->max_fr_pages); + if (IS_ERR(req->mr)) { + ret = PTR_ERR(req->mr); + goto out_free_qe; + } + + req->queue = queue; + + return 0; + +out_free_qe: + nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + return -ENOMEM; +} + +static int nvme_rdma_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return __nvme_rdma_init_request(data, rq, hctx_idx + 1); +} + +static int nvme_rdma_init_admin_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return __nvme_rdma_init_request(data, rq, 0); +} + +static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; + + BUG_ON(hctx_idx >= ctrl->queue_count); + + hctx->driver_data = queue; + return 0; +} + +static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_rdma_ctrl *ctrl = data; + struct nvme_rdma_queue *queue = &ctrl->queues[0]; + + BUG_ON(hctx_idx != 0); + + hctx->driver_data = queue; + return 0; +} + +static void nvme_rdma_free_dev(struct kref *ref) +{ + struct nvme_rdma_device *ndev = + container_of(ref, struct nvme_rdma_device, ref); + + mutex_lock(&device_list_mutex); + list_del(&ndev->entry); + mutex_unlock(&device_list_mutex); + + if (!register_always) + ib_dereg_mr(ndev->mr); + ib_dealloc_pd(ndev->pd); + + kfree(ndev); +} + +static void nvme_rdma_dev_put(struct nvme_rdma_device *dev) +{ + kref_put(&dev->ref, nvme_rdma_free_dev); +} + +static int nvme_rdma_dev_get(struct nvme_rdma_device *dev) +{ + return kref_get_unless_zero(&dev->ref); +} + +static struct nvme_rdma_device * +nvme_rdma_find_get_device(struct rdma_cm_id *cm_id) +{ + struct nvme_rdma_device *ndev; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->dev->node_guid == cm_id->device->node_guid && + nvme_rdma_dev_get(ndev)) + goto out_unlock; + } + + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); + if (!ndev) + goto out_err; + + ndev->dev = cm_id->device; + kref_init(&ndev->ref); + + ndev->pd = ib_alloc_pd(ndev->dev); + if (IS_ERR(ndev->pd)) + goto out_free_dev; + + if (!register_always) { + ndev->mr = ib_get_dma_mr(ndev->pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + if (IS_ERR(ndev->mr)) + goto out_free_pd; + } + + if (!(ndev->dev->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS)) { + dev_err(&ndev->dev->dev, + "Memory registrations not supported.\n"); + goto out_free_mr; + } + + list_add(&ndev->entry, &device_list); +out_unlock: + mutex_unlock(&device_list_mutex); + return ndev; + +out_free_mr: + if (!register_always) + ib_dereg_mr(ndev->mr); +out_free_pd: + ib_dealloc_pd(ndev->pd); +out_free_dev: + kfree(ndev); +out_err: + mutex_unlock(&device_list_mutex); + return NULL; +} + +static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + + rdma_destroy_qp(queue->cm_id); + ib_free_cq(queue->ib_cq); + + nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); + + nvme_rdma_dev_put(dev); +} + +static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, + struct nvme_rdma_device *dev) +{ + struct ib_device *ibdev = dev->dev; + const int send_wr_factor = 3; /* MR, SEND, INV */ + const int cq_factor = send_wr_factor + 1; /* + RECV */ + int comp_vector, idx = nvme_rdma_queue_idx(queue); + + int ret; + + queue->device = dev; + + /* + * The admin queue is barely used once the controller is live, so don't + * bother to spread it out. + */ + if (idx == 0) + comp_vector = 0; + else + comp_vector = idx % ibdev->num_comp_vectors; + + + /* +1 for ib_stop_cq */ + queue->ib_cq = ib_alloc_cq(dev->dev, queue, + cq_factor * queue->queue_size + 1, comp_vector, + IB_POLL_SOFTIRQ); + if (IS_ERR(queue->ib_cq)) { + ret = PTR_ERR(queue->ib_cq); + goto out; + } + + ret = nvme_rdma_create_qp(queue, send_wr_factor); + if (ret) + goto out_destroy_ib_cq; + + queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size, + sizeof(struct nvme_completion), DMA_FROM_DEVICE); + if (!queue->rsp_ring) { + ret = -ENOMEM; + goto out_destroy_qp; + } + + return 0; + +out_destroy_qp: + ib_destroy_qp(queue->qp); +out_destroy_ib_cq: + ib_free_cq(queue->ib_cq); +out: + return ret; +} + +static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, + int idx, size_t queue_size) +{ + struct nvme_rdma_queue *queue; + int ret; + + queue = &ctrl->queues[idx]; + queue->ctrl = ctrl; + init_completion(&queue->cm_done); + + if (idx > 0) + queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + else + queue->cmnd_capsule_len = sizeof(struct nvme_command); + + queue->queue_size = queue_size; + + queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(queue->cm_id)) { + dev_info(ctrl->ctrl.device, + "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id)); + return PTR_ERR(queue->cm_id); + } + + queue->cm_error = -ETIMEDOUT; + ret = rdma_resolve_addr(queue->cm_id, NULL, &ctrl->addr, + NVME_RDMA_CONNECT_TIMEOUT_MS); + if (ret) { + dev_info(ctrl->ctrl.device, + "rdma_resolve_addr failed (%d).\n", ret); + goto out_destroy_cm_id; + } + + ret = nvme_rdma_wait_for_cm(queue); + if (ret) { + dev_info(ctrl->ctrl.device, + "rdma_resolve_addr wait failed (%d).\n", ret); + goto out_destroy_cm_id; + } + + set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags); + + return 0; + +out_destroy_cm_id: + rdma_destroy_id(queue->cm_id); + return ret; +} + +static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) +{ + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->qp); +} + +static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) +{ + nvme_rdma_destroy_queue_ib(queue); + rdma_destroy_id(queue->cm_id); +} + +static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue) +{ + if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) + return; + nvme_rdma_stop_queue(queue); + nvme_rdma_free_queue(queue); +} + +static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); +} + +static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i, ret = 0; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + break; + } + + return ret; +} + +static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + int i, ret; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvme_rdma_init_queue(ctrl, i, ctrl->ctrl.sqsize); + if (ret) { + dev_info(ctrl->ctrl.device, + "failed to initialize i/o queue: %d\n", ret); + goto out_free_queues; + } + } + + return 0; + +out_free_queues: + for (; i >= 1; i--) + nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); + + return ret; +} + +static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) +{ + nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); + nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); + nvme_rdma_dev_put(ctrl->device); +} + +static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + if (ctrl->ctrl.tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + nvme_rdma_dev_put(ctrl->device); + } + kfree(ctrl->queues); + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl); +} + +static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_rdma_ctrl, reconnect_work); + bool changed; + int ret; + + if (ctrl->queue_count > 1) { + nvme_rdma_free_io_queues(ctrl); + + ret = blk_mq_reinit_tagset(&ctrl->tag_set); + if (ret) + goto requeue; + } + + nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); + + ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set); + if (ret) + goto requeue; + + ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + if (ret) + goto requeue; + + blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true); + + ret = nvmf_connect_admin_queue(&ctrl->ctrl); + if (ret) + goto stop_admin_q; + + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + if (ret) + goto stop_admin_q; + + nvme_start_keep_alive(&ctrl->ctrl); + + if (ctrl->queue_count > 1) { + ret = nvme_rdma_init_io_queues(ctrl); + if (ret) + goto stop_admin_q; + + ret = nvme_rdma_connect_io_queues(ctrl); + if (ret) + goto stop_admin_q; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + if (ctrl->queue_count > 1) + nvme_start_queues(&ctrl->ctrl); + + dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); + + return; + +stop_admin_q: + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); +requeue: + /* Make sure we are not resetting/deleting */ + if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) { + dev_info(ctrl->ctrl.device, + "Failed reconnect attempt, requeueing...\n"); + queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, + ctrl->reconnect_delay * HZ); + } +} + +static void nvme_rdma_error_recovery_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, err_work); + + nvme_stop_keep_alive(&ctrl->ctrl); + if (ctrl->queue_count > 1) + nvme_stop_queues(&ctrl->ctrl); + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + + /* We must take care of fastfail/requeue all our inflight requests */ + if (ctrl->queue_count > 1) + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + + dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n", + ctrl->reconnect_delay); + + queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, + ctrl->reconnect_delay * HZ); +} + +static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) + return; + + queue_work(nvme_rdma_wq, &ctrl->err_work); +} + +static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, + const char *op) +{ + struct nvme_rdma_queue *queue = cq->cq_context; + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + dev_info(ctrl->ctrl.device, + "%s for CQE 0x%p failed with status %s (%d)\n", + op, wc->wr_cqe, + ib_wc_status_msg(wc->status), wc->status); + nvme_rdma_error_recovery(ctrl); +} + +static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "MEMREG"); +} + +static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "LOCAL_INV"); +} + +static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req) +{ + struct ib_send_wr *bad_wr; + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .next = NULL, + .num_sge = 0, + .send_flags = 0, + .ex.invalidate_rkey = req->mr->rkey, + }; + + req->reg_cqe.done = nvme_rdma_inv_rkey_done; + wr.wr_cqe = &req->reg_cqe; + + return ib_post_send(queue->qp, &wr, &bad_wr); +} + +static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, + struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int res; + + if (!blk_rq_bytes(rq)) + return; + + if (req->need_inval) { + res = nvme_rdma_inv_rkey(queue, req); + if (res < 0) { + dev_err(ctrl->ctrl.device, + "Queueing INV WR for rkey %#x failed (%d)\n", + req->mr->rkey, res); + nvme_rdma_error_recovery(queue->ctrl); + } + } + + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, + req->nents, rq_data_dir(rq) == + WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + + nvme_cleanup_cmd(rq); + sg_free_table_chained(&req->sg_table, true); +} + +static int nvme_rdma_set_sg_null(struct nvme_command *c) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + + sg->addr = 0; + put_unaligned_le24(0, sg->length); + put_unaligned_le32(0, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + return 0; +} + +static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + req->sge[1].addr = sg_dma_address(req->sg_table.sgl); + req->sge[1].length = sg_dma_len(req->sg_table.sgl); + req->sge[1].lkey = queue->device->pd->local_dma_lkey; + + sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); + sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); + sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; + + req->inline_data = true; + req->num_sge++; + return 0; +} + +static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + + sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl)); + put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length); + put_unaligned_le32(queue->device->mr->rkey, sg->key); + sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4; + return 0; +} + +static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue, + struct nvme_rdma_request *req, struct nvme_command *c, + int count) +{ + struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl; + int nr; + + nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE); + if (nr < count) { + if (nr < 0) + return nr; + return -EINVAL; + } + + ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); + + req->reg_cqe.done = nvme_rdma_memreg_done; + memset(&req->reg_wr, 0, sizeof(req->reg_wr)); + req->reg_wr.wr.opcode = IB_WR_REG_MR; + req->reg_wr.wr.wr_cqe = &req->reg_cqe; + req->reg_wr.wr.num_sge = 0; + req->reg_wr.mr = req->mr; + req->reg_wr.key = req->mr->rkey; + req->reg_wr.access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE; + + req->need_inval = true; + + sg->addr = cpu_to_le64(req->mr->iova); + put_unaligned_le24(req->mr->length, sg->length); + put_unaligned_le32(req->mr->rkey, sg->key); + sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) | + NVME_SGL_FMT_INVALIDATE; + + return 0; +} + +static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, + struct request *rq, unsigned int map_len, + struct nvme_command *c) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int nents, count; + int ret; + + req->num_sge = 1; + req->inline_data = false; + req->need_inval = false; + + c->common.flags |= NVME_CMD_SGL_METABUF; + + if (!blk_rq_bytes(rq)) + return nvme_rdma_set_sg_null(c); + + req->sg_table.sgl = req->first_sgl; + ret = sg_alloc_table_chained(&req->sg_table, rq->nr_phys_segments, + req->sg_table.sgl); + if (ret) + return -ENOMEM; + + nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); + BUG_ON(nents > rq->nr_phys_segments); + req->nents = nents; + + count = ib_dma_map_sg(ibdev, req->sg_table.sgl, nents, + rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (unlikely(count <= 0)) { + sg_free_table_chained(&req->sg_table, true); + return -EIO; + } + + if (count == 1) { + if (rq_data_dir(rq) == WRITE && + map_len <= nvme_rdma_inline_data_size(queue) && + nvme_rdma_queue_idx(queue)) + return nvme_rdma_map_sg_inline(queue, req, c); + + if (!register_always) + return nvme_rdma_map_sg_single(queue, req, c); + } + + return nvme_rdma_map_sg_fr(queue, req, c, count); +} + +static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + if (unlikely(wc->status != IB_WC_SUCCESS)) + nvme_rdma_wr_error(cq, wc, "SEND"); +} + +static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, + struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, + struct ib_send_wr *first, bool flush) +{ + struct ib_send_wr wr, *bad_wr; + int ret; + + sge->addr = qe->dma; + sge->length = sizeof(struct nvme_command), + sge->lkey = queue->device->pd->local_dma_lkey; + + qe->cqe.done = nvme_rdma_send_done; + + wr.next = NULL; + wr.wr_cqe = &qe->cqe; + wr.sg_list = sge; + wr.num_sge = num_sge; + wr.opcode = IB_WR_SEND; + wr.send_flags = 0; + + /* + * Unsignalled send completions are another giant desaster in the + * IB Verbs spec: If we don't regularly post signalled sends + * the send queue will fill up and only a QP reset will rescue us. + * Would have been way to obvious to handle this in hardware or + * at least the RDMA stack.. + * + * This messy and racy code sniplet is copy and pasted from the iSER + * initiator, and the magic '32' comes from there as well. + * + * Always signal the flushes. The magic request used for the flush + * sequencer is not allocated in our driver's tagset and it's + * triggered to be freed by blk_cleanup_queue(). So we need to + * always mark it as signaled to ensure that the "wr_cqe", which is + * embeded in request's payload, is not freed when __ib_process_cq() + * calls wr_cqe->done(). + */ + if ((++queue->sig_count % 32) == 0 || flush) + wr.send_flags |= IB_SEND_SIGNALED; + + if (first) + first->next = ≀ + else + first = ≀ + + ret = ib_post_send(queue->qp, first, &bad_wr); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "%s failed with error code %d\n", __func__, ret); + } + return ret; +} + +static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue, + struct nvme_rdma_qe *qe) +{ + struct ib_recv_wr wr, *bad_wr; + struct ib_sge list; + int ret; + + list.addr = qe->dma; + list.length = sizeof(struct nvme_completion); + list.lkey = queue->device->pd->local_dma_lkey; + + qe->cqe.done = nvme_rdma_recv_done; + + wr.next = NULL; + wr.wr_cqe = &qe->cqe; + wr.sg_list = &list; + wr.num_sge = 1; + + ret = ib_post_recv(queue->qp, &wr, &bad_wr); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "%s failed with error code %d\n", __func__, ret); + } + return ret; +} + +static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) +{ + u32 queue_idx = nvme_rdma_queue_idx(queue); + + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; +} + +static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); + struct nvme_rdma_queue *queue = &ctrl->queues[0]; + struct ib_device *dev = queue->device->dev; + struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe; + struct nvme_command *cmd = sqe->data; + struct ib_sge sge; + int ret; + + if (WARN_ON_ONCE(aer_idx != 0)) + return; + + ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); + + memset(cmd, 0, sizeof(*cmd)); + cmd->common.opcode = nvme_admin_async_event; + cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH; + cmd->common.flags |= NVME_CMD_SGL_METABUF; + nvme_rdma_set_sg_null(cmd); + + ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd), + DMA_TO_DEVICE); + + ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false); + WARN_ON_ONCE(ret); +} + +static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, + struct nvme_completion *cqe, struct ib_wc *wc, int tag) +{ + u16 status = le16_to_cpu(cqe->status); + struct request *rq; + struct nvme_rdma_request *req; + int ret = 0; + + status >>= 1; + + rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "tag 0x%x on QP %#x not found\n", + cqe->command_id, queue->qp->qp_num); + nvme_rdma_error_recovery(queue->ctrl); + return ret; + } + req = blk_mq_rq_to_pdu(rq); + + if (rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->special) + memcpy(rq->special, cqe, sizeof(*cqe)); + + if (rq->tag == tag) + ret = 1; + + if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) && + wc->ex.invalidate_rkey == req->mr->rkey) + req->need_inval = false; + + blk_mq_complete_request(rq, status); + + return ret; +} + +static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) +{ + struct nvme_rdma_qe *qe = + container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); + struct nvme_rdma_queue *queue = cq->cq_context; + struct ib_device *ibdev = queue->device->dev; + struct nvme_completion *cqe = qe->data; + const size_t len = sizeof(struct nvme_completion); + int ret = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvme_rdma_wr_error(cq, wc, "RECV"); + return 0; + } + + ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvme_rdma_queue_idx(queue) == 0 && + cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH)) + nvme_complete_async_event(&queue->ctrl->ctrl, cqe); + else + ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag); + ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE); + + nvme_rdma_post_recv(queue, qe); + return ret; +} + +static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + __nvme_rdma_recv_done(cq, wc, -1); +} + +static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue) +{ + int ret, i; + + for (i = 0; i < queue->queue_size; i++) { + ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]); + if (ret) + goto out_destroy_queue_ib; + } + + return 0; + +out_destroy_queue_ib: + nvme_rdma_destroy_queue_ib(queue); + return ret; +} + +static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, + struct rdma_cm_event *ev) +{ + if (ev->param.conn.private_data_len) { + struct nvme_rdma_cm_rej *rej = + (struct nvme_rdma_cm_rej *)ev->param.conn.private_data; + + dev_err(queue->ctrl->ctrl.device, + "Connect rejected, status %d.", le16_to_cpu(rej->sts)); + /* XXX: Think of something clever to do here... */ + } else { + dev_err(queue->ctrl->ctrl.device, + "Connect rejected, no private data.\n"); + } + + return -ECONNRESET; +} + +static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_device *dev; + int ret; + + dev = nvme_rdma_find_get_device(queue->cm_id); + if (!dev) { + dev_err(queue->cm_id->device->dma_device, + "no client data found!\n"); + return -ECONNREFUSED; + } + + ret = nvme_rdma_create_queue_ib(queue, dev); + if (ret) { + nvme_rdma_dev_put(dev); + goto out; + } + + ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); + if (ret) { + dev_err(queue->ctrl->ctrl.device, + "rdma_resolve_route failed (%d).\n", + queue->cm_error); + goto out_destroy_queue; + } + + return 0; + +out_destroy_queue: + nvme_rdma_destroy_queue_ib(queue); +out: + return ret; +} + +static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + struct rdma_conn_param param = { }; + struct nvme_rdma_cm_req priv; + int ret; + + param.qp_num = queue->qp->qp_num; + param.flow_control = 1; + + param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom; + /* rdma_cm will clamp down to max QP retry count (7) */ + param.retry_count = ctrl->tl_retry_count; + param.rnr_retry_count = 7; + param.private_data = &priv; + param.private_data_len = sizeof(priv); + + priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue)); + priv.hrqsize = cpu_to_le16(queue->queue_size); + priv.hsqsize = cpu_to_le16(queue->queue_size); + + ret = rdma_connect(queue->cm_id, ¶m); + if (ret) { + dev_err(ctrl->ctrl.device, + "rdma_connect failed (%d).\n", ret); + goto out_destroy_queue_ib; + } + + return 0; + +out_destroy_queue_ib: + nvme_rdma_destroy_queue_ib(queue); + return ret; +} + +/** + * nvme_rdma_device_unplug() - Handle RDMA device unplug + * @queue: Queue that owns the cm_id that caught the event + * + * DEVICE_REMOVAL event notifies us that the RDMA device is about + * to unplug so we should take care of destroying our RDMA resources. + * This event will be generated for each allocated cm_id. + * + * In our case, the RDMA resources are managed per controller and not + * only per queue. So the way we handle this is we trigger an implicit + * controller deletion upon the first DEVICE_REMOVAL event we see, and + * hold the event inflight until the controller deletion is completed. + * + * One exception that we need to handle is the destruction of the cm_id + * that caught the event. Since we hold the callout until the controller + * deletion is completed, we'll deadlock if the controller deletion will + * call rdma_destroy_id on this queue's cm_id. Thus, we claim ownership + * of destroying this queue before-hand, destroy the queue resources + * after the controller deletion completed with the exception of destroying + * the cm_id implicitely by returning a non-zero rc to the callout. + */ +static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue) +{ + struct nvme_rdma_ctrl *ctrl = queue->ctrl; + int ret, ctrl_deleted = 0; + + /* First disable the queue so ctrl delete won't free it */ + if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) + goto out; + + /* delete the controller */ + ret = __nvme_rdma_del_ctrl(ctrl); + if (!ret) { + dev_warn(ctrl->ctrl.device, + "Got rdma device removal event, deleting ctrl\n"); + flush_work(&ctrl->delete_work); + + /* Return non-zero so the cm_id will destroy implicitly */ + ctrl_deleted = 1; + + /* Free this queue ourselves */ + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->qp); + nvme_rdma_destroy_queue_ib(queue); + } + +out: + return ctrl_deleted; +} + +static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *ev) +{ + struct nvme_rdma_queue *queue = cm_id->context; + int cm_error = 0; + + dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n", + rdma_event_msg(ev->event), ev->event, + ev->status, cm_id); + + switch (ev->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + cm_error = nvme_rdma_addr_resolved(queue); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + cm_error = nvme_rdma_route_resolved(queue); + break; + case RDMA_CM_EVENT_ESTABLISHED: + queue->cm_error = nvme_rdma_conn_established(queue); + /* complete cm_done regardless of success/failure */ + complete(&queue->cm_done); + return 0; + case RDMA_CM_EVENT_REJECTED: + cm_error = nvme_rdma_conn_rejected(queue, ev); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + dev_dbg(queue->ctrl->ctrl.device, + "CM error event %d\n", ev->event); + cm_error = -ECONNRESET; + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + dev_dbg(queue->ctrl->ctrl.device, + "disconnect received - connection closed\n"); + nvme_rdma_error_recovery(queue->ctrl); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* return 1 means impliciy CM ID destroy */ + return nvme_rdma_device_unplug(queue); + default: + dev_err(queue->ctrl->ctrl.device, + "Unexpected RDMA CM event (%d)\n", ev->event); + nvme_rdma_error_recovery(queue->ctrl); + break; + } + + if (cm_error) { + queue->cm_error = cm_error; + complete(&queue->cm_done); + } + + return 0; +} + +static enum blk_eh_timer_return +nvme_rdma_timeout(struct request *rq, bool reserved) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + + /* queue error recovery */ + nvme_rdma_error_recovery(req->queue->ctrl); + + /* fail with DNR on cmd timeout */ + rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR; + + return BLK_EH_HANDLED; +} + +static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_rdma_queue *queue = hctx->driver_data; + struct request *rq = bd->rq; + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_qe *sqe = &req->sqe; + struct nvme_command *c = sqe->data; + bool flush = false; + struct ib_device *dev; + unsigned int map_len; + int ret; + + WARN_ON_ONCE(rq->tag < 0); + + dev = queue->device->dev; + ib_dma_sync_single_for_cpu(dev, sqe->dma, + sizeof(struct nvme_command), DMA_TO_DEVICE); + + ret = nvme_setup_cmd(ns, rq, c); + if (ret) + return ret; + + c->common.command_id = rq->tag; + blk_mq_start_request(rq); + + map_len = nvme_map_len(rq); + ret = nvme_rdma_map_data(queue, rq, map_len, c); + if (ret < 0) { + dev_err(queue->ctrl->ctrl.device, + "Failed to map data (%d)\n", ret); + nvme_cleanup_cmd(rq); + goto err; + } + + ib_dma_sync_single_for_device(dev, sqe->dma, + sizeof(struct nvme_command), DMA_TO_DEVICE); + + if (rq->cmd_type == REQ_TYPE_FS && req_op(rq) == REQ_OP_FLUSH) + flush = true; + ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, + req->need_inval ? &req->reg_wr.wr : NULL, flush); + if (ret) { + nvme_rdma_unmap_data(queue, rq); + goto err; + } + + return BLK_MQ_RQ_QUEUE_OK; +err: + return (ret == -ENOMEM || ret == -EAGAIN) ? + BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR; +} + +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) +{ + struct nvme_rdma_queue *queue = hctx->driver_data; + struct ib_cq *cq = queue->ib_cq; + struct ib_wc wc; + int found = 0; + + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + while (ib_poll_cq(cq, 1, &wc) > 0) { + struct ib_cqe *cqe = wc.wr_cqe; + + if (cqe) { + if (cqe->done == nvme_rdma_recv_done) + found |= __nvme_rdma_recv_done(cq, &wc, tag); + else + cqe->done(cq, &wc); + } + } + + return found; +} + +static void nvme_rdma_complete_rq(struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + int error = 0; + + nvme_rdma_unmap_data(queue, rq); + + if (unlikely(rq->errors)) { + if (nvme_req_needs_retry(rq, rq->errors)) { + nvme_requeue_req(rq); + return; + } + + if (rq->cmd_type == REQ_TYPE_DRV_PRIV) + error = rq->errors; + else + error = nvme_error_status(rq->errors); + } + + blk_mq_end_request(rq, error); +} + +static struct blk_mq_ops nvme_rdma_mq_ops = { + .queue_rq = nvme_rdma_queue_rq, + .complete = nvme_rdma_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_rdma_init_request, + .exit_request = nvme_rdma_exit_request, + .reinit_request = nvme_rdma_reinit_request, + .init_hctx = nvme_rdma_init_hctx, + .poll = nvme_rdma_poll, + .timeout = nvme_rdma_timeout, +}; + +static struct blk_mq_ops nvme_rdma_admin_mq_ops = { + .queue_rq = nvme_rdma_queue_rq, + .complete = nvme_rdma_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_rdma_init_admin_request, + .exit_request = nvme_rdma_exit_admin_request, + .reinit_request = nvme_rdma_reinit_request, + .init_hctx = nvme_rdma_init_admin_hctx, + .timeout = nvme_rdma_timeout, +}; + +static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) +{ + int error; + + error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + if (error) + return error; + + ctrl->device = ctrl->queues[0].device; + + /* + * We need a reference on the device as long as the tag_set is alive, + * as the MRs in the request structures need a valid ib_device. + */ + error = -EINVAL; + if (!nvme_rdma_dev_get(ctrl->device)) + goto out_free_queue; + + ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS, + ctrl->device->dev->attrs.max_fast_reg_page_list_len); + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ + ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + + error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (error) + goto out_put_dev; + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + + error = nvmf_connect_admin_queue(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + if (error) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_cleanup_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize); + + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9); + + error = nvme_init_identify(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev, + &ctrl->async_event_sqe, sizeof(struct nvme_command), + DMA_TO_DEVICE); + if (error) + goto out_cleanup_queue; + + nvme_start_keep_alive(&ctrl->ctrl); + + return 0; + +out_cleanup_queue: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_tagset: + /* disconnect and drain the queue before freeing the tagset */ + nvme_rdma_stop_queue(&ctrl->queues[0]); + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_put_dev: + nvme_rdma_dev_put(ctrl->device); +out_free_queue: + nvme_rdma_free_queue(&ctrl->queues[0]); + return error; +} + +static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) +{ + nvme_stop_keep_alive(&ctrl->ctrl); + cancel_work_sync(&ctrl->err_work); + cancel_delayed_work_sync(&ctrl->reconnect_work); + + if (ctrl->queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_free_io_queues(ctrl); + } + + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + nvme_shutdown_ctrl(&ctrl->ctrl); + + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl); +} + +static void nvme_rdma_del_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, delete_work); + + nvme_remove_namespaces(&ctrl->ctrl); + nvme_rdma_shutdown_ctrl(ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + + if (!queue_work(nvme_rdma_wq, &ctrl->delete_work)) + return -EBUSY; + + return 0; +} + +static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + int ret; + + ret = __nvme_rdma_del_ctrl(ctrl); + if (ret) + return ret; + + flush_work(&ctrl->delete_work); + + return 0; +} + +static void nvme_rdma_remove_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, delete_work); + + nvme_remove_namespaces(&ctrl->ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static void nvme_rdma_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(work, + struct nvme_rdma_ctrl, reset_work); + int ret; + bool changed; + + nvme_rdma_shutdown_ctrl(ctrl); + + ret = nvme_rdma_configure_admin_queue(ctrl); + if (ret) { + /* ctrl is already shutdown, just remove the ctrl */ + INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work); + goto del_dead_ctrl; + } + + if (ctrl->queue_count > 1) { + ret = blk_mq_reinit_tagset(&ctrl->tag_set); + if (ret) + goto del_dead_ctrl; + + ret = nvme_rdma_init_io_queues(ctrl); + if (ret) + goto del_dead_ctrl; + + ret = nvme_rdma_connect_io_queues(ctrl); + if (ret) + goto del_dead_ctrl; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + if (ctrl->queue_count > 1) { + nvme_start_queues(&ctrl->ctrl); + nvme_queue_scan(&ctrl->ctrl); + } + + return; + +del_dead_ctrl: + /* Deleting this dead controller... */ + dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); + WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work)); +} + +static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + + if (!queue_work(nvme_rdma_wq, &ctrl->reset_work)) + return -EBUSY; + + flush_work(&ctrl->reset_work); + + return 0; +} + +static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { + .name = "rdma", + .module = THIS_MODULE, + .is_fabrics = true, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .reset_ctrl = nvme_rdma_reset_ctrl, + .free_ctrl = nvme_rdma_free_ctrl, + .submit_async_event = nvme_rdma_submit_async_event, + .delete_ctrl = nvme_rdma_del_ctrl, + .get_subsysnqn = nvmf_get_subsysnqn, + .get_address = nvmf_get_address, +}; + +static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + int ret; + + ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); + if (ret) + return ret; + + ctrl->queue_count = opts->nr_io_queues + 1; + if (ctrl->queue_count < 2) + return 0; + + dev_info(ctrl->ctrl.device, + "creating %d I/O queues.\n", opts->nr_io_queues); + + ret = nvme_rdma_init_io_queues(ctrl); + if (ret) + return ret; + + /* + * We need a reference on the device as long as the tag_set is alive, + * as the MRs in the request structures need a valid ib_device. + */ + ret = -EINVAL; + if (!nvme_rdma_dev_get(ctrl->device)) + goto out_free_io_queues; + + memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); + ctrl->tag_set.ops = &nvme_rdma_mq_ops; + ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize; + ctrl->tag_set.reserved_tags = 1; /* fabric connect */ + ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->tag_set.driver_data = ctrl; + ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.timeout = NVME_IO_TIMEOUT; + + ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + if (ret) + goto out_put_dev; + ctrl->ctrl.tagset = &ctrl->tag_set; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tag_set; + } + + ret = nvme_rdma_connect_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; + + return 0; + +out_cleanup_connect_q: + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tag_set: + blk_mq_free_tag_set(&ctrl->tag_set); +out_put_dev: + nvme_rdma_dev_put(ctrl->device); +out_free_io_queues: + nvme_rdma_free_io_queues(ctrl); + return ret; +} + +static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p) +{ + u8 *addr = (u8 *)&in_addr->sin_addr.s_addr; + size_t buflen = strlen(p); + + /* XXX: handle IPv6 addresses */ + + if (buflen > INET_ADDRSTRLEN) + return -EINVAL; + if (in4_pton(p, buflen, addr, '\0', NULL) == 0) + return -EINVAL; + in_addr->sin_family = AF_INET; + return 0; +} + +static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + int ret; + bool changed; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->list); + + ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr); + if (ret) { + pr_err("malformed IP address passed: %s\n", opts->traddr); + goto out_free_ctrl; + } + + if (opts->mask & NVMF_OPT_TRSVCID) { + u16 port; + + ret = kstrtou16(opts->trsvcid, 0, &port); + if (ret) + goto out_free_ctrl; + + ctrl->addr_in.sin_port = cpu_to_be16(port); + } else { + ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT); + } + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, + 0 /* no quirks, we're perfect! */); + if (ret) + goto out_free_ctrl; + + ctrl->reconnect_delay = opts->reconnect_delay; + INIT_DELAYED_WORK(&ctrl->reconnect_work, + nvme_rdma_reconnect_ctrl_work); + INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); + INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); + INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work); + spin_lock_init(&ctrl->lock); + + ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.sqsize = opts->queue_size; + ctrl->tl_retry_count = opts->tl_retry_count; + ctrl->ctrl.kato = opts->kato; + + ret = -ENOMEM; + ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) + goto out_uninit_ctrl; + + ret = nvme_rdma_configure_admin_queue(ctrl); + if (ret) + goto out_kfree_queues; + + /* sanity check icdoff */ + if (ctrl->ctrl.icdoff) { + dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); + goto out_remove_admin_queue; + } + + /* sanity check keyed sgls */ + if (!(ctrl->ctrl.sgls & (1 << 20))) { + dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not support\n"); + goto out_remove_admin_queue; + } + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, clamping down\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->nr_io_queues) { + ret = nvme_rdma_create_io_queues(ctrl); + if (ret) + goto out_remove_admin_queue; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", + ctrl->ctrl.opts->subsysnqn, &ctrl->addr); + + kref_get(&ctrl->ctrl.kref); + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + if (opts->nr_io_queues) { + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + } + + return &ctrl->ctrl; + +out_remove_admin_queue: + nvme_stop_keep_alive(&ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl); +out_kfree_queues: + kfree(ctrl->queues); +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvmf_transport_ops nvme_rdma_transport = { + .name = "rdma", + .required_opts = NVMF_OPT_TRADDR, + .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_TL_RETRY_COUNT | + NVMF_OPT_RECONNECT_DELAY, + .create_ctrl = nvme_rdma_create_ctrl, +}; + +static int __init nvme_rdma_init_module(void) +{ + nvme_rdma_wq = create_workqueue("nvme_rdma_wq"); + if (!nvme_rdma_wq) + return -ENOMEM; + + nvmf_register_transport(&nvme_rdma_transport); + return 0; +} + +static void __exit nvme_rdma_cleanup_module(void) +{ + struct nvme_rdma_ctrl *ctrl; + + nvmf_unregister_transport(&nvme_rdma_transport); + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) + __nvme_rdma_del_ctrl(ctrl); + mutex_unlock(&nvme_rdma_ctrl_mutex); + + destroy_workqueue(nvme_rdma_wq); +} + +module_init(nvme_rdma_init_module); +module_exit(nvme_rdma_cleanup_module); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig new file mode 100644 index 000000000000..0be9e3d4c352 --- /dev/null +++ b/drivers/nvme/target/Kconfig @@ -0,0 +1,37 @@ + +config NVME_TARGET + tristate "NVMe Target support" + depends on BLOCK + depends on CONFIGFS_FS + help + This enabled target side support for the NVMe protocol, that is + it allows the Linux kernel to implement NVMe subsystems and + controllers and export Linux block devices as NVMe namespaces. + You need to select at least one of the transports below to make this + functionality useful. + + To configure the NVMe target you probably want to use the nvmetcli + tool from http://git.infradead.org/users/hch/nvmetcli.git. + +config NVME_TARGET_LOOP + tristate "NVMe loopback device support" + depends on BLK_DEV_NVME + depends on CONFIGFS_FS + select NVME_TARGET + select NVME_FABRICS + select SG_POOL + help + This enables the NVMe loopback device support, which can be useful + to test NVMe host and target side features. + + If unsure, say N. + +config NVME_TARGET_RDMA + tristate "NVMe over Fabrics RDMA target support" + depends on INFINIBAND + select NVME_TARGET + help + This enables the NVMe RDMA target support, which allows exporting NVMe + devices over RDMA. + + If unsure, say N. diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile new file mode 100644 index 000000000000..b7a06232c9da --- /dev/null +++ b/drivers/nvme/target/Makefile @@ -0,0 +1,9 @@ + +obj-$(CONFIG_NVME_TARGET) += nvmet.o +obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o +obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o + +nvmet-y += core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \ + discovery.o +nvme-loop-y += loop.o +nvmet-rdma-y += rdma.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c new file mode 100644 index 000000000000..2fac17a5ad53 --- /dev/null +++ b/drivers/nvme/target/admin-cmd.c @@ -0,0 +1,465 @@ +/* + * NVMe admin command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/random.h> +#include <generated/utsrelease.h> +#include "nvmet.h" + +u32 nvmet_get_log_page_len(struct nvme_command *cmd) +{ + u32 len = le16_to_cpu(cmd->get_log_page.numdu); + + len <<= 16; + len += le16_to_cpu(cmd->get_log_page.numdl); + /* NUMD is a 0's based value */ + len += 1; + len *= sizeof(u32); + + return len; +} + +static void nvmet_execute_get_log_page(struct nvmet_req *req) +{ + size_t data_len = nvmet_get_log_page_len(req->cmd); + void *buf; + u16 status = 0; + + buf = kzalloc(data_len, GFP_KERNEL); + if (!buf) { + status = NVME_SC_INTERNAL; + goto out; + } + + switch (req->cmd->get_log_page.lid) { + case 0x01: + /* + * We currently never set the More bit in the status field, + * so all error log entries are invalid and can be zeroed out. + * This is called a minum viable implementation (TM) of this + * mandatory log page. + */ + break; + case 0x02: + /* + * XXX: fill out actual smart log + * + * We might have a hard time coming up with useful values for + * many of the fields, and even when we have useful data + * available (e.g. units or commands read/written) those aren't + * persistent over power loss. + */ + break; + case 0x03: + /* + * We only support a single firmware slot which always is + * active, so we can zero out the whole firmware slot log and + * still claim to fully implement this mandatory log page. + */ + break; + default: + BUG(); + } + + status = nvmet_copy_to_sgl(req, 0, buf, data_len); + + kfree(buf); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl *id; + u64 serial; + u16 status = 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + /* XXX: figure out how to assign real vendors IDs. */ + id->vid = 0; + id->ssvid = 0; + + /* generate a random serial number as our controllers are ephemeral: */ + get_random_bytes(&serial, sizeof(serial)); + memset(id->sn, ' ', sizeof(id->sn)); + snprintf(id->sn, sizeof(id->sn), "%llx", serial); + + memset(id->mn, ' ', sizeof(id->mn)); + strncpy((char *)id->mn, "Linux", sizeof(id->mn)); + + memset(id->fr, ' ', sizeof(id->fr)); + strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr)); + + id->rab = 6; + + /* + * XXX: figure out how we can assign a IEEE OUI, but until then + * the safest is to leave it as zeroes. + */ + + /* we support multiple ports and multiples hosts: */ + id->mic = (1 << 0) | (1 << 1); + + /* no limit on data transfer sizes for now */ + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + + /* XXX: figure out what to do about RTD3R/RTD3 */ + id->oaes = cpu_to_le32(1 << 8); + id->ctratt = cpu_to_le32(1 << 0); + + id->oacs = 0; + + /* + * We don't really have a practical limit on the number of abort + * comands. But we don't do anything useful for abort either, so + * no point in allowing more abort commands than the spec requires. + */ + id->acl = 3; + + id->aerl = NVMET_ASYNC_EVENTS - 1; + + /* first slot is read-only, only one slot supported */ + id->frmw = (1 << 0) | (1 << 1); + id->lpa = (1 << 0) | (1 << 2); + id->elpe = NVMET_ERROR_LOG_SLOTS - 1; + id->npss = 0; + + /* We support keep-alive timeout in granularity of seconds */ + id->kas = cpu_to_le16(NVMET_KAS); + + id->sqes = (0x6 << 4) | 0x6; + id->cqes = (0x4 << 4) | 0x4; + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->nn = cpu_to_le32(ctrl->subsys->max_nsid); + id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM); + + /* XXX: don't report vwc if the underlying device is write through */ + id->vwc = NVME_CTRL_VWC_PRESENT; + + /* + * We can't support atomic writes bigger than a LBA without support + * from the backend device. + */ + id->awun = 0; + id->awupf = 0; + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->has_keyed_sgls) + id->sgls |= cpu_to_le32(1 << 2); + if (ctrl->ops->sqe_inline_size) + id->sgls |= cpu_to_le32(1 << 20); + + strcpy(id->subnqn, ctrl->subsys->subsysnqn); + + /* Max command capsule size is sqe + single page of in-capsule data */ + id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + + ctrl->ops->sqe_inline_size) / 16); + /* Max response capsule size is cqe */ + id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); + + id->msdbd = ctrl->ops->msdbd; + + /* + * Meh, we don't really support any power state. Fake up the same + * values that qemu does. + */ + id->psd[0].max_power = cpu_to_le16(0x9c4); + id->psd[0].entry_lat = cpu_to_le32(0x10); + id->psd[0].exit_lat = cpu_to_le32(0x4); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_ns(struct nvmet_req *req) +{ + struct nvmet_ns *ns; + struct nvme_id_ns *id; + u16 status = 0; + + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out_put_ns; + } + + /* + * nuse = ncap = nsze isn't aways true, but we have no way to find + * that out from the underlying device. + */ + id->ncap = id->nuse = id->nsze = + cpu_to_le64(ns->size >> ns->blksize_shift); + + /* + * We just provide a single LBA format that matches what the + * underlying device reports. + */ + id->nlbaf = 0; + id->flbas = 0; + + /* + * Our namespace might always be shared. Not just with other + * controllers, but also with any other user of the block device. + */ + id->nmic = (1 << 0); + + memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le)); + + id->lbaf[0].ds = ns->blksize_shift; + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out_put_ns: + nvmet_put_namespace(ns); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_nslist(struct nvmet_req *req) +{ + static const int buf_size = 4096; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_ns *ns; + u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); + __le32 *list; + u16 status = 0; + int i = 0; + + list = kzalloc(buf_size, GFP_KERNEL); + if (!list) { + status = NVME_SC_INTERNAL; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + if (ns->nsid <= min_nsid) + continue; + list[i++] = cpu_to_le32(ns->nsid); + if (i == buf_size / sizeof(__le32)) + break; + } + rcu_read_unlock(); + + status = nvmet_copy_to_sgl(req, 0, list, buf_size); + + kfree(list); +out: + nvmet_req_complete(req, status); +} + +/* + * A "mimimum viable" abort implementation: the command is mandatory in the + * spec, but we are not required to do any useful work. We couldn't really + * do a useful abort, so don't bother even with waiting for the command + * to be exectuted and return immediately telling the command to abort + * wasn't found. + */ +static void nvmet_execute_abort(struct nvmet_req *req) +{ + nvmet_set_result(req, 1); + nvmet_req_complete(req, 0); +} + +static void nvmet_execute_set_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u64 val; + u32 val32; + u16 status = 0; + + switch (cdw10 & 0xf) { + case NVME_FEAT_NUM_QUEUES: + nvmet_set_result(req, + (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); + break; + case NVME_FEAT_KATO: + val = le64_to_cpu(req->cmd->prop_set.value); + val32 = val & 0xffff; + req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); + nvmet_set_result(req, req->sq->ctrl->kato); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_get_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u16 status = 0; + + switch (cdw10 & 0xf) { + /* + * These features are mandatory in the spec, but we don't + * have a useful way to implement them. We'll eventually + * need to come up with some fake values for these. + */ +#if 0 + case NVME_FEAT_ARBITRATION: + break; + case NVME_FEAT_POWER_MGMT: + break; + case NVME_FEAT_TEMP_THRESH: + break; + case NVME_FEAT_ERR_RECOVERY: + break; + case NVME_FEAT_IRQ_COALESCE: + break; + case NVME_FEAT_IRQ_CONFIG: + break; + case NVME_FEAT_WRITE_ATOMIC: + break; + case NVME_FEAT_ASYNC_EVENT: + break; +#endif + case NVME_FEAT_VOLATILE_WC: + nvmet_set_result(req, 1); + break; + case NVME_FEAT_NUM_QUEUES: + nvmet_set_result(req, + (subsys->max_qid-1) | ((subsys->max_qid-1) << 16)); + break; + case NVME_FEAT_KATO: + nvmet_set_result(req, req->sq->ctrl->kato * 1000); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_async_event(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + mutex_lock(&ctrl->lock); + if (ctrl->nr_async_event_cmds >= NVMET_ASYNC_EVENTS) { + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_ASYNC_LIMIT | NVME_SC_DNR); + return; + } + ctrl->async_event_cmds[ctrl->nr_async_event_cmds++] = req; + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +static void nvmet_execute_keep_alive(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + pr_debug("ctrl %d update keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + + mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvmet_req_complete(req, 0); +} + +int nvmet_parse_admin_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { + pr_err("nvmet: got admin cmd %d while CC.EN == 0\n", + cmd->common.opcode); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got admin cmd %d while CSTS.RDY == 0\n", + cmd->common.opcode); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + req->data_len = nvmet_get_log_page_len(cmd); + + switch (cmd->get_log_page.lid) { + case 0x01: + case 0x02: + case 0x03: + req->execute = nvmet_execute_get_log_page; + return 0; + } + break; + case nvme_admin_identify: + req->data_len = 4096; + switch (le32_to_cpu(cmd->identify.cns)) { + case 0x00: + req->execute = nvmet_execute_identify_ns; + return 0; + case 0x01: + req->execute = nvmet_execute_identify_ctrl; + return 0; + case 0x02: + req->execute = nvmet_execute_identify_nslist; + return 0; + } + break; + case nvme_admin_abort_cmd: + req->execute = nvmet_execute_abort; + req->data_len = 0; + return 0; + case nvme_admin_set_features: + req->execute = nvmet_execute_set_features; + req->data_len = 0; + return 0; + case nvme_admin_get_features: + req->execute = nvmet_execute_get_features; + req->data_len = 0; + return 0; + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + req->data_len = 0; + return 0; + case nvme_admin_keep_alive: + req->execute = nvmet_execute_keep_alive; + req->data_len = 0; + return 0; + } + + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; +} diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c new file mode 100644 index 000000000000..af5e2dc4a3d5 --- /dev/null +++ b/drivers/nvme/target/configfs.c @@ -0,0 +1,917 @@ +/* + * Configfs interface for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/ctype.h> + +#include "nvmet.h" + +static struct config_item_type nvmet_host_type; +static struct config_item_type nvmet_subsys_type; + +/* + * nvmet_port Generic ConfigFS definitions. + * Used in any place in the ConfigFS tree that refers to an address. + */ +static ssize_t nvmet_addr_adrfam_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + return sprintf(page, "ipv4\n"); + case NVMF_ADDR_FAMILY_IP6: + return sprintf(page, "ipv6\n"); + case NVMF_ADDR_FAMILY_IB: + return sprintf(page, "ib\n"); + default: + return sprintf(page, "\n"); + } +} + +static ssize_t nvmet_addr_adrfam_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "ipv4")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP4; + } else if (sysfs_streq(page, "ipv6")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP6; + } else if (sysfs_streq(page, "ib")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IB; + } else { + pr_err("Invalid value '%s' for adrfam\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_adrfam); + +static ssize_t nvmet_addr_portid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", + le16_to_cpu(port->disc_addr.portid)); +} + +static ssize_t nvmet_addr_portid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + u16 portid = 0; + + if (kstrtou16(page, 0, &portid)) { + pr_err("Invalid value '%s' for portid\n", page); + return -EINVAL; + } + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + port->disc_addr.portid = cpu_to_le16(portid); + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_portid); + +static ssize_t nvmet_addr_traddr_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.traddr); +} + +static ssize_t nvmet_addr_traddr_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRADDR_SIZE) { + pr_err("Invalid value '%s' for traddr\n", page); + return -EINVAL; + } + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + return snprintf(port->disc_addr.traddr, + sizeof(port->disc_addr.traddr), "%s", page); +} + +CONFIGFS_ATTR(nvmet_, addr_traddr); + +static ssize_t nvmet_addr_treq_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.treq) { + case NVMF_TREQ_NOT_SPECIFIED: + return sprintf(page, "not specified\n"); + case NVMF_TREQ_REQUIRED: + return sprintf(page, "required\n"); + case NVMF_TREQ_NOT_REQUIRED: + return sprintf(page, "not required\n"); + default: + return sprintf(page, "\n"); + } +} + +static ssize_t nvmet_addr_treq_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "not specified")) { + port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED; + } else if (sysfs_streq(page, "required")) { + port->disc_addr.treq = NVMF_TREQ_REQUIRED; + } else if (sysfs_streq(page, "not required")) { + port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED; + } else { + pr_err("Invalid value '%s' for treq\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_treq); + +static ssize_t nvmet_addr_trsvcid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.trsvcid); +} + +static ssize_t nvmet_addr_trsvcid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRSVCID_SIZE) { + pr_err("Invalid value '%s' for trsvcid\n", page); + return -EINVAL; + } + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + return snprintf(port->disc_addr.trsvcid, + sizeof(port->disc_addr.trsvcid), "%s", page); +} + +CONFIGFS_ATTR(nvmet_, addr_trsvcid); + +static ssize_t nvmet_addr_trtype_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.trtype) { + case NVMF_TRTYPE_RDMA: + return sprintf(page, "rdma\n"); + case NVMF_TRTYPE_LOOP: + return sprintf(page, "loop\n"); + default: + return sprintf(page, "\n"); + } +} + +static void nvmet_port_init_tsas_rdma(struct nvmet_port *port) +{ + port->disc_addr.trtype = NVMF_TRTYPE_RDMA; + memset(&port->disc_addr.tsas.rdma, 0, NVMF_TSAS_SIZE); + port->disc_addr.tsas.rdma.qptype = NVMF_RDMA_QPTYPE_CONNECTED; + port->disc_addr.tsas.rdma.prtype = NVMF_RDMA_PRTYPE_NOT_SPECIFIED; + port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM; +} + +static void nvmet_port_init_tsas_loop(struct nvmet_port *port) +{ + port->disc_addr.trtype = NVMF_TRTYPE_LOOP; + memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE); +} + +static ssize_t nvmet_addr_trtype_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "rdma")) { + nvmet_port_init_tsas_rdma(port); + } else if (sysfs_streq(page, "loop")) { + nvmet_port_init_tsas_loop(port); + } else { + pr_err("Invalid value '%s' for trtype\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_trtype); + +/* + * Namespace structures & file operation functions below + */ +static ssize_t nvmet_ns_device_path_show(struct config_item *item, char *page) +{ + return sprintf(page, "%s\n", to_nvmet_ns(item)->device_path); +} + +static ssize_t nvmet_ns_device_path_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + int ret; + + mutex_lock(&subsys->lock); + ret = -EBUSY; + if (nvmet_ns_enabled(ns)) + goto out_unlock; + + kfree(ns->device_path); + + ret = -ENOMEM; + ns->device_path = kstrdup(page, GFP_KERNEL); + if (!ns->device_path) + goto out_unlock; + + mutex_unlock(&subsys->lock); + return count; + +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} + +CONFIGFS_ATTR(nvmet_ns_, device_path); + +static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid); +} + +static ssize_t nvmet_ns_device_nguid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + u8 nguid[16]; + const char *p = page; + int i; + int ret = 0; + + mutex_lock(&subsys->lock); + if (nvmet_ns_enabled(ns)) { + ret = -EBUSY; + goto out_unlock; + } + + for (i = 0; i < 16; i++) { + if (p + 2 > page + count) { + ret = -EINVAL; + goto out_unlock; + } + if (!isxdigit(p[0]) || !isxdigit(p[1])) { + ret = -EINVAL; + goto out_unlock; + } + + nguid[i] = (hex_to_bin(p[0]) << 4) | hex_to_bin(p[1]); + p += 2; + + if (*p == '-' || *p == ':') + p++; + } + + memcpy(&ns->nguid, nguid, sizeof(nguid)); +out_unlock: + mutex_unlock(&subsys->lock); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, device_nguid); + +static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", nvmet_ns_enabled(to_nvmet_ns(item))); +} + +static ssize_t nvmet_ns_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + bool enable; + int ret = 0; + + if (strtobool(page, &enable)) + return -EINVAL; + + if (enable) + ret = nvmet_ns_enable(ns); + else + nvmet_ns_disable(ns); + + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, enable); + +static struct configfs_attribute *nvmet_ns_attrs[] = { + &nvmet_ns_attr_device_path, + &nvmet_ns_attr_device_nguid, + &nvmet_ns_attr_enable, + NULL, +}; + +static void nvmet_ns_release(struct config_item *item) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + nvmet_ns_free(ns); +} + +static struct configfs_item_operations nvmet_ns_item_ops = { + .release = nvmet_ns_release, +}; + +static struct config_item_type nvmet_ns_type = { + .ct_item_ops = &nvmet_ns_item_ops, + .ct_attrs = nvmet_ns_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ns_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys = namespaces_to_subsys(&group->cg_item); + struct nvmet_ns *ns; + int ret; + u32 nsid; + + ret = kstrtou32(name, 0, &nsid); + if (ret) + goto out; + + ret = -EINVAL; + if (nsid == 0 || nsid == 0xffffffff) + goto out; + + ret = -ENOMEM; + ns = nvmet_ns_alloc(subsys, nsid); + if (!ns) + goto out; + config_group_init_type_name(&ns->group, name, &nvmet_ns_type); + + pr_info("adding nsid %d to subsystem %s\n", nsid, subsys->subsysnqn); + + return &ns->group; +out: + return ERR_PTR(ret); +} + +static struct configfs_group_operations nvmet_namespaces_group_ops = { + .make_group = nvmet_ns_make, +}; + +static struct config_item_type nvmet_namespaces_type = { + .ct_group_ops = &nvmet_namespaces_group_ops, + .ct_owner = THIS_MODULE, +}; + +static int nvmet_port_subsys_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys; + struct nvmet_subsys_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_subsys_type) { + pr_err("can only link subsystems into the subsystems dir.!\n"); + return -EINVAL; + } + subsys = to_subsys(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->subsys = subsys; + + down_write(&nvmet_config_sem); + ret = -EEXIST; + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto out_free_link; + } + + if (list_empty(&port->subsystems)) { + ret = nvmet_enable_port(port); + if (ret) + goto out_free_link; + } + + list_add_tail(&link->entry, &port->subsystems); + nvmet_genctr++; + up_write(&nvmet_config_sem); + return 0; + +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static int nvmet_port_subsys_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys = to_subsys(target); + struct nvmet_subsys_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto found; + } + up_write(&nvmet_config_sem); + return -EINVAL; + +found: + list_del(&p->entry); + nvmet_genctr++; + if (list_empty(&port->subsystems)) + nvmet_disable_port(port); + up_write(&nvmet_config_sem); + kfree(p); + return 0; +} + +static struct configfs_item_operations nvmet_port_subsys_item_ops = { + .allow_link = nvmet_port_subsys_allow_link, + .drop_link = nvmet_port_subsys_drop_link, +}; + +static struct config_item_type nvmet_port_subsys_type = { + .ct_item_ops = &nvmet_port_subsys_item_ops, + .ct_owner = THIS_MODULE, +}; + +static int nvmet_allowed_hosts_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host; + struct nvmet_host_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_host_type) { + pr_err("can only link hosts into the allowed_hosts directory!\n"); + return -EINVAL; + } + + host = to_host(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->host = host; + + down_write(&nvmet_config_sem); + ret = -EINVAL; + if (subsys->allow_any_host) { + pr_err("can't add hosts when allow_any_host is set!\n"); + goto out_free_link; + } + + ret = -EEXIST; + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto out_free_link; + } + list_add_tail(&link->entry, &subsys->hosts); + nvmet_genctr++; + up_write(&nvmet_config_sem); + return 0; +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static int nvmet_allowed_hosts_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host = to_host(target); + struct nvmet_host_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto found; + } + up_write(&nvmet_config_sem); + return -EINVAL; + +found: + list_del(&p->entry); + nvmet_genctr++; + up_write(&nvmet_config_sem); + kfree(p); + return 0; +} + +static struct configfs_item_operations nvmet_allowed_hosts_item_ops = { + .allow_link = nvmet_allowed_hosts_allow_link, + .drop_link = nvmet_allowed_hosts_drop_link, +}; + +static struct config_item_type nvmet_allowed_hosts_type = { + .ct_item_ops = &nvmet_allowed_hosts_item_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_subsys_attr_allow_any_host_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", + to_subsys(item)->allow_any_host); +} + +static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + bool allow_any_host; + int ret = 0; + + if (strtobool(page, &allow_any_host)) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (allow_any_host && !list_empty(&subsys->hosts)) { + pr_err("Can't set allow_any_host when explicit hosts are set!\n"); + ret = -EINVAL; + goto out_unlock; + } + + subsys->allow_any_host = allow_any_host; +out_unlock: + up_write(&nvmet_config_sem); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_subsys_, attr_allow_any_host); + +static struct configfs_attribute *nvmet_subsys_attrs[] = { + &nvmet_subsys_attr_attr_allow_any_host, + NULL, +}; + +/* + * Subsystem structures & folder operation functions below + */ +static void nvmet_subsys_release(struct config_item *item) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + nvmet_subsys_put(subsys); +} + +static struct configfs_item_operations nvmet_subsys_item_ops = { + .release = nvmet_subsys_release, +}; + +static struct config_item_type nvmet_subsys_type = { + .ct_item_ops = &nvmet_subsys_item_ops, + .ct_attrs = nvmet_subsys_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_subsys_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys; + + if (sysfs_streq(name, NVME_DISC_SUBSYS_NAME)) { + pr_err("can't create discovery subsystem through configfs\n"); + return ERR_PTR(-EINVAL); + } + + subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME); + if (!subsys) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&subsys->group, name, &nvmet_subsys_type); + + config_group_init_type_name(&subsys->namespaces_group, + "namespaces", &nvmet_namespaces_type); + configfs_add_default_group(&subsys->namespaces_group, &subsys->group); + + config_group_init_type_name(&subsys->allowed_hosts_group, + "allowed_hosts", &nvmet_allowed_hosts_type); + configfs_add_default_group(&subsys->allowed_hosts_group, + &subsys->group); + + return &subsys->group; +} + +static struct configfs_group_operations nvmet_subsystems_group_ops = { + .make_group = nvmet_subsys_make, +}; + +static struct config_item_type nvmet_subsystems_type = { + .ct_group_ops = &nvmet_subsystems_group_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_referral_enable_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", to_nvmet_port(item)->enabled); +} + +static ssize_t nvmet_referral_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); + struct nvmet_port *port = to_nvmet_port(item); + bool enable; + + if (strtobool(page, &enable)) + goto inval; + + if (enable) + nvmet_referral_enable(parent, port); + else + nvmet_referral_disable(port); + + return count; +inval: + pr_err("Invalid value '%s' for enable\n", page); + return -EINVAL; +} + +CONFIGFS_ATTR(nvmet_referral_, enable); + +/* + * Discovery Service subsystem definitions + */ +static struct configfs_attribute *nvmet_referral_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_portid, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + &nvmet_referral_attr_enable, + NULL, +}; + +static void nvmet_referral_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + nvmet_referral_disable(port); + kfree(port); +} + +static struct configfs_item_operations nvmet_referral_item_ops = { + .release = nvmet_referral_release, +}; + +static struct config_item_type nvmet_referral_type = { + .ct_owner = THIS_MODULE, + .ct_attrs = nvmet_referral_attrs, + .ct_item_ops = &nvmet_referral_item_ops, +}; + +static struct config_group *nvmet_referral_make( + struct config_group *group, const char *name) +{ + struct nvmet_port *port; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&port->entry); + config_group_init_type_name(&port->group, name, &nvmet_referral_type); + + return &port->group; +} + +static struct configfs_group_operations nvmet_referral_group_ops = { + .make_group = nvmet_referral_make, +}; + +static struct config_item_type nvmet_referrals_type = { + .ct_owner = THIS_MODULE, + .ct_group_ops = &nvmet_referral_group_ops, +}; + +/* + * Ports definitions. + */ +static void nvmet_port_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + kfree(port); +} + +static struct configfs_attribute *nvmet_port_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + NULL, +}; + +static struct configfs_item_operations nvmet_port_item_ops = { + .release = nvmet_port_release, +}; + +static struct config_item_type nvmet_port_type = { + .ct_attrs = nvmet_port_attrs, + .ct_item_ops = &nvmet_port_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ports_make(struct config_group *group, + const char *name) +{ + struct nvmet_port *port; + u16 portid; + + if (kstrtou16(name, 0, &portid)) + return ERR_PTR(-EINVAL); + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&port->entry); + INIT_LIST_HEAD(&port->subsystems); + INIT_LIST_HEAD(&port->referrals); + + port->disc_addr.portid = cpu_to_le16(portid); + config_group_init_type_name(&port->group, name, &nvmet_port_type); + + config_group_init_type_name(&port->subsys_group, + "subsystems", &nvmet_port_subsys_type); + configfs_add_default_group(&port->subsys_group, &port->group); + + config_group_init_type_name(&port->referrals_group, + "referrals", &nvmet_referrals_type); + configfs_add_default_group(&port->referrals_group, &port->group); + + return &port->group; +} + +static struct configfs_group_operations nvmet_ports_group_ops = { + .make_group = nvmet_ports_make, +}; + +static struct config_item_type nvmet_ports_type = { + .ct_group_ops = &nvmet_ports_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_subsystems_group; +static struct config_group nvmet_ports_group; + +static void nvmet_host_release(struct config_item *item) +{ + struct nvmet_host *host = to_host(item); + + kfree(host); +} + +static struct configfs_item_operations nvmet_host_item_ops = { + .release = nvmet_host_release, +}; + +static struct config_item_type nvmet_host_type = { + .ct_item_ops = &nvmet_host_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_hosts_make_group(struct config_group *group, + const char *name) +{ + struct nvmet_host *host; + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&host->group, name, &nvmet_host_type); + + return &host->group; +} + +static struct configfs_group_operations nvmet_hosts_group_ops = { + .make_group = nvmet_hosts_make_group, +}; + +static struct config_item_type nvmet_hosts_type = { + .ct_group_ops = &nvmet_hosts_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_hosts_group; + +static struct config_item_type nvmet_root_type = { + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem nvmet_configfs_subsystem = { + .su_group = { + .cg_item = { + .ci_namebuf = "nvmet", + .ci_type = &nvmet_root_type, + }, + }, +}; + +int __init nvmet_init_configfs(void) +{ + int ret; + + config_group_init(&nvmet_configfs_subsystem.su_group); + mutex_init(&nvmet_configfs_subsystem.su_mutex); + + config_group_init_type_name(&nvmet_subsystems_group, + "subsystems", &nvmet_subsystems_type); + configfs_add_default_group(&nvmet_subsystems_group, + &nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_ports_group, + "ports", &nvmet_ports_type); + configfs_add_default_group(&nvmet_ports_group, + &nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_hosts_group, + "hosts", &nvmet_hosts_type); + configfs_add_default_group(&nvmet_hosts_group, + &nvmet_configfs_subsystem.su_group); + + ret = configfs_register_subsystem(&nvmet_configfs_subsystem); + if (ret) { + pr_err("configfs_register_subsystem: %d\n", ret); + return ret; + } + + return 0; +} + +void __exit nvmet_exit_configfs(void) +{ + configfs_unregister_subsystem(&nvmet_configfs_subsystem); +} diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c new file mode 100644 index 000000000000..e0b3f0166722 --- /dev/null +++ b/drivers/nvme/target/core.c @@ -0,0 +1,964 @@ +/* + * Common code for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include "nvmet.h" + +static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; + +/* + * This read/write semaphore is used to synchronize access to configuration + * information on a target system that will result in discovery log page + * information change for at least one host. + * The full list of resources to protected by this semaphore is: + * + * - subsystems list + * - per-subsystem allowed hosts list + * - allow_any_host subsystem attribute + * - nvmet_genctr + * - the nvmet_transports array + * + * When updating any of those lists/structures write lock should be obtained, + * while when reading (popolating discovery log page or checking host-subsystem + * link) read lock is obtained to allow concurrent reads. + */ +DECLARE_RWSEM(nvmet_config_sem); + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len) +{ + if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return 0; +} + +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) +{ + if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return 0; +} + +static u32 nvmet_async_event_result(struct nvmet_async_event *aen) +{ + return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); +} + +static void nvmet_async_events_free(struct nvmet_ctrl *ctrl) +{ + struct nvmet_req *req; + + while (1) { + mutex_lock(&ctrl->lock); + if (!ctrl->nr_async_event_cmds) { + mutex_unlock(&ctrl->lock); + return; + } + + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR); + } +} + +static void nvmet_async_event_work(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, async_event_work); + struct nvmet_async_event *aen; + struct nvmet_req *req; + + while (1) { + mutex_lock(&ctrl->lock); + aen = list_first_entry_or_null(&ctrl->async_events, + struct nvmet_async_event, entry); + if (!aen || !ctrl->nr_async_event_cmds) { + mutex_unlock(&ctrl->lock); + return; + } + + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + nvmet_set_result(req, nvmet_async_event_result(aen)); + + list_del(&aen->entry); + kfree(aen); + + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, 0); + } +} + +static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, + u8 event_info, u8 log_page) +{ + struct nvmet_async_event *aen; + + aen = kmalloc(sizeof(*aen), GFP_KERNEL); + if (!aen) + return; + + aen->event_type = event_type; + aen->event_info = event_info; + aen->log_page = log_page; + + mutex_lock(&ctrl->lock); + list_add_tail(&aen->entry, &ctrl->async_events); + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +int nvmet_register_transport(struct nvmet_fabrics_ops *ops) +{ + int ret = 0; + + down_write(&nvmet_config_sem); + if (nvmet_transports[ops->type]) + ret = -EINVAL; + else + nvmet_transports[ops->type] = ops; + up_write(&nvmet_config_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(nvmet_register_transport); + +void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops) +{ + down_write(&nvmet_config_sem); + nvmet_transports[ops->type] = NULL; + up_write(&nvmet_config_sem); +} +EXPORT_SYMBOL_GPL(nvmet_unregister_transport); + +int nvmet_enable_port(struct nvmet_port *port) +{ + struct nvmet_fabrics_ops *ops; + int ret; + + lockdep_assert_held(&nvmet_config_sem); + + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + up_write(&nvmet_config_sem); + request_module("nvmet-transport-%d", port->disc_addr.trtype); + down_write(&nvmet_config_sem); + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + pr_err("transport type %d not supported\n", + port->disc_addr.trtype); + return -EINVAL; + } + } + + if (!try_module_get(ops->owner)) + return -EINVAL; + + ret = ops->add_port(port); + if (ret) { + module_put(ops->owner); + return ret; + } + + port->enabled = true; + return 0; +} + +void nvmet_disable_port(struct nvmet_port *port) +{ + struct nvmet_fabrics_ops *ops; + + lockdep_assert_held(&nvmet_config_sem); + + port->enabled = false; + + ops = nvmet_transports[port->disc_addr.trtype]; + ops->remove_port(port); + module_put(ops->owner); +} + +static void nvmet_keep_alive_timer(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvmet_ctrl, ka_work); + + pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n", + ctrl->cntlid, ctrl->kato); + + ctrl->ops->delete_ctrl(ctrl); +} + +static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + pr_debug("ctrl %d start keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + + INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} + +static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid); + + cancel_delayed_work_sync(&ctrl->ka_work); +} + +static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl, + __le32 nsid) +{ + struct nvmet_ns *ns; + + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + if (ns->nsid == le32_to_cpu(nsid)) + return ns; + } + + return NULL; +} + +struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid) +{ + struct nvmet_ns *ns; + + rcu_read_lock(); + ns = __nvmet_find_namespace(ctrl, nsid); + if (ns) + percpu_ref_get(&ns->ref); + rcu_read_unlock(); + + return ns; +} + +static void nvmet_destroy_namespace(struct percpu_ref *ref) +{ + struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref); + + complete(&ns->disable_done); +} + +void nvmet_put_namespace(struct nvmet_ns *ns) +{ + percpu_ref_put(&ns->ref); +} + +int nvmet_ns_enable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + int ret = 0; + + mutex_lock(&subsys->lock); + if (!list_empty(&ns->dev_link)) + goto out_unlock; + + ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE, + NULL); + if (IS_ERR(ns->bdev)) { + pr_err("nvmet: failed to open block device %s: (%ld)\n", + ns->device_path, PTR_ERR(ns->bdev)); + ret = PTR_ERR(ns->bdev); + ns->bdev = NULL; + goto out_unlock; + } + + ns->size = i_size_read(ns->bdev->bd_inode); + ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); + + ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, + 0, GFP_KERNEL); + if (ret) + goto out_blkdev_put; + + if (ns->nsid > subsys->max_nsid) + subsys->max_nsid = ns->nsid; + + /* + * The namespaces list needs to be sorted to simplify the implementation + * of the Identify Namepace List subcommand. + */ + if (list_empty(&subsys->namespaces)) { + list_add_tail_rcu(&ns->dev_link, &subsys->namespaces); + } else { + struct nvmet_ns *old; + + list_for_each_entry_rcu(old, &subsys->namespaces, dev_link) { + BUG_ON(ns->nsid == old->nsid); + if (ns->nsid < old->nsid) + break; + } + + list_add_tail_rcu(&ns->dev_link, &old->dev_link); + } + + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); + + ret = 0; +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +out_blkdev_put: + blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); + ns->bdev = NULL; + goto out_unlock; +} + +void nvmet_ns_disable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + if (list_empty(&ns->dev_link)) { + mutex_unlock(&subsys->lock); + return; + } + list_del_init(&ns->dev_link); + mutex_unlock(&subsys->lock); + + /* + * Now that we removed the namespaces from the lookup list, we + * can kill the per_cpu ref and wait for any remaining references + * to be dropped, as well as a RCU grace period for anyone only + * using the namepace under rcu_read_lock(). Note that we can't + * use call_rcu here as we need to ensure the namespaces have + * been fully destroyed before unloading the module. + */ + percpu_ref_kill(&ns->ref); + synchronize_rcu(); + wait_for_completion(&ns->disable_done); + percpu_ref_exit(&ns->ref); + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); + + if (ns->bdev) + blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); + mutex_unlock(&subsys->lock); +} + +void nvmet_ns_free(struct nvmet_ns *ns) +{ + nvmet_ns_disable(ns); + + kfree(ns->device_path); + kfree(ns); +} + +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) +{ + struct nvmet_ns *ns; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return NULL; + + INIT_LIST_HEAD(&ns->dev_link); + init_completion(&ns->disable_done); + + ns->nsid = nsid; + ns->subsys = subsys; + + return ns; +} + +static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + if (status) + nvmet_set_status(req, status); + + /* XXX: need to fill in something useful for sq_head */ + req->rsp->sq_head = 0; + if (likely(req->sq)) /* may happen during early failure */ + req->rsp->sq_id = cpu_to_le16(req->sq->qid); + req->rsp->command_id = req->cmd->common.command_id; + + if (req->ns) + nvmet_put_namespace(req->ns); + req->ops->queue_response(req); +} + +void nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + __nvmet_req_complete(req, status); + percpu_ref_put(&req->sq->ref); +} +EXPORT_SYMBOL_GPL(nvmet_req_complete); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, + u16 qid, u16 size) +{ + cq->qid = qid; + cq->size = size; + + ctrl->cqs[qid] = cq; +} + +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, + u16 qid, u16 size) +{ + sq->qid = qid; + sq->size = size; + + ctrl->sqs[qid] = sq; +} + +void nvmet_sq_destroy(struct nvmet_sq *sq) +{ + /* + * If this is the admin queue, complete all AERs so that our + * queue doesn't have outstanding requests on it. + */ + if (sq->ctrl && sq->ctrl->sqs && sq->ctrl->sqs[0] == sq) + nvmet_async_events_free(sq->ctrl); + percpu_ref_kill(&sq->ref); + wait_for_completion(&sq->free_done); + percpu_ref_exit(&sq->ref); + + if (sq->ctrl) { + nvmet_ctrl_put(sq->ctrl); + sq->ctrl = NULL; /* allows reusing the queue later */ + } +} +EXPORT_SYMBOL_GPL(nvmet_sq_destroy); + +static void nvmet_sq_free(struct percpu_ref *ref) +{ + struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref); + + complete(&sq->free_done); +} + +int nvmet_sq_init(struct nvmet_sq *sq) +{ + int ret; + + ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL); + if (ret) { + pr_err("percpu_ref init failed!\n"); + return ret; + } + init_completion(&sq->free_done); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmet_sq_init); + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops) +{ + u8 flags = req->cmd->common.flags; + u16 status; + + req->cq = cq; + req->sq = sq; + req->ops = ops; + req->sg = NULL; + req->sg_cnt = 0; + req->rsp->status = 0; + + /* no support for fused commands yet */ + if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + /* either variant of SGLs is fine, as we don't support metadata */ + if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF && + (flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METASEG)) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + if (unlikely(!req->sq->ctrl)) + /* will return an error for any Non-connect command: */ + status = nvmet_parse_connect_cmd(req); + else if (likely(req->sq->qid != 0)) + status = nvmet_parse_io_cmd(req); + else if (req->cmd->common.opcode == nvme_fabrics_command) + status = nvmet_parse_fabrics_cmd(req); + else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC) + status = nvmet_parse_discovery_cmd(req); + else + status = nvmet_parse_admin_cmd(req); + + if (status) + goto fail; + + if (unlikely(!percpu_ref_tryget_live(&sq->ref))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + return true; + +fail: + __nvmet_req_complete(req, status); + return false; +} +EXPORT_SYMBOL_GPL(nvmet_req_init); + +static inline bool nvmet_cc_en(u32 cc) +{ + return cc & 0x1; +} + +static inline u8 nvmet_cc_css(u32 cc) +{ + return (cc >> 4) & 0x7; +} + +static inline u8 nvmet_cc_mps(u32 cc) +{ + return (cc >> 7) & 0xf; +} + +static inline u8 nvmet_cc_ams(u32 cc) +{ + return (cc >> 11) & 0x7; +} + +static inline u8 nvmet_cc_shn(u32 cc) +{ + return (cc >> 14) & 0x3; +} + +static inline u8 nvmet_cc_iosqes(u32 cc) +{ + return (cc >> 16) & 0xf; +} + +static inline u8 nvmet_cc_iocqes(u32 cc) +{ + return (cc >> 20) & 0xf; +} + +static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES || + nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES || + nvmet_cc_mps(ctrl->cc) != 0 || + nvmet_cc_ams(ctrl->cc) != 0 || + nvmet_cc_css(ctrl->cc) != 0) { + ctrl->csts = NVME_CSTS_CFS; + return; + } + + ctrl->csts = NVME_CSTS_RDY; +} + +static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + /* XXX: tear down queues? */ + ctrl->csts &= ~NVME_CSTS_RDY; + ctrl->cc = 0; +} + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new) +{ + u32 old; + + mutex_lock(&ctrl->lock); + old = ctrl->cc; + ctrl->cc = new; + + if (nvmet_cc_en(new) && !nvmet_cc_en(old)) + nvmet_start_ctrl(ctrl); + if (!nvmet_cc_en(new) && nvmet_cc_en(old)) + nvmet_clear_ctrl(ctrl); + if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) { + nvmet_clear_ctrl(ctrl); + ctrl->csts |= NVME_CSTS_SHST_CMPLT; + } + if (!nvmet_cc_shn(new) && nvmet_cc_shn(old)) + ctrl->csts &= ~NVME_CSTS_SHST_CMPLT; + mutex_unlock(&ctrl->lock); +} + +static void nvmet_init_cap(struct nvmet_ctrl *ctrl) +{ + /* command sets supported: NVMe command set: */ + ctrl->cap = (1ULL << 37); + /* CC.EN timeout in 500msec units: */ + ctrl->cap |= (15ULL << 24); + /* maximum queue entries supported: */ + ctrl->cap |= NVMET_QUEUE_SIZE - 1; +} + +u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, + struct nvmet_req *req, struct nvmet_ctrl **ret) +{ + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + u16 status = 0; + + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->cntlid == cntlid) { + if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) { + pr_warn("hostnqn mismatch.\n"); + continue; + } + if (!kref_get_unless_zero(&ctrl->ref)) + continue; + + *ret = ctrl; + goto out; + } + } + + pr_warn("could not find controller %d for subsys %s / host %s\n", + cntlid, subsysnqn, hostnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + +out: + mutex_unlock(&subsys->lock); + nvmet_subsys_put(subsys); + return status; +} + +static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, + const char *hostnqn) +{ + struct nvmet_host_link *p; + + if (subsys->allow_any_host) + return true; + + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), hostnqn)) + return true; + } + + return false; +} + +static bool nvmet_host_discovery_allowed(struct nvmet_req *req, + const char *hostnqn) +{ + struct nvmet_subsys_link *s; + + list_for_each_entry(s, &req->port->subsystems, entry) { + if (__nvmet_host_allowed(s->subsys, hostnqn)) + return true; + } + + return false; +} + +bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, + const char *hostnqn) +{ + lockdep_assert_held(&nvmet_config_sem); + + if (subsys->type == NVME_NQN_DISC) + return nvmet_host_discovery_allowed(req, hostnqn); + else + return __nvmet_host_allowed(subsys, hostnqn); +} + +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) +{ + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + int ret; + u16 status; + + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn); + goto out; + } + + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + down_read(&nvmet_config_sem); + if (!nvmet_host_allowed(req, subsys, hostnqn)) { + pr_info("connect by host %s for subsystem %s not allowed\n", + hostnqn, subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(hostnqn); + up_read(&nvmet_config_sem); + goto out_put_subsystem; + } + up_read(&nvmet_config_sem); + + status = NVME_SC_INTERNAL; + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + goto out_put_subsystem; + mutex_init(&ctrl->lock); + + nvmet_init_cap(ctrl); + + INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); + INIT_LIST_HEAD(&ctrl->async_events); + + memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); + memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); + + kref_init(&ctrl->ref); + ctrl->subsys = subsys; + + ctrl->cqs = kcalloc(subsys->max_qid + 1, + sizeof(struct nvmet_cq *), + GFP_KERNEL); + if (!ctrl->cqs) + goto out_free_ctrl; + + ctrl->sqs = kcalloc(subsys->max_qid + 1, + sizeof(struct nvmet_sq *), + GFP_KERNEL); + if (!ctrl->sqs) + goto out_free_cqs; + + ret = ida_simple_get(&subsys->cntlid_ida, + NVME_CNTLID_MIN, NVME_CNTLID_MAX, + GFP_KERNEL); + if (ret < 0) { + status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + goto out_free_sqs; + } + ctrl->cntlid = ret; + + ctrl->ops = req->ops; + if (ctrl->subsys->type == NVME_NQN_DISC) { + /* Don't accept keep-alive timeout for discovery controllers */ + if (kato) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out_free_sqs; + } + + /* + * Discovery controllers use some arbitrary high value in order + * to cleanup stale discovery sessions + * + * From the latest base diff RC: + * "The Keep Alive command is not supported by + * Discovery controllers. A transport may specify a + * fixed Discovery controller activity timeout value + * (e.g., 2 minutes). If no commands are received + * by a Discovery controller within that time + * period, the controller may perform the + * actions for Keep Alive Timer expiration". + */ + ctrl->kato = NVMET_DISC_KATO; + } else { + /* keep-alive timeout in seconds */ + ctrl->kato = DIV_ROUND_UP(kato, 1000); + } + nvmet_start_keep_alive_timer(ctrl); + + mutex_lock(&subsys->lock); + list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); + mutex_unlock(&subsys->lock); + + *ctrlp = ctrl; + return 0; + +out_free_sqs: + kfree(ctrl->sqs); +out_free_cqs: + kfree(ctrl->cqs); +out_free_ctrl: + kfree(ctrl); +out_put_subsystem: + nvmet_subsys_put(subsys); +out: + return status; +} + +static void nvmet_ctrl_free(struct kref *ref) +{ + struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); + struct nvmet_subsys *subsys = ctrl->subsys; + + nvmet_stop_keep_alive_timer(ctrl); + + mutex_lock(&subsys->lock); + list_del(&ctrl->subsys_entry); + mutex_unlock(&subsys->lock); + + ida_simple_remove(&subsys->cntlid_ida, ctrl->cntlid); + nvmet_subsys_put(subsys); + + kfree(ctrl->sqs); + kfree(ctrl->cqs); + kfree(ctrl); +} + +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) +{ + kref_put(&ctrl->ref, nvmet_ctrl_free); +} + +static void nvmet_fatal_error_handler(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, fatal_err_work); + + pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid); + ctrl->ops->delete_ctrl(ctrl); +} + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl) +{ + ctrl->csts |= NVME_CSTS_CFS; + INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); + schedule_work(&ctrl->fatal_err_work); +} +EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error); + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn) +{ + struct nvmet_subsys_link *p; + + if (!port) + return NULL; + + if (!strncmp(NVME_DISC_SUBSYS_NAME, subsysnqn, + NVMF_NQN_SIZE)) { + if (!kref_get_unless_zero(&nvmet_disc_subsys->ref)) + return NULL; + return nvmet_disc_subsys; + } + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (!strncmp(p->subsys->subsysnqn, subsysnqn, + NVMF_NQN_SIZE)) { + if (!kref_get_unless_zero(&p->subsys->ref)) + break; + up_read(&nvmet_config_sem); + return p->subsys; + } + } + up_read(&nvmet_config_sem); + return NULL; +} + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type) +{ + struct nvmet_subsys *subsys; + + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (!subsys) + return NULL; + + subsys->ver = (1 << 16) | (2 << 8) | 1; /* NVMe 1.2.1 */ + + switch (type) { + case NVME_NQN_NVME: + subsys->max_qid = NVMET_NR_QUEUES; + break; + case NVME_NQN_DISC: + subsys->max_qid = 0; + break; + default: + pr_err("%s: Unknown Subsystem type - %d\n", __func__, type); + kfree(subsys); + return NULL; + } + subsys->type = type; + subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE, + GFP_KERNEL); + if (IS_ERR(subsys->subsysnqn)) { + kfree(subsys); + return NULL; + } + + kref_init(&subsys->ref); + + mutex_init(&subsys->lock); + INIT_LIST_HEAD(&subsys->namespaces); + INIT_LIST_HEAD(&subsys->ctrls); + + ida_init(&subsys->cntlid_ida); + + INIT_LIST_HEAD(&subsys->hosts); + + return subsys; +} + +static void nvmet_subsys_free(struct kref *ref) +{ + struct nvmet_subsys *subsys = + container_of(ref, struct nvmet_subsys, ref); + + WARN_ON_ONCE(!list_empty(&subsys->namespaces)); + + ida_destroy(&subsys->cntlid_ida); + kfree(subsys->subsysnqn); + kfree(subsys); +} + +void nvmet_subsys_put(struct nvmet_subsys *subsys) +{ + kref_put(&subsys->ref, nvmet_subsys_free); +} + +static int __init nvmet_init(void) +{ + int error; + + error = nvmet_init_discovery(); + if (error) + goto out; + + error = nvmet_init_configfs(); + if (error) + goto out_exit_discovery; + return 0; + +out_exit_discovery: + nvmet_exit_discovery(); +out: + return error; +} + +static void __exit nvmet_exit(void) +{ + nvmet_exit_configfs(); + nvmet_exit_discovery(); + + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); +} + +module_init(nvmet_init); +module_exit(nvmet_exit); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c new file mode 100644 index 000000000000..6f65646e89cf --- /dev/null +++ b/drivers/nvme/target/discovery.c @@ -0,0 +1,221 @@ +/* + * Discovery service for the NVMe over Fabrics target. + * Copyright (C) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/slab.h> +#include <generated/utsrelease.h> +#include "nvmet.h" + +struct nvmet_subsys *nvmet_disc_subsys; + +u64 nvmet_genctr; + +void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port) +{ + down_write(&nvmet_config_sem); + if (list_empty(&port->entry)) { + list_add_tail(&port->entry, &parent->referrals); + port->enabled = true; + nvmet_genctr++; + } + up_write(&nvmet_config_sem); +} + +void nvmet_referral_disable(struct nvmet_port *port) +{ + down_write(&nvmet_config_sem); + if (!list_empty(&port->entry)) { + port->enabled = false; + list_del_init(&port->entry); + nvmet_genctr++; + } + up_write(&nvmet_config_sem); +} + +static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, + struct nvmet_port *port, char *subsys_nqn, u8 type, u32 numrec) +{ + struct nvmf_disc_rsp_page_entry *e = &hdr->entries[numrec]; + + e->trtype = port->disc_addr.trtype; + e->adrfam = port->disc_addr.adrfam; + e->treq = port->disc_addr.treq; + e->portid = port->disc_addr.portid; + /* we support only dynamic controllers */ + e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); + e->asqsz = cpu_to_le16(NVMF_AQ_DEPTH); + e->nqntype = type; + memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); + memcpy(e->traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); + memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE); + memcpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE); +} + +static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) +{ + const int entry_size = sizeof(struct nvmf_disc_rsp_page_entry); + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmf_disc_rsp_page_hdr *hdr; + size_t data_len = nvmet_get_log_page_len(req->cmd); + size_t alloc_len = max(data_len, sizeof(*hdr)); + int residual_len = data_len - sizeof(*hdr); + struct nvmet_subsys_link *p; + struct nvmet_port *r; + u32 numrec = 0; + u16 status = 0; + + /* + * Make sure we're passing at least a buffer of response header size. + * If host provided data len is less than the header size, only the + * number of bytes requested by host will be sent to host. + */ + hdr = kzalloc(alloc_len, GFP_KERNEL); + if (!hdr) { + status = NVME_SC_INTERNAL; + goto out; + } + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &req->port->subsystems, entry) { + if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn)) + continue; + if (residual_len >= entry_size) { + nvmet_format_discovery_entry(hdr, req->port, + p->subsys->subsysnqn, + NVME_NQN_NVME, numrec); + residual_len -= entry_size; + } + numrec++; + } + + list_for_each_entry(r, &req->port->referrals, entry) { + if (residual_len >= entry_size) { + nvmet_format_discovery_entry(hdr, r, + NVME_DISC_SUBSYS_NAME, + NVME_NQN_DISC, numrec); + residual_len -= entry_size; + } + numrec++; + } + + hdr->genctr = cpu_to_le64(nvmet_genctr); + hdr->numrec = cpu_to_le64(numrec); + hdr->recfmt = cpu_to_le16(0); + + up_read(&nvmet_config_sem); + + status = nvmet_copy_to_sgl(req, 0, hdr, data_len); + kfree(hdr); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl *id; + u16 status = 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + memset(id->fr, ' ', sizeof(id->fr)); + strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr)); + + /* no limit on data transfer sizes for now */ + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + id->lpa = (1 << 2); + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->has_keyed_sgls) + id->sgls |= cpu_to_le32(1 << 2); + if (ctrl->ops->sqe_inline_size) + id->sgls |= cpu_to_le32(1 << 20); + + strcpy(id->subnqn, ctrl->subsys->subsysnqn); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +int nvmet_parse_discovery_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got cmd %d while not ready\n", + cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + req->data_len = nvmet_get_log_page_len(cmd); + + switch (cmd->get_log_page.lid) { + case NVME_LOG_DISC: + req->execute = nvmet_execute_get_disc_log_page; + return 0; + default: + pr_err("nvmet: unsupported get_log_page lid %d\n", + cmd->get_log_page.lid); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + case nvme_admin_identify: + req->data_len = 4096; + switch (le32_to_cpu(cmd->identify.cns)) { + case 0x01: + req->execute = + nvmet_execute_identify_disc_ctrl; + return 0; + default: + pr_err("nvmet: unsupported identify cns %d\n", + le32_to_cpu(cmd->identify.cns)); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + default: + pr_err("nvmet: unsupported cmd %d\n", + cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; +} + +int __init nvmet_init_discovery(void) +{ + nvmet_disc_subsys = + nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC); + if (!nvmet_disc_subsys) + return -ENOMEM; + return 0; +} + +void nvmet_exit_discovery(void) +{ + nvmet_subsys_put(nvmet_disc_subsys); +} diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c new file mode 100644 index 000000000000..9a97ae67e656 --- /dev/null +++ b/drivers/nvme/target/fabrics-cmd.c @@ -0,0 +1,240 @@ +/* + * NVMe Fabrics command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/blkdev.h> +#include "nvmet.h" + +static void nvmet_execute_prop_set(struct nvmet_req *req) +{ + u16 status = 0; + + if (!(req->cmd->prop_set.attrib & 1)) { + u64 val = le64_to_cpu(req->cmd->prop_set.value); + + switch (le32_to_cpu(req->cmd->prop_set.offset)) { + case NVME_REG_CC: + nvmet_update_cc(req->sq->ctrl, val); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } else { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_prop_get(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = 0; + u64 val = 0; + + if (req->cmd->prop_get.attrib & 1) { + switch (le32_to_cpu(req->cmd->prop_get.offset)) { + case NVME_REG_CAP: + val = ctrl->cap; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } else { + switch (le32_to_cpu(req->cmd->prop_get.offset)) { + case NVME_REG_VS: + val = ctrl->subsys->ver; + break; + case NVME_REG_CC: + val = ctrl->cc; + break; + case NVME_REG_CSTS: + val = ctrl->csts; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } + + req->rsp->result64 = cpu_to_le64(val); + nvmet_req_complete(req, status); +} + +int nvmet_parse_fabrics_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + switch (cmd->fabrics.fctype) { + case nvme_fabrics_type_property_set: + req->data_len = 0; + req->execute = nvmet_execute_prop_set; + break; + case nvme_fabrics_type_property_get: + req->data_len = 0; + req->execute = nvmet_execute_prop_get; + break; + default: + pr_err("received unknown capsule type 0x%x\n", + cmd->fabrics.fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + return 0; +} + +static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + u16 qid = le16_to_cpu(c->qid); + u16 sqsize = le16_to_cpu(c->sqsize); + struct nvmet_ctrl *old; + + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); + if (old) { + pr_warn("queue already connected!\n"); + return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + } + + nvmet_cq_setup(ctrl, req->cq, qid, sqsize); + nvmet_sq_setup(ctrl, req->sq, qid, sqsize); + return 0; +} + +static void nvmet_execute_admin_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl = NULL; + u16 status = 0; + + d = kmap(sg_page(req->sg)) + req->sg->offset; + + /* zero out initial completion result, assign values as needed */ + req->rsp->result = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + if (unlikely(d->cntlid != cpu_to_le16(0xffff))) { + pr_warn("connect attempt for invalid controller ID %#x\n", + d->cntlid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid); + goto out; + } + + status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, + le32_to_cpu(c->kato), &ctrl); + if (status) + goto out; + + status = nvmet_install_queue(ctrl, req); + if (status) { + nvmet_ctrl_put(ctrl); + goto out; + } + + pr_info("creating controller %d for NQN %s.\n", + ctrl->cntlid, ctrl->hostnqn); + req->rsp->result16 = cpu_to_le16(ctrl->cntlid); + +out: + kunmap(sg_page(req->sg)); + nvmet_req_complete(req, status); +} + +static void nvmet_execute_io_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl = NULL; + u16 qid = le16_to_cpu(c->qid); + u16 status = 0; + + d = kmap(sg_page(req->sg)) + req->sg->offset; + + /* zero out initial completion result, assign values as needed */ + req->rsp->result = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, + le16_to_cpu(d->cntlid), + req, &ctrl); + if (status) + goto out; + + if (unlikely(qid > ctrl->subsys->max_qid)) { + pr_warn("invalid queue id (%d)\n", qid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->rsp->result = IPO_IATTR_CONNECT_SQE(qid); + goto out_ctrl_put; + } + + status = nvmet_install_queue(ctrl, req); + if (status) { + /* pass back cntlid that had the issue of installing queue */ + req->rsp->result16 = cpu_to_le16(ctrl->cntlid); + goto out_ctrl_put; + } + + pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); + +out: + kunmap(sg_page(req->sg)); + nvmet_req_complete(req, status); + return; + +out_ctrl_put: + nvmet_ctrl_put(ctrl); + goto out; +} + +int nvmet_parse_connect_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (req->cmd->common.opcode != nvme_fabrics_command) { + pr_err("invalid command 0x%x on unconnected queue.\n", + cmd->fabrics.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + if (cmd->fabrics.fctype != nvme_fabrics_type_connect) { + pr_err("invalid capsule type 0x%x on unconnected queue.\n", + cmd->fabrics.fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + req->data_len = sizeof(struct nvmf_connect_data); + if (cmd->connect.qid == 0) + req->execute = nvmet_execute_admin_connect; + else + req->execute = nvmet_execute_io_connect; + return 0; +} diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c new file mode 100644 index 000000000000..2cd069b691ae --- /dev/null +++ b/drivers/nvme/target/io-cmd.c @@ -0,0 +1,215 @@ +/* + * NVMe I/O command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/blkdev.h> +#include <linux/module.h> +#include "nvmet.h" + +static void nvmet_bio_done(struct bio *bio) +{ + struct nvmet_req *req = bio->bi_private; + + nvmet_req_complete(req, + bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + + if (bio != &req->inline_bio) + bio_put(bio); +} + +static inline u32 nvmet_rw_len(struct nvmet_req *req) +{ + return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << + req->ns->blksize_shift; +} + +static void nvmet_inline_bio_init(struct nvmet_req *req) +{ + struct bio *bio = &req->inline_bio; + + bio_init(bio); + bio->bi_max_vecs = NVMET_MAX_INLINE_BIOVEC; + bio->bi_io_vec = req->inline_bvec; +} + +static void nvmet_execute_rw(struct nvmet_req *req) +{ + int sg_cnt = req->sg_cnt; + struct scatterlist *sg; + struct bio *bio; + sector_t sector; + blk_qc_t cookie; + int op, op_flags = 0, i; + + if (!req->sg_cnt) { + nvmet_req_complete(req, 0); + return; + } + + if (req->cmd->rw.opcode == nvme_cmd_write) { + op = REQ_OP_WRITE; + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) + op_flags |= REQ_FUA; + } else { + op = REQ_OP_READ; + } + + sector = le64_to_cpu(req->cmd->rw.slba); + sector <<= (req->ns->blksize_shift - 9); + + nvmet_inline_bio_init(req); + bio = &req->inline_bio; + bio->bi_bdev = req->ns->bdev; + bio->bi_iter.bi_sector = sector; + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio_set_op_attrs(bio, op, op_flags); + + for_each_sg(req->sg, sg, req->sg_cnt, i) { + while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) + != sg->length) { + struct bio *prev = bio; + + bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio->bi_bdev = req->ns->bdev; + bio->bi_iter.bi_sector = sector; + bio_set_op_attrs(bio, op, op_flags); + + bio_chain(bio, prev); + cookie = submit_bio(prev); + } + + sector += sg->length >> 9; + sg_cnt--; + } + + cookie = submit_bio(bio); + + blk_poll(bdev_get_queue(req->ns->bdev), cookie); +} + +static void nvmet_execute_flush(struct nvmet_req *req) +{ + struct bio *bio; + + nvmet_inline_bio_init(req); + bio = &req->inline_bio; + + bio->bi_bdev = req->ns->bdev; + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + + submit_bio(bio); +} + +static u16 nvmet_discard_range(struct nvmet_ns *ns, + struct nvme_dsm_range *range, struct bio **bio) +{ + if (__blkdev_issue_discard(ns->bdev, + le64_to_cpu(range->slba) << (ns->blksize_shift - 9), + le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), + GFP_KERNEL, 0, bio)) + return NVME_SC_INTERNAL | NVME_SC_DNR; + return 0; +} + +static void nvmet_execute_discard(struct nvmet_req *req) +{ + struct nvme_dsm_range range; + struct bio *bio = NULL; + int i; + u16 status; + + for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { + status = nvmet_copy_from_sgl(req, i * sizeof(range), &range, + sizeof(range)); + if (status) + break; + + status = nvmet_discard_range(req->ns, &range, &bio); + if (status) + break; + } + + if (bio) { + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + if (status) { + bio->bi_error = -EIO; + bio_endio(bio); + } else { + submit_bio(bio); + } + } else { + nvmet_req_complete(req, status); + } +} + +static void nvmet_execute_dsm(struct nvmet_req *req) +{ + switch (le32_to_cpu(req->cmd->dsm.attributes)) { + case NVME_DSMGMT_AD: + nvmet_execute_discard(req); + return; + case NVME_DSMGMT_IDR: + case NVME_DSMGMT_IDW: + default: + /* Not supported yet */ + nvmet_req_complete(req, 0); + return; + } +} + +int nvmet_parse_io_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { + pr_err("nvmet: got io cmd %d while CC.EN == 0\n", + cmd->common.opcode); + req->ns = NULL; + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got io cmd %d while CSTS.RDY == 0\n", + cmd->common.opcode); + req->ns = NULL; + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); + if (!req->ns) + return NVME_SC_INVALID_NS | NVME_SC_DNR; + + switch (cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->execute = nvmet_execute_rw; + req->data_len = nvmet_rw_len(req); + return 0; + case nvme_cmd_flush: + req->execute = nvmet_execute_flush; + req->data_len = 0; + return 0; + case nvme_cmd_dsm: + req->execute = nvmet_execute_dsm; + req->data_len = le32_to_cpu(cmd->dsm.nr) * + sizeof(struct nvme_dsm_range); + return 0; + default: + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c new file mode 100644 index 000000000000..94e782987cc9 --- /dev/null +++ b/drivers/nvme/target/loop.c @@ -0,0 +1,754 @@ +/* + * NVMe over Fabrics loopback device. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/scatterlist.h> +#include <linux/delay.h> +#include <linux/blk-mq.h> +#include <linux/nvme.h> +#include <linux/module.h> +#include <linux/parser.h> +#include <linux/t10-pi.h> +#include "nvmet.h" +#include "../host/nvme.h" +#include "../host/fabrics.h" + +#define NVME_LOOP_AQ_DEPTH 256 + +#define NVME_LOOP_MAX_SEGMENTS 256 + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_LOOP_NR_AEN_COMMANDS 1 +#define NVME_LOOP_AQ_BLKMQ_DEPTH \ + (NVME_LOOP_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) + +struct nvme_loop_iod { + struct nvme_command cmd; + struct nvme_completion rsp; + struct nvmet_req req; + struct nvme_loop_queue *queue; + struct work_struct work; + struct sg_table sg_table; + struct scatterlist first_sgl[]; +}; + +struct nvme_loop_ctrl { + spinlock_t lock; + struct nvme_loop_queue *queues; + u32 queue_count; + + struct blk_mq_tag_set admin_tag_set; + + struct list_head list; + u64 cap; + struct blk_mq_tag_set tag_set; + struct nvme_loop_iod async_event_iod; + struct nvme_ctrl ctrl; + + struct nvmet_ctrl *target_ctrl; + struct work_struct delete_work; + struct work_struct reset_work; +}; + +static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_loop_ctrl, ctrl); +} + +struct nvme_loop_queue { + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + struct nvme_loop_ctrl *ctrl; +}; + +static struct nvmet_port *nvmet_loop_port; + +static LIST_HEAD(nvme_loop_ctrl_list); +static DEFINE_MUTEX(nvme_loop_ctrl_mutex); + +static void nvme_loop_queue_response(struct nvmet_req *nvme_req); +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *ctrl); + +static struct nvmet_fabrics_ops nvme_loop_ops; + +static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static void nvme_loop_complete_rq(struct request *req) +{ + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + int error = 0; + + nvme_cleanup_cmd(req); + sg_free_table_chained(&iod->sg_table, true); + + if (unlikely(req->errors)) { + if (nvme_req_needs_retry(req, req->errors)) { + nvme_requeue_req(req); + return; + } + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + error = req->errors; + else + error = nvme_error_status(req->errors); + } + + blk_mq_end_request(req, error); +} + +static void nvme_loop_queue_response(struct nvmet_req *nvme_req) +{ + struct nvme_loop_iod *iod = + container_of(nvme_req, struct nvme_loop_iod, req); + struct nvme_completion *cqe = &iod->rsp; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 && + cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe); + } else { + struct request *req = blk_mq_rq_from_pdu(iod); + + if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special) + memcpy(req->special, cqe, sizeof(*cqe)); + blk_mq_complete_request(req, le16_to_cpu(cqe->status) >> 1); + } +} + +static void nvme_loop_execute_work(struct work_struct *work) +{ + struct nvme_loop_iod *iod = + container_of(work, struct nvme_loop_iod, work); + + iod->req.execute(&iod->req); +} + +static enum blk_eh_timer_return +nvme_loop_timeout(struct request *rq, bool reserved) +{ + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(rq); + + /* queue error recovery */ + schedule_work(&iod->queue->ctrl->reset_work); + + /* fail with DNR on admin cmd timeout */ + rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR; + + return BLK_EH_HANDLED; +} + +static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_loop_queue *queue = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + int ret; + + ret = nvme_setup_cmd(ns, req, &iod->cmd); + if (ret) + return ret; + + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + iod->req.port = nvmet_loop_port; + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, + &queue->nvme_sq, &nvme_loop_ops)) { + nvme_cleanup_cmd(req); + blk_mq_start_request(req); + nvme_loop_queue_response(&iod->req); + return 0; + } + + if (blk_rq_bytes(req)) { + iod->sg_table.sgl = iod->first_sgl; + ret = sg_alloc_table_chained(&iod->sg_table, + req->nr_phys_segments, iod->sg_table.sgl); + if (ret) + return BLK_MQ_RQ_QUEUE_BUSY; + + iod->req.sg = iod->sg_table.sgl; + iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + BUG_ON(iod->req.sg_cnt > req->nr_phys_segments); + } + + iod->cmd.common.command_id = req->tag; + blk_mq_start_request(req); + + schedule_work(&iod->work); + return 0; +} + +static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); + struct nvme_loop_queue *queue = &ctrl->queues[0]; + struct nvme_loop_iod *iod = &ctrl->async_event_iod; + + memset(&iod->cmd, 0, sizeof(iod->cmd)); + iod->cmd.common.opcode = nvme_admin_async_event; + iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH; + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, + &nvme_loop_ops)) { + dev_err(ctrl->ctrl.device, "failed async event work\n"); + return; + } + + schedule_work(&iod->work); +} + +static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl, + struct nvme_loop_iod *iod, unsigned int queue_idx) +{ + BUG_ON(queue_idx >= ctrl->queue_count); + + iod->req.cmd = &iod->cmd; + iod->req.rsp = &iod->rsp; + iod->queue = &ctrl->queues[queue_idx]; + INIT_WORK(&iod->work, nvme_loop_execute_work); + return 0; +} + +static int nvme_loop_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), hctx_idx + 1); +} + +static int nvme_loop_init_admin_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), 0); +} + +static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; + + BUG_ON(hctx_idx >= ctrl->queue_count); + + hctx->driver_data = queue; + return 0; +} + +static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[0]; + + BUG_ON(hctx_idx != 0); + + hctx->driver_data = queue; + return 0; +} + +static struct blk_mq_ops nvme_loop_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_loop_init_request, + .init_hctx = nvme_loop_init_hctx, + .timeout = nvme_loop_timeout, +}; + +static struct blk_mq_ops nvme_loop_admin_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_loop_init_admin_request, + .init_hctx = nvme_loop_init_admin_hctx, + .timeout = nvme_loop_timeout, +}; + +static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); +} + +static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (nctrl->tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + } + kfree(ctrl->queues); + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl); +} + +static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + int error; + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ + ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + + ctrl->queues[0].ctrl = ctrl; + error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); + if (error) + return error; + ctrl->queue_count = 1; + + error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (error) + goto out_free_sq; + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + + error = nvmf_connect_admin_queue(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + if (error) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_cleanup_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize); + + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); + + error = nvme_init_identify(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + nvme_start_keep_alive(&ctrl->ctrl); + + return 0; + +out_cleanup_queue: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_free_sq: + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + return error; +} + +static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) +{ + int i; + + nvme_stop_keep_alive(&ctrl->ctrl); + + if (ctrl->queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + } + + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + nvme_shutdown_ctrl(&ctrl->ctrl); + + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_loop_destroy_admin_queue(ctrl); +} + +static void nvme_loop_del_ctrl_work(struct work_struct *work) +{ + struct nvme_loop_ctrl *ctrl = container_of(work, + struct nvme_loop_ctrl, delete_work); + + nvme_remove_namespaces(&ctrl->ctrl); + nvme_loop_shutdown_ctrl(ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + + if (!schedule_work(&ctrl->delete_work)) + return -EBUSY; + + return 0; +} + +static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + int ret; + + ret = __nvme_loop_del_ctrl(ctrl); + if (ret) + return ret; + + flush_work(&ctrl->delete_work); + + return 0; +} + +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { + if (ctrl->ctrl.cntlid == nctrl->cntlid) + __nvme_loop_del_ctrl(ctrl); + } + mutex_unlock(&nvme_loop_ctrl_mutex); +} + +static void nvme_loop_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_loop_ctrl *ctrl = container_of(work, + struct nvme_loop_ctrl, reset_work); + bool changed; + int i, ret; + + nvme_loop_shutdown_ctrl(ctrl); + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_disable; + + for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) { + ctrl->queues[i].ctrl = ctrl; + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); + if (ret) + goto out_free_queues; + + ctrl->queue_count++; + } + + for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + goto out_free_queues; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + + nvme_start_queues(&ctrl->ctrl); + + return; + +out_free_queues: + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + nvme_loop_destroy_admin_queue(ctrl); +out_disable: + dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); + nvme_remove_namespaces(&ctrl->ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static int nvme_loop_reset_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + + if (!schedule_work(&ctrl->reset_work)) + return -EBUSY; + + flush_work(&ctrl->reset_work); + + return 0; +} + +static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { + .name = "loop", + .module = THIS_MODULE, + .is_fabrics = true, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .reset_ctrl = nvme_loop_reset_ctrl, + .free_ctrl = nvme_loop_free_ctrl, + .submit_async_event = nvme_loop_submit_async_event, + .delete_ctrl = nvme_loop_del_ctrl, + .get_subsysnqn = nvmf_get_subsysnqn, +}; + +static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + int ret, i; + + ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); + if (ret || !opts->nr_io_queues) + return ret; + + dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", + opts->nr_io_queues); + + for (i = 1; i <= opts->nr_io_queues; i++) { + ctrl->queues[i].ctrl = ctrl; + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); + if (ret) + goto out_destroy_queues; + + ctrl->queue_count++; + } + + memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); + ctrl->tag_set.ops = &nvme_loop_mq_ops; + ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize; + ctrl->tag_set.reserved_tags = 1; /* fabric connect */ + ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->tag_set.driver_data = ctrl; + ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.timeout = NVME_IO_TIMEOUT; + ctrl->ctrl.tagset = &ctrl->tag_set; + + ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + if (ret) + goto out_destroy_queues; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tagset; + } + + for (i = 1; i <= opts->nr_io_queues; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + goto out_cleanup_connect_q; + } + + return 0; + +out_cleanup_connect_q: + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->tag_set); +out_destroy_queues: + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + return ret; +} + +static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_loop_ctrl *ctrl; + bool changed; + int ret; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->list); + + INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work); + INIT_WORK(&ctrl->reset_work, nvme_loop_reset_ctrl_work); + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, + 0 /* no quirks, we're perfect! */); + if (ret) + goto out_put_ctrl; + + spin_lock_init(&ctrl->lock); + + ret = -ENOMEM; + + ctrl->ctrl.sqsize = opts->queue_size; + ctrl->ctrl.kato = opts->kato; + + ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) + goto out_uninit_ctrl; + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_free_queues; + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, clamping down\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->nr_io_queues) { + ret = nvme_loop_create_io_queues(ctrl); + if (ret) + goto out_remove_admin_queue; + } + + nvme_loop_init_iod(ctrl, &ctrl->async_event_iod, 0); + + dev_info(ctrl->ctrl.device, + "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); + + kref_get(&ctrl->ctrl.kref); + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_loop_ctrl_list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (opts->nr_io_queues) { + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + } + + return &ctrl->ctrl; + +out_remove_admin_queue: + nvme_loop_destroy_admin_queue(ctrl); +out_free_queues: + kfree(ctrl->queues); +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); +out_put_ctrl: + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +} + +static int nvme_loop_add_port(struct nvmet_port *port) +{ + /* + * XXX: disalow adding more than one port so + * there is no connection rejections when a + * a subsystem is assigned to a port for which + * loop doesn't have a pointer. + * This scenario would be possible if we allowed + * more than one port to be added and a subsystem + * was assigned to a port other than nvmet_loop_port. + */ + + if (nvmet_loop_port) + return -EPERM; + + nvmet_loop_port = port; + return 0; +} + +static void nvme_loop_remove_port(struct nvmet_port *port) +{ + if (port == nvmet_loop_port) + nvmet_loop_port = NULL; +} + +static struct nvmet_fabrics_ops nvme_loop_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_LOOP, + .add_port = nvme_loop_add_port, + .remove_port = nvme_loop_remove_port, + .queue_response = nvme_loop_queue_response, + .delete_ctrl = nvme_loop_delete_ctrl, +}; + +static struct nvmf_transport_ops nvme_loop_transport = { + .name = "loop", + .create_ctrl = nvme_loop_create_ctrl, +}; + +static int __init nvme_loop_init_module(void) +{ + int ret; + + ret = nvmet_register_transport(&nvme_loop_ops); + if (ret) + return ret; + nvmf_register_transport(&nvme_loop_transport); + return 0; +} + +static void __exit nvme_loop_cleanup_module(void) +{ + struct nvme_loop_ctrl *ctrl, *next; + + nvmf_unregister_transport(&nvme_loop_transport); + nvmet_unregister_transport(&nvme_loop_ops); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) + __nvme_loop_del_ctrl(ctrl); + mutex_unlock(&nvme_loop_ctrl_mutex); + + flush_scheduled_work(); +} + +module_init(nvme_loop_init_module); +module_exit(nvme_loop_cleanup_module); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-254"); /* 254 == NVMF_TRTYPE_LOOP */ diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h new file mode 100644 index 000000000000..57dd6d834c28 --- /dev/null +++ b/drivers/nvme/target/nvmet.h @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _NVMET_H +#define _NVMET_H + +#include <linux/dma-mapping.h> +#include <linux/types.h> +#include <linux/device.h> +#include <linux/kref.h> +#include <linux/percpu-refcount.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/nvme.h> +#include <linux/configfs.h> +#include <linux/rcupdate.h> +#include <linux/blkdev.h> + +#define NVMET_ASYNC_EVENTS 4 +#define NVMET_ERROR_LOG_SLOTS 128 + +/* Helper Macros when NVMe error is NVME_SC_CONNECT_INVALID_PARAM + * The 16 bit shift is to set IATTR bit to 1, which means offending + * offset starts in the data section of connect() + */ +#define IPO_IATTR_CONNECT_DATA(x) \ + (cpu_to_le32((1 << 16) | (offsetof(struct nvmf_connect_data, x)))) +#define IPO_IATTR_CONNECT_SQE(x) \ + (cpu_to_le32(offsetof(struct nvmf_connect_command, x))) + +struct nvmet_ns { + struct list_head dev_link; + struct percpu_ref ref; + struct block_device *bdev; + u32 nsid; + u32 blksize_shift; + loff_t size; + u8 nguid[16]; + + struct nvmet_subsys *subsys; + const char *device_path; + + struct config_group device_group; + struct config_group group; + + struct completion disable_done; +}; + +static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_ns, group); +} + +static inline bool nvmet_ns_enabled(struct nvmet_ns *ns) +{ + return !list_empty_careful(&ns->dev_link); +} + +struct nvmet_cq { + u16 qid; + u16 size; +}; + +struct nvmet_sq { + struct nvmet_ctrl *ctrl; + struct percpu_ref ref; + u16 qid; + u16 size; + struct completion free_done; +}; + +/** + * struct nvmet_port - Common structure to keep port + * information for the target. + * @entry: List head for holding a list of these elements. + * @disc_addr: Address information is stored in a format defined + * for a discovery log page entry. + * @group: ConfigFS group for this element's folder. + * @priv: Private data for the transport. + */ +struct nvmet_port { + struct list_head entry; + struct nvmf_disc_rsp_page_entry disc_addr; + struct config_group group; + struct config_group subsys_group; + struct list_head subsystems; + struct config_group referrals_group; + struct list_head referrals; + void *priv; + bool enabled; +}; + +static inline struct nvmet_port *to_nvmet_port(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_port, + group); +} + +struct nvmet_ctrl { + struct nvmet_subsys *subsys; + struct nvmet_cq **cqs; + struct nvmet_sq **sqs; + + struct mutex lock; + u64 cap; + u32 cc; + u32 csts; + + u16 cntlid; + u32 kato; + + struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS]; + unsigned int nr_async_event_cmds; + struct list_head async_events; + struct work_struct async_event_work; + + struct list_head subsys_entry; + struct kref ref; + struct delayed_work ka_work; + struct work_struct fatal_err_work; + + struct nvmet_fabrics_ops *ops; + + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; +}; + +struct nvmet_subsys { + enum nvme_subsys_type type; + + struct mutex lock; + struct kref ref; + + struct list_head namespaces; + unsigned int max_nsid; + + struct list_head ctrls; + struct ida cntlid_ida; + + struct list_head hosts; + bool allow_any_host; + + u16 max_qid; + + u64 ver; + char *subsysnqn; + + struct config_group group; + + struct config_group namespaces_group; + struct config_group allowed_hosts_group; +}; + +static inline struct nvmet_subsys *to_subsys(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, group); +} + +static inline struct nvmet_subsys *namespaces_to_subsys( + struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, + namespaces_group); +} + +struct nvmet_host { + struct config_group group; +}; + +static inline struct nvmet_host *to_host(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_host, group); +} + +static inline char *nvmet_host_name(struct nvmet_host *host) +{ + return config_item_name(&host->group.cg_item); +} + +struct nvmet_host_link { + struct list_head entry; + struct nvmet_host *host; +}; + +struct nvmet_subsys_link { + struct list_head entry; + struct nvmet_subsys *subsys; +}; + +struct nvmet_req; +struct nvmet_fabrics_ops { + struct module *owner; + unsigned int type; + unsigned int sqe_inline_size; + unsigned int msdbd; + bool has_keyed_sgls : 1; + void (*queue_response)(struct nvmet_req *req); + int (*add_port)(struct nvmet_port *port); + void (*remove_port)(struct nvmet_port *port); + void (*delete_ctrl)(struct nvmet_ctrl *ctrl); +}; + +#define NVMET_MAX_INLINE_BIOVEC 8 + +struct nvmet_req { + struct nvme_command *cmd; + struct nvme_completion *rsp; + struct nvmet_sq *sq; + struct nvmet_cq *cq; + struct nvmet_ns *ns; + struct scatterlist *sg; + struct bio inline_bio; + struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; + int sg_cnt; + size_t data_len; + + struct nvmet_port *port; + + void (*execute)(struct nvmet_req *req); + struct nvmet_fabrics_ops *ops; +}; + +static inline void nvmet_set_status(struct nvmet_req *req, u16 status) +{ + req->rsp->status = cpu_to_le16(status << 1); +} + +static inline void nvmet_set_result(struct nvmet_req *req, u32 result) +{ + req->rsp->result = cpu_to_le32(result); +} + +/* + * NVMe command writes actually are DMA reads for us on the target side. + */ +static inline enum dma_data_direction +nvmet_data_dir(struct nvmet_req *req) +{ + return nvme_is_write(req->cmd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +struct nvmet_async_event { + struct list_head entry; + u8 event_type; + u8 event_info; + u8 log_page; +}; + +int nvmet_parse_connect_cmd(struct nvmet_req *req); +int nvmet_parse_io_cmd(struct nvmet_req *req); +int nvmet_parse_admin_cmd(struct nvmet_req *req); +int nvmet_parse_discovery_cmd(struct nvmet_req *req); +int nvmet_parse_fabrics_cmd(struct nvmet_req *req); + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops); +void nvmet_req_complete(struct nvmet_req *req, u16 status); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, + u16 size); +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, + u16 size); +void nvmet_sq_destroy(struct nvmet_sq *sq); +int nvmet_sq_init(struct nvmet_sq *sq); + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp); +u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, + struct nvmet_req *req, struct nvmet_ctrl **ret); +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl); + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type); +void nvmet_subsys_put(struct nvmet_subsys *subsys); + +struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid); +void nvmet_put_namespace(struct nvmet_ns *ns); +int nvmet_ns_enable(struct nvmet_ns *ns); +void nvmet_ns_disable(struct nvmet_ns *ns); +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid); +void nvmet_ns_free(struct nvmet_ns *ns); + +int nvmet_register_transport(struct nvmet_fabrics_ops *ops); +void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops); + +int nvmet_enable_port(struct nvmet_port *port); +void nvmet_disable_port(struct nvmet_port *port); + +void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port); +void nvmet_referral_disable(struct nvmet_port *port); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len); +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, + size_t len); + +u32 nvmet_get_log_page_len(struct nvme_command *cmd); + +#define NVMET_QUEUE_SIZE 1024 +#define NVMET_NR_QUEUES 64 +#define NVMET_MAX_CMD NVMET_QUEUE_SIZE +#define NVMET_KAS 10 +#define NVMET_DISC_KATO 120 + +int __init nvmet_init_configfs(void); +void __exit nvmet_exit_configfs(void); + +int __init nvmet_init_discovery(void); +void nvmet_exit_discovery(void); + +extern struct nvmet_subsys *nvmet_disc_subsys; +extern u64 nvmet_genctr; +extern struct rw_semaphore nvmet_config_sem; + +bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, + const char *hostnqn); + +#endif /* _NVMET_H */ diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c new file mode 100644 index 000000000000..e06d504bdf0c --- /dev/null +++ b/drivers/nvme/target/rdma.c @@ -0,0 +1,1448 @@ +/* + * NVMe over Fabrics RDMA target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/atomic.h> +#include <linux/ctype.h> +#include <linux/delay.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/nvme.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/wait.h> +#include <linux/inet.h> +#include <asm/unaligned.h> + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <rdma/rw.h> + +#include <linux/nvme-rdma.h> +#include "nvmet.h" + +/* + * We allow up to a page of inline data to go with the SQE + */ +#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE + +struct nvmet_rdma_cmd { + struct ib_sge sge[2]; + struct ib_cqe cqe; + struct ib_recv_wr wr; + struct scatterlist inline_sg; + struct page *inline_page; + struct nvme_command *nvme_cmd; + struct nvmet_rdma_queue *queue; +}; + +enum { + NVMET_RDMA_REQ_INLINE_DATA = (1 << 0), + NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1), +}; + +struct nvmet_rdma_rsp { + struct ib_sge send_sge; + struct ib_cqe send_cqe; + struct ib_send_wr send_wr; + + struct nvmet_rdma_cmd *cmd; + struct nvmet_rdma_queue *queue; + + struct ib_cqe read_cqe; + struct rdma_rw_ctx rw; + + struct nvmet_req req; + + u8 n_rdma; + u32 flags; + u32 invalidate_rkey; + + struct list_head wait_list; + struct list_head free_list; +}; + +enum nvmet_rdma_queue_state { + NVMET_RDMA_Q_CONNECTING, + NVMET_RDMA_Q_LIVE, + NVMET_RDMA_Q_DISCONNECTING, +}; + +struct nvmet_rdma_queue { + struct rdma_cm_id *cm_id; + struct nvmet_port *port; + struct ib_cq *cq; + atomic_t sq_wr_avail; + struct nvmet_rdma_device *dev; + spinlock_t state_lock; + enum nvmet_rdma_queue_state state; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + + struct nvmet_rdma_rsp *rsps; + struct list_head free_rsps; + spinlock_t rsps_lock; + struct nvmet_rdma_cmd *cmds; + + struct work_struct release_work; + struct list_head rsp_wait_list; + struct list_head rsp_wr_wait_list; + spinlock_t rsp_wr_wait_lock; + + int idx; + int host_qid; + int recv_queue_size; + int send_queue_size; + + struct list_head queue_list; +}; + +struct nvmet_rdma_device { + struct ib_device *device; + struct ib_pd *pd; + struct ib_srq *srq; + struct nvmet_rdma_cmd *srq_cmds; + size_t srq_size; + struct kref ref; + struct list_head entry; +}; + +static bool nvmet_rdma_use_srq; +module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444); +MODULE_PARM_DESC(use_srq, "Use shared receive queue."); + +static DEFINE_IDA(nvmet_rdma_queue_ida); +static LIST_HEAD(nvmet_rdma_queue_list); +static DEFINE_MUTEX(nvmet_rdma_queue_mutex); + +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_list_mutex); + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp); +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv); +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); + +static struct nvmet_fabrics_ops nvmet_rdma_ops; + +/* XXX: really should move to a generic header sooner or later.. */ +static inline u32 get_unaligned_le24(const u8 *p) +{ + return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16; +} + +static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) +{ + return nvme_is_write(rsp->req.cmd) && + rsp->req.data_len && + !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) +{ + return !nvme_is_write(rsp->req.cmd) && + rsp->req.data_len && + !rsp->req.rsp->status && + !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline struct nvmet_rdma_rsp * +nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_rsp *rsp; + unsigned long flags; + + spin_lock_irqsave(&queue->rsps_lock, flags); + rsp = list_first_entry(&queue->free_rsps, + struct nvmet_rdma_rsp, free_list); + list_del(&rsp->free_list); + spin_unlock_irqrestore(&queue->rsps_lock, flags); + + return rsp; +} + +static inline void +nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) +{ + unsigned long flags; + + spin_lock_irqsave(&rsp->queue->rsps_lock, flags); + list_add_tail(&rsp->free_list, &rsp->queue->free_rsps); + spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); +} + +static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents) +{ + struct scatterlist *sg; + int count; + + if (!sgl || !nents) + return; + + for_each_sg(sgl, sg, nents, count) + __free_page(sg_page(sg)); + kfree(sgl); +} + +static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, + u32 length) +{ + struct scatterlist *sg; + struct page *page; + unsigned int nent; + int i = 0; + + nent = DIV_ROUND_UP(length, PAGE_SIZE); + sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); + if (!sg) + goto out; + + sg_init_table(sg, nent); + + while (length) { + u32 page_len = min_t(u32, length, PAGE_SIZE); + + page = alloc_page(GFP_KERNEL); + if (!page) + goto out_free_pages; + + sg_set_page(&sg[i], page, page_len, 0); + length -= page_len; + i++; + } + *sgl = sg; + *nents = nent; + return 0; + +out_free_pages: + while (i > 0) { + i--; + __free_page(sg_page(&sg[i])); + } + kfree(sg); +out: + return NVME_SC_INTERNAL; +} + +static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + /* NVMe command / RDMA RECV */ + c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL); + if (!c->nvme_cmd) + goto out; + + c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, c->sge[0].addr)) + goto out_free_cmd; + + c->sge[0].length = sizeof(*c->nvme_cmd); + c->sge[0].lkey = ndev->pd->local_dma_lkey; + + if (!admin) { + c->inline_page = alloc_pages(GFP_KERNEL, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + if (!c->inline_page) + goto out_unmap_cmd; + c->sge[1].addr = ib_dma_map_page(ndev->device, + c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, c->sge[1].addr)) + goto out_free_inline_page; + c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE; + c->sge[1].lkey = ndev->pd->local_dma_lkey; + } + + c->cqe.done = nvmet_rdma_recv_done; + + c->wr.wr_cqe = &c->cqe; + c->wr.sg_list = c->sge; + c->wr.num_sge = admin ? 1 : 2; + + return 0; + +out_free_inline_page: + if (!admin) { + __free_pages(c->inline_page, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + } +out_unmap_cmd: + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); +out_free_cmd: + kfree(c->nvme_cmd); + +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + if (!admin) { + ib_dma_unmap_page(ndev->device, c->sge[1].addr, + NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE); + __free_pages(c->inline_page, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + } + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + kfree(c->nvme_cmd); +} + +static struct nvmet_rdma_cmd * +nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev, + int nr_cmds, bool admin) +{ + struct nvmet_rdma_cmd *cmds; + int ret = -EINVAL, i; + + cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL); + if (!cmds) + goto out; + + for (i = 0; i < nr_cmds; i++) { + ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin); + if (ret) + goto out_free; + } + + return cmds; + +out_free: + while (--i >= 0) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +out: + return ERR_PTR(ret); +} + +static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin) +{ + int i; + + for (i = 0; i < nr_cmds; i++) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +} + +static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + /* NVMe CQE / RDMA SEND */ + r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL); + if (!r->req.rsp) + goto out; + + r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp, + sizeof(*r->req.rsp), DMA_TO_DEVICE); + if (ib_dma_mapping_error(ndev->device, r->send_sge.addr)) + goto out_free_rsp; + + r->send_sge.length = sizeof(*r->req.rsp); + r->send_sge.lkey = ndev->pd->local_dma_lkey; + + r->send_cqe.done = nvmet_rdma_send_done; + + r->send_wr.wr_cqe = &r->send_cqe; + r->send_wr.sg_list = &r->send_sge; + r->send_wr.num_sge = 1; + r->send_wr.send_flags = IB_SEND_SIGNALED; + + /* Data In / RDMA READ */ + r->read_cqe.done = nvmet_rdma_read_data_done; + return 0; + +out_free_rsp: + kfree(r->req.rsp); +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + ib_dma_unmap_single(ndev->device, r->send_sge.addr, + sizeof(*r->req.rsp), DMA_TO_DEVICE); + kfree(r->req.rsp); +} + +static int +nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int nr_rsps = queue->recv_queue_size * 2; + int ret = -EINVAL, i; + + queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp), + GFP_KERNEL); + if (!queue->rsps) + goto out; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + ret = nvmet_rdma_alloc_rsp(ndev, rsp); + if (ret) + goto out_free; + + list_add_tail(&rsp->free_list, &queue->free_rsps); + } + + return 0; + +out_free: + while (--i >= 0) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + kfree(queue->rsps); +out: + return ret; +} + +static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int i, nr_rsps = queue->recv_queue_size * 2; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + kfree(queue->rsps); +} + +static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmd) +{ + struct ib_recv_wr *bad_wr; + + if (ndev->srq) + return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); + return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); +} + +static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) +{ + spin_lock(&queue->rsp_wr_wait_lock); + while (!list_empty(&queue->rsp_wr_wait_list)) { + struct nvmet_rdma_rsp *rsp; + bool ret; + + rsp = list_entry(queue->rsp_wr_wait_list.next, + struct nvmet_rdma_rsp, wait_list); + list_del(&rsp->wait_list); + + spin_unlock(&queue->rsp_wr_wait_lock); + ret = nvmet_rdma_execute_command(rsp); + spin_lock(&queue->rsp_wr_wait_lock); + + if (!ret) { + list_add(&rsp->wait_list, &queue->rsp_wr_wait_list); + break; + } + } + spin_unlock(&queue->rsp_wr_wait_lock); +} + + +static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + + if (rsp->n_rdma) { + rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, rsp->req.sg, + rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); + } + + if (rsp->req.sg != &rsp->cmd->inline_sg) + nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt); + + if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) + nvmet_rdma_process_wr_wait_list(queue); + + nvmet_rdma_put_rsp(rsp); +} + +static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue) +{ + if (queue->nvme_sq.ctrl) { + nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); + } else { + /* + * we didn't setup the controller yet in case + * of admin connect error, just disconnect and + * cleanup the queue + */ + nvmet_rdma_queue_disconnect(queue); + } +} + +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); + + nvmet_rdma_release_rsp(rsp); + + if (unlikely(wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR)) { + pr_err("SEND for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(rsp->queue); + } +} + +static void nvmet_rdma_queue_response(struct nvmet_req *req) +{ + struct nvmet_rdma_rsp *rsp = + container_of(req, struct nvmet_rdma_rsp, req); + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct ib_send_wr *first_wr, *bad_wr; + + if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) { + rsp->send_wr.opcode = IB_WR_SEND_WITH_INV; + rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey; + } else { + rsp->send_wr.opcode = IB_WR_SEND; + } + + if (nvmet_rdma_need_data_out(rsp)) + first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, + cm_id->port_num, NULL, &rsp->send_wr); + else + first_wr = &rsp->send_wr; + + nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd); + if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) { + pr_err("sending cmd response failed\n"); + nvmet_rdma_release_rsp(rsp); + } +} + +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; + + WARN_ON(rsp->n_rdma <= 0); + atomic_add(rsp->n_rdma, &queue->sq_wr_avail); + rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, rsp->req.sg, + rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); + rsp->n_rdma = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvmet_rdma_release_rsp(rsp); + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + rsp->req.execute(&rsp->req); +} + +static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, + u64 off) +{ + sg_init_table(&rsp->cmd->inline_sg, 1); + sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off); + rsp->req.sg = &rsp->cmd->inline_sg; + rsp->req.sg_cnt = 1; +} + +static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl; + u64 off = le64_to_cpu(sgl->addr); + u32 len = le32_to_cpu(sgl->length); + + if (!nvme_is_write(rsp->req.cmd)) + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + + if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) { + pr_err("invalid inline data offset!\n"); + return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + } + + /* no data command? */ + if (!len) + return 0; + + nvmet_rdma_use_inline_sg(rsp, len, off); + rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; + return 0; +} + +static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, + struct nvme_keyed_sgl_desc *sgl, bool invalidate) +{ + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + u64 addr = le64_to_cpu(sgl->addr); + u32 len = get_unaligned_le24(sgl->length); + u32 key = get_unaligned_le32(sgl->key); + int ret; + u16 status; + + /* no data command? */ + if (!len) + return 0; + + /* use the already allocated data buffer if possible */ + if (len <= NVMET_RDMA_INLINE_DATA_SIZE && rsp->queue->host_qid) { + nvmet_rdma_use_inline_sg(rsp, len, 0); + } else { + status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, + len); + if (status) + return status; + } + + ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, + rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, + nvmet_data_dir(&rsp->req)); + if (ret < 0) + return NVME_SC_INTERNAL; + rsp->n_rdma += ret; + + if (invalidate) { + rsp->invalidate_rkey = key; + rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY; + } + + return 0; +} + +static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl; + + switch (sgl->type >> 4) { + case NVME_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_OFFSET: + return nvmet_rdma_map_sgl_inline(rsp); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + case NVME_KEY_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, true); + case NVME_SGL_FMT_ADDRESS: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + default: + pr_err("invalid SGL type: %#x\n", sgl->type); + return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; + } +} + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + if (unlikely(atomic_sub_return(1 + rsp->n_rdma, + &queue->sq_wr_avail) < 0)) { + pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n", + 1 + rsp->n_rdma, queue->idx, + queue->nvme_sq.ctrl->cntlid); + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + return false; + } + + if (nvmet_rdma_need_data_in(rsp)) { + if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, &rsp->read_cqe, NULL)) + nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); + } else { + rsp->req.execute(&rsp->req); + } + + return true; +} + +static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, + struct nvmet_rdma_rsp *cmd) +{ + u16 status; + + cmd->queue = queue; + cmd->n_rdma = 0; + cmd->req.port = queue->port; + + if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, + &queue->nvme_sq, &nvmet_rdma_ops)) + return; + + status = nvmet_rdma_map_sgl(cmd); + if (status) + goto out_err; + + if (unlikely(!nvmet_rdma_execute_command(cmd))) { + spin_lock(&queue->rsp_wr_wait_lock); + list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list); + spin_unlock(&queue->rsp_wr_wait_lock); + } + + return; + +out_err: + nvmet_req_complete(&cmd->req, status); +} + +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_cmd *cmd = + container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_rsp *rsp; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_err("RECV for CQE 0x%p failed with status %s (%d)\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), + wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + if (unlikely(wc->byte_len < sizeof(struct nvme_command))) { + pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n"); + nvmet_rdma_error_comp(queue); + return; + } + + cmd->queue = queue; + rsp = nvmet_rdma_get_rsp(queue); + rsp->cmd = cmd; + rsp->flags = 0; + rsp->req.cmd = cmd->nvme_cmd; + + if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state == NVMET_RDMA_Q_CONNECTING) + list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); + else + nvmet_rdma_put_rsp(rsp); + spin_unlock_irqrestore(&queue->state_lock, flags); + return; + } + + nvmet_rdma_handle_command(queue, rsp); +} + +static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev) +{ + if (!ndev->srq) + return; + + nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); + ib_destroy_srq(ndev->srq); +} + +static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) +{ + struct ib_srq_init_attr srq_attr = { NULL, }; + struct ib_srq *srq; + size_t srq_size; + int ret, i; + + srq_size = 4095; /* XXX: tune */ + + srq_attr.attr.max_wr = srq_size; + srq_attr.attr.max_sge = 2; + srq_attr.attr.srq_limit = 0; + srq_attr.srq_type = IB_SRQT_BASIC; + srq = ib_create_srq(ndev->pd, &srq_attr); + if (IS_ERR(srq)) { + /* + * If SRQs aren't supported we just go ahead and use normal + * non-shared receive queues. + */ + pr_info("SRQ requested but not supported.\n"); + return 0; + } + + ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); + if (IS_ERR(ndev->srq_cmds)) { + ret = PTR_ERR(ndev->srq_cmds); + goto out_destroy_srq; + } + + ndev->srq = srq; + ndev->srq_size = srq_size; + + for (i = 0; i < srq_size; i++) + nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + + return 0; + +out_destroy_srq: + ib_destroy_srq(srq); + return ret; +} + +static void nvmet_rdma_free_dev(struct kref *ref) +{ + struct nvmet_rdma_device *ndev = + container_of(ref, struct nvmet_rdma_device, ref); + + mutex_lock(&device_list_mutex); + list_del(&ndev->entry); + mutex_unlock(&device_list_mutex); + + nvmet_rdma_destroy_srq(ndev); + ib_dealloc_pd(ndev->pd); + + kfree(ndev); +} + +static struct nvmet_rdma_device * +nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) +{ + struct nvmet_rdma_device *ndev; + int ret; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->device->node_guid == cm_id->device->node_guid && + kref_get_unless_zero(&ndev->ref)) + goto out_unlock; + } + + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); + if (!ndev) + goto out_err; + + ndev->device = cm_id->device; + kref_init(&ndev->ref); + + ndev->pd = ib_alloc_pd(ndev->device); + if (IS_ERR(ndev->pd)) + goto out_free_dev; + + if (nvmet_rdma_use_srq) { + ret = nvmet_rdma_init_srq(ndev); + if (ret) + goto out_free_pd; + } + + list_add(&ndev->entry, &device_list); +out_unlock: + mutex_unlock(&device_list_mutex); + pr_debug("added %s.\n", ndev->device->name); + return ndev; + +out_free_pd: + ib_dealloc_pd(ndev->pd); +out_free_dev: + kfree(ndev); +out_err: + mutex_unlock(&device_list_mutex); + return NULL; +} + +static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) +{ + struct ib_qp_init_attr qp_attr; + struct nvmet_rdma_device *ndev = queue->dev; + int comp_vector, nr_cqe, ret, i; + + /* + * Spread the io queues across completion vectors, + * but still keep all admin queues on vector 0. + */ + comp_vector = !queue->host_qid ? 0 : + queue->idx % ndev->device->num_comp_vectors; + + /* + * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND. + */ + nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size; + + queue->cq = ib_alloc_cq(ndev->device, queue, + nr_cqe + 1, comp_vector, + IB_POLL_WORKQUEUE); + if (IS_ERR(queue->cq)) { + ret = PTR_ERR(queue->cq); + pr_err("failed to create CQ cqe= %d ret= %d\n", + nr_cqe + 1, ret); + goto out; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_context = queue; + qp_attr.event_handler = nvmet_rdma_qp_event; + qp_attr.send_cq = queue->cq; + qp_attr.recv_cq = queue->cq; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + /* +1 for drain */ + qp_attr.cap.max_send_wr = queue->send_queue_size + 1; + qp_attr.cap.max_rdma_ctxs = queue->send_queue_size; + qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, + ndev->device->attrs.max_sge); + + if (ndev->srq) { + qp_attr.srq = ndev->srq; + } else { + /* +1 for drain */ + qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; + qp_attr.cap.max_recv_sge = 2; + } + + ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); + if (ret) { + pr_err("failed to create_qp ret= %d\n", ret); + goto err_destroy_cq; + } + + atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); + + pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", + __func__, queue->cq->cqe, qp_attr.cap.max_send_sge, + qp_attr.cap.max_send_wr, queue->cm_id); + + if (!ndev->srq) { + for (i = 0; i < queue->recv_queue_size; i++) { + queue->cmds[i].queue = queue; + nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + } + } + +out: + return ret; + +err_destroy_cq: + ib_free_cq(queue->cq); + goto out; +} + +static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) +{ + rdma_destroy_qp(queue->cm_id); + ib_free_cq(queue->cq); +} + +static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) +{ + pr_info("freeing queue %d\n", queue->idx); + + nvmet_sq_destroy(&queue->nvme_sq); + + nvmet_rdma_destroy_queue_ib(queue); + if (!queue->dev->srq) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } + nvmet_rdma_free_rsps(queue); + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + kfree(queue); +} + +static void nvmet_rdma_release_queue_work(struct work_struct *w) +{ + struct nvmet_rdma_queue *queue = + container_of(w, struct nvmet_rdma_queue, release_work); + struct rdma_cm_id *cm_id = queue->cm_id; + struct nvmet_rdma_device *dev = queue->dev; + + nvmet_rdma_free_queue(queue); + rdma_destroy_id(cm_id); + kref_put(&dev->ref, nvmet_rdma_free_dev); +} + +static int +nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn, + struct nvmet_rdma_queue *queue) +{ + struct nvme_rdma_cm_req *req; + + req = (struct nvme_rdma_cm_req *)conn->private_data; + if (!req || conn->private_data_len == 0) + return NVME_RDMA_CM_INVALID_LEN; + + if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0) + return NVME_RDMA_CM_INVALID_RECFMT; + + queue->host_qid = le16_to_cpu(req->qid); + + /* + * req->hsqsize corresponds to our recv queue size + * req->hrqsize corresponds to our send queue size + */ + queue->recv_queue_size = le16_to_cpu(req->hsqsize); + queue->send_queue_size = le16_to_cpu(req->hrqsize); + + if (!queue->host_qid && queue->recv_queue_size > NVMF_AQ_DEPTH) + return NVME_RDMA_CM_INVALID_HSQSIZE; + + /* XXX: Should we enforce some kind of max for IO queues? */ + + return 0; +} + +static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id, + enum nvme_rdma_cm_status status) +{ + struct nvme_rdma_cm_rej rej; + + rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + rej.sts = cpu_to_le16(status); + + return rdma_reject(cm_id, (void *)&rej, sizeof(rej)); +} + +static struct nvmet_rdma_queue * +nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, + struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_queue *queue; + int ret; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_reject; + } + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) + goto out_free_queue; + + ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue); + if (ret) + goto out_destroy_sq; + + /* + * Schedules the actual release because calling rdma_destroy_id from + * inside a CM callback would trigger a deadlock. (great API design..) + */ + INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work); + queue->dev = ndev; + queue->cm_id = cm_id; + + spin_lock_init(&queue->state_lock); + queue->state = NVMET_RDMA_Q_CONNECTING; + INIT_LIST_HEAD(&queue->rsp_wait_list); + INIT_LIST_HEAD(&queue->rsp_wr_wait_list); + spin_lock_init(&queue->rsp_wr_wait_lock); + INIT_LIST_HEAD(&queue->free_rsps); + spin_lock_init(&queue->rsps_lock); + + queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL); + if (queue->idx < 0) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_queue; + } + + ret = nvmet_rdma_alloc_rsps(queue); + if (ret) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_ida_remove; + } + + if (!ndev->srq) { + queue->cmds = nvmet_rdma_alloc_cmds(ndev, + queue->recv_queue_size, + !queue->host_qid); + if (IS_ERR(queue->cmds)) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_responses; + } + } + + ret = nvmet_rdma_create_queue_ib(queue); + if (ret) { + pr_err("%s: creating RDMA queue failed (%d).\n", + __func__, ret); + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_cmds; + } + + return queue; + +out_free_cmds: + if (!ndev->srq) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } +out_free_responses: + nvmet_rdma_free_rsps(queue); +out_ida_remove: + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); +out_destroy_sq: + nvmet_sq_destroy(&queue->nvme_sq); +out_free_queue: + kfree(queue); +out_reject: + nvmet_rdma_cm_reject(cm_id, ret); + return NULL; +} + +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv) +{ + struct nvmet_rdma_queue *queue = priv; + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(queue->cm_id, event->event); + break; + default: + pr_err("received unrecognized IB QP event %d\n", event->event); + break; + } +} + +static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue, + struct rdma_conn_param *p) +{ + struct rdma_conn_param param = { }; + struct nvme_rdma_cm_rep priv = { }; + int ret = -ENOMEM; + + param.rnr_retry_count = 7; + param.flow_control = 1; + param.initiator_depth = min_t(u8, p->initiator_depth, + queue->dev->device->attrs.max_qp_init_rd_atom); + param.private_data = &priv; + param.private_data_len = sizeof(priv); + priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + priv.crqsize = cpu_to_le16(queue->recv_queue_size); + + ret = rdma_accept(cm_id, ¶m); + if (ret) + pr_err("rdma_accept failed (error code = %d)\n", ret); + + return ret; +} + +static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_device *ndev; + struct nvmet_rdma_queue *queue; + int ret = -EINVAL; + + ndev = nvmet_rdma_find_get_device(cm_id); + if (!ndev) { + pr_err("no client data!\n"); + nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC); + return -ECONNREFUSED; + } + + queue = nvmet_rdma_alloc_queue(ndev, cm_id, event); + if (!queue) { + ret = -ENOMEM; + goto put_device; + } + queue->port = cm_id->context; + + ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); + if (ret) + goto release_queue; + + mutex_lock(&nvmet_rdma_queue_mutex); + list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + return 0; + +release_queue: + nvmet_rdma_free_queue(queue); +put_device: + kref_put(&ndev->ref, nvmet_rdma_free_dev); + + return ret; +} + +static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue) +{ + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state != NVMET_RDMA_Q_CONNECTING) { + pr_warn("trying to establish a connected queue\n"); + goto out_unlock; + } + queue->state = NVMET_RDMA_Q_LIVE; + + while (!list_empty(&queue->rsp_wait_list)) { + struct nvmet_rdma_rsp *cmd; + + cmd = list_first_entry(&queue->rsp_wait_list, + struct nvmet_rdma_rsp, wait_list); + list_del(&cmd->wait_list); + + spin_unlock_irqrestore(&queue->state_lock, flags); + nvmet_rdma_handle_command(queue, cmd); + spin_lock_irqsave(&queue->state_lock, flags); + } + +out_unlock: + spin_unlock_irqrestore(&queue->state_lock, flags); +} + +static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + unsigned long flags; + + pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state); + + spin_lock_irqsave(&queue->state_lock, flags); + switch (queue->state) { + case NVMET_RDMA_Q_CONNECTING: + case NVMET_RDMA_Q_LIVE: + disconnect = true; + queue->state = NVMET_RDMA_Q_DISCONNECTING; + break; + case NVMET_RDMA_Q_DISCONNECTING: + break; + } + spin_unlock_irqrestore(&queue->state_lock, flags); + + if (disconnect) { + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->cm_id->qp); + schedule_work(&queue->release_work); + } +} + +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + + mutex_lock(&nvmet_rdma_queue_mutex); + if (!list_empty(&queue->queue_list)) { + list_del_init(&queue->queue_list); + disconnect = true; + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + if (disconnect) + __nvmet_rdma_queue_disconnect(queue); +} + +static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue) +{ + WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING); + + pr_err("failed to connect queue\n"); + schedule_work(&queue->release_work); +} + +static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_queue *queue = NULL; + int ret = 0; + + if (cm_id->qp) + queue = cm_id->qp->qp_context; + + pr_debug("%s (%d): status %d id %p\n", + rdma_event_msg(event->event), event->event, + event->status, cm_id); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = nvmet_rdma_queue_connect(cm_id, event); + break; + case RDMA_CM_EVENT_ESTABLISHED: + nvmet_rdma_queue_established(queue); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + /* + * We can get the device removal callback even for a + * CM ID that we aren't actually using. In that case + * the context pointer is NULL, so we shouldn't try + * to disconnect a non-existing queue. But we also + * need to return 1 so that the core will destroy + * it's own ID. What a great API design.. + */ + if (queue) + nvmet_rdma_queue_disconnect(queue); + else + ret = 1; + break; + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_CONNECT_ERROR: + nvmet_rdma_queue_connect_fail(cm_id, queue); + break; + default: + pr_err("received unrecognized RDMA CM event %d\n", + event->event); + break; + } + + return ret; +} + +static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_rdma_queue *queue; + +restart: + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + if (queue->nvme_sq.ctrl == ctrl) { + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + __nvmet_rdma_queue_disconnect(queue); + goto restart; + } + } + mutex_unlock(&nvmet_rdma_queue_mutex); +} + +static int nvmet_rdma_add_port(struct nvmet_port *port) +{ + struct rdma_cm_id *cm_id; + struct sockaddr_in addr_in; + u16 port_in; + int ret; + + switch (port->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + break; + default: + pr_err("address family %d not supported\n", + port->disc_addr.adrfam); + return -EINVAL; + } + + ret = kstrtou16(port->disc_addr.trsvcid, 0, &port_in); + if (ret) + return ret; + + addr_in.sin_family = AF_INET; + addr_in.sin_addr.s_addr = in_aton(port->disc_addr.traddr); + addr_in.sin_port = htons(port_in); + + cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) { + pr_err("CM ID creation failed\n"); + return PTR_ERR(cm_id); + } + + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr_in); + if (ret) { + pr_err("binding CM ID to %pISpc failed (%d)\n", &addr_in, ret); + goto out_destroy_id; + } + + ret = rdma_listen(cm_id, 128); + if (ret) { + pr_err("listening to %pISpc failed (%d)\n", &addr_in, ret); + goto out_destroy_id; + } + + pr_info("enabling port %d (%pISpc)\n", + le16_to_cpu(port->disc_addr.portid), &addr_in); + port->priv = cm_id; + return 0; + +out_destroy_id: + rdma_destroy_id(cm_id); + return ret; +} + +static void nvmet_rdma_remove_port(struct nvmet_port *port) +{ + struct rdma_cm_id *cm_id = port->priv; + + rdma_destroy_id(cm_id); +} + +static struct nvmet_fabrics_ops nvmet_rdma_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_RDMA, + .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE, + .msdbd = 1, + .has_keyed_sgls = 1, + .add_port = nvmet_rdma_add_port, + .remove_port = nvmet_rdma_remove_port, + .queue_response = nvmet_rdma_queue_response, + .delete_ctrl = nvmet_rdma_delete_ctrl, +}; + +static int __init nvmet_rdma_init(void) +{ + return nvmet_register_transport(&nvmet_rdma_ops); +} + +static void __exit nvmet_rdma_exit(void) +{ + struct nvmet_rdma_queue *queue; + + nvmet_unregister_transport(&nvmet_rdma_ops); + + flush_scheduled_work(); + + mutex_lock(&nvmet_rdma_queue_mutex); + while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list, + struct nvmet_rdma_queue, queue_list))) { + list_del_init(&queue->queue_list); + + mutex_unlock(&nvmet_rdma_queue_mutex); + __nvmet_rdma_queue_disconnect(queue); + mutex_lock(&nvmet_rdma_queue_mutex); + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + flush_scheduled_work(); + ida_destroy(&nvmet_rdma_queue_ida); +} + +module_init(nvmet_rdma_init); +module_exit(nvmet_rdma_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2498fdf3a503..e43bbffb5b7a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -96,6 +96,7 @@ typedef int (init_request_fn)(void *, struct request *, unsigned int, unsigned int, unsigned int); typedef void (exit_request_fn)(void *, struct request *, unsigned int, unsigned int); +typedef int (reinit_request_fn)(void *, struct request *); typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); @@ -145,6 +146,7 @@ struct blk_mq_ops { */ init_request_fn *init_request; exit_request_fn *exit_request; + reinit_request_fn *reinit_request; }; enum { @@ -196,6 +198,8 @@ enum { struct request *blk_mq_alloc_request(struct request_queue *q, int rw, unsigned int flags); +struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op, + unsigned int flags, unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags); @@ -243,6 +247,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_mq_freeze_queue_start(struct request_queue *q); +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ef2c7d2e76c4..ba78b8306674 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -1,7 +1,9 @@ #ifndef NVM_H #define NVM_H +#include <linux/blkdev.h> #include <linux/types.h> +#include <uapi/linux/lightnvm.h> enum { NVM_IO_OK = 0, @@ -269,24 +271,15 @@ struct nvm_lun { int lun_id; int chnl_id; - /* It is up to the target to mark blocks as closed. If the target does - * not do it, all blocks are marked as open, and nr_open_blocks - * represents the number of blocks in use - */ - unsigned int nr_open_blocks; /* Number of used, writable blocks */ - unsigned int nr_closed_blocks; /* Number of used, read-only blocks */ - unsigned int nr_free_blocks; /* Number of unused blocks */ - unsigned int nr_bad_blocks; /* Number of bad blocks */ - spinlock_t lock; + unsigned int nr_free_blocks; /* Number of unused blocks */ struct nvm_block *blocks; }; enum { NVM_BLK_ST_FREE = 0x1, /* Free block */ - NVM_BLK_ST_OPEN = 0x2, /* Open block - read-write */ - NVM_BLK_ST_CLOSED = 0x4, /* Closed block - read-only */ + NVM_BLK_ST_TGT = 0x2, /* Block in use by target */ NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; @@ -385,6 +378,7 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, { struct ppa_addr l; + l.ppa = 0; /* * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc. */ @@ -455,6 +449,8 @@ struct nvm_tgt_type { struct list_head list; }; +extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); + extern int nvm_register_tgt_type(struct nvm_tgt_type *); extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); @@ -463,6 +459,9 @@ extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t); typedef int (nvmm_register_fn)(struct nvm_dev *); typedef void (nvmm_unregister_fn)(struct nvm_dev *); + +typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *); +typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); typedef struct nvm_block *(nvmm_get_blk_fn)(struct nvm_dev *, struct nvm_lun *, unsigned long); typedef void (nvmm_put_blk_fn)(struct nvm_dev *, struct nvm_block *); @@ -488,9 +487,10 @@ struct nvmm_type { nvmm_register_fn *register_mgr; nvmm_unregister_fn *unregister_mgr; + nvmm_create_tgt_fn *create_tgt; + nvmm_remove_tgt_fn *remove_tgt; + /* Block administration callbacks */ - nvmm_get_blk_fn *get_blk_unlocked; - nvmm_put_blk_fn *put_blk_unlocked; nvmm_get_blk_fn *get_blk; nvmm_put_blk_fn *put_blk; nvmm_open_blk_fn *open_blk; @@ -520,10 +520,6 @@ struct nvmm_type { extern int nvm_register_mgr(struct nvmm_type *); extern void nvm_unregister_mgr(struct nvmm_type *); -extern struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *, - struct nvm_lun *, unsigned long); -extern void nvm_put_blk_unlocked(struct nvm_dev *, struct nvm_block *); - extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, unsigned long); extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); @@ -532,11 +528,13 @@ extern int nvm_register(struct request_queue *, char *, struct nvm_dev_ops *); extern void nvm_unregister(char *); +void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type); + extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, - struct ppa_addr *, int, int); + const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h new file mode 100644 index 000000000000..bf240a3cbf99 --- /dev/null +++ b/include/linux/nvme-rdma.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_NVME_RDMA_H +#define _LINUX_NVME_RDMA_H + +enum nvme_rdma_cm_fmt { + NVME_RDMA_CM_FMT_1_0 = 0x0, +}; + +enum nvme_rdma_cm_status { + NVME_RDMA_CM_INVALID_LEN = 0x01, + NVME_RDMA_CM_INVALID_RECFMT = 0x02, + NVME_RDMA_CM_INVALID_QID = 0x03, + NVME_RDMA_CM_INVALID_HSQSIZE = 0x04, + NVME_RDMA_CM_INVALID_HRQSIZE = 0x05, + NVME_RDMA_CM_NO_RSC = 0x06, + NVME_RDMA_CM_INVALID_IRD = 0x07, + NVME_RDMA_CM_INVALID_ORD = 0x08, +}; + +/** + * struct nvme_rdma_cm_req - rdma connect request + * + * @recfmt: format of the RDMA Private Data + * @qid: queue Identifier for the Admin or I/O Queue + * @hrqsize: host receive queue size to be created + * @hsqsize: host send queue size to be created + */ +struct nvme_rdma_cm_req { + __le16 recfmt; + __le16 qid; + __le16 hrqsize; + __le16 hsqsize; + u8 rsvd[24]; +}; + +/** + * struct nvme_rdma_cm_rep - rdma connect reply + * + * @recfmt: format of the RDMA Private Data + * @crqsize: controller receive queue size + */ +struct nvme_rdma_cm_rep { + __le16 recfmt; + __le16 crqsize; + u8 rsvd[28]; +}; + +/** + * struct nvme_rdma_cm_rej - rdma connect reject + * + * @recfmt: format of the RDMA Private Data + * @fsts: error status for the associated connect request + */ +struct nvme_rdma_cm_rej { + __le16 recfmt; + __le16 sts; +}; + +#endif /* _LINUX_NVME_RDMA_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index dc815cc6718d..d8b37bab2887 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -16,6 +16,78 @@ #define _LINUX_NVME_H #include <linux/types.h> +#include <linux/uuid.h> + +/* NQN names in commands fields specified one size */ +#define NVMF_NQN_FIELD_LEN 256 + +/* However the max length of a qualified name is another size */ +#define NVMF_NQN_SIZE 223 + +#define NVMF_TRSVCID_SIZE 32 +#define NVMF_TRADDR_SIZE 256 +#define NVMF_TSAS_SIZE 256 + +#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" + +#define NVME_RDMA_IP_PORT 4420 + +enum nvme_subsys_type { + NVME_NQN_DISC = 1, /* Discovery type target subsystem */ + NVME_NQN_NVME = 2, /* NVME type target subsystem */ +}; + +/* Address Family codes for Discovery Log Page entry ADRFAM field */ +enum { + NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */ + NVMF_ADDR_FAMILY_IP4 = 1, /* IP4 */ + NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */ + NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */ + NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */ +}; + +/* Transport Type codes for Discovery Log Page entry TRTYPE field */ +enum { + NVMF_TRTYPE_RDMA = 1, /* RDMA */ + NVMF_TRTYPE_FC = 2, /* Fibre Channel */ + NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */ + NVMF_TRTYPE_MAX, +}; + +/* Transport Requirements codes for Discovery Log Page entry TREQ field */ +enum { + NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */ + NVMF_TREQ_REQUIRED = 1, /* Required */ + NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_QPTYPE_CONNECTED = 0, /* Reliable Connected */ + NVMF_RDMA_QPTYPE_DATAGRAM = 1, /* Reliable Datagram */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 0, /* No Provider Specified */ + NVMF_RDMA_PRTYPE_IB = 1, /* InfiniBand */ + NVMF_RDMA_PRTYPE_ROCE = 2, /* InfiniBand RoCE */ + NVMF_RDMA_PRTYPE_ROCEV2 = 3, /* InfiniBand RoCEV2 */ + NVMF_RDMA_PRTYPE_IWARP = 4, /* IWARP */ +}; + +/* RDMA Connection Management Service Type codes for Discovery Log Page + * entry TSAS RDMA_CMS field + */ +enum { + NVMF_RDMA_CMS_RDMA_CM = 0, /* Sockets based enpoint addressing */ +}; + +#define NVMF_AQ_DEPTH 32 enum { NVME_REG_CAP = 0x0000, /* Controller Capabilities */ @@ -117,7 +189,8 @@ struct nvme_id_ctrl { __le32 rtd3r; __le32 rtd3e; __le32 oaes; - __u8 rsvd96[160]; + __le32 ctratt; + __u8 rsvd100[156]; __le16 oacs; __u8 acl; __u8 aerl; @@ -129,10 +202,12 @@ struct nvme_id_ctrl { __u8 apsta; __le16 wctemp; __le16 cctemp; - __u8 rsvd270[242]; + __u8 rsvd270[50]; + __le16 kas; + __u8 rsvd322[190]; __u8 sqes; __u8 cqes; - __u8 rsvd514[2]; + __le16 maxcmd; __le32 nn; __le16 oncs; __le16 fuses; @@ -145,7 +220,15 @@ struct nvme_id_ctrl { __le16 acwu; __u8 rsvd534[2]; __le32 sgls; - __u8 rsvd540[1508]; + __u8 rsvd540[228]; + char subnqn[256]; + __u8 rsvd1024[768]; + __le32 ioccsz; + __le32 iorcsz; + __le16 icdoff; + __u8 ctrattr; + __u8 msdbd; + __u8 rsvd1804[244]; struct nvme_id_power_state psd[32]; __u8 vs[1024]; }; @@ -307,6 +390,61 @@ enum nvme_opcode { }; /* + * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier + * + * @NVME_SGL_FMT_ADDRESS: absolute address of the data block + * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation + * request subtype + */ +enum { + NVME_SGL_FMT_ADDRESS = 0x00, + NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_INVALIDATE = 0x0f, +}; + +/* + * Descriptor type - upper 4 bits of nvme_(keyed_)sgl_desc identifier + * + * For struct nvme_sgl_desc: + * @NVME_SGL_FMT_DATA_DESC: data block descriptor + * @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor + * @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor + * + * For struct nvme_keyed_sgl_desc: + * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + */ +enum { + NVME_SGL_FMT_DATA_DESC = 0x00, + NVME_SGL_FMT_SEG_DESC = 0x02, + NVME_SGL_FMT_LAST_SEG_DESC = 0x03, + NVME_KEY_SGL_FMT_DATA_DESC = 0x04, +}; + +struct nvme_sgl_desc { + __le64 addr; + __le32 length; + __u8 rsvd[3]; + __u8 type; +}; + +struct nvme_keyed_sgl_desc { + __le64 addr; + __u8 length[3]; + __u8 key[4]; + __u8 type; +}; + +union nvme_data_ptr { + struct { + __le64 prp1; + __le64 prp2; + }; + struct nvme_sgl_desc sgl; + struct nvme_keyed_sgl_desc ksgl; +}; + +/* * Lowest two bits of our flags field (FUSE field in the spec): * * @NVME_CMD_FUSE_FIRST: Fused Operation, first command @@ -336,8 +474,7 @@ struct nvme_common_command { __le32 nsid; __le32 cdw2[2]; __le64 metadata; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 cdw10[6]; }; @@ -348,8 +485,7 @@ struct nvme_rw_command { __le32 nsid; __u64 rsvd2; __le64 metadata; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le64 slba; __le16 length; __le16 control; @@ -389,8 +525,7 @@ struct nvme_dsm_cmd { __u16 command_id; __le32 nsid; __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 nr; __le32 attributes; __u32 rsvd12[4]; @@ -423,6 +558,7 @@ enum nvme_admin_opcode { nvme_admin_async_event = 0x0c, nvme_admin_activate_fw = 0x10, nvme_admin_download_fw = 0x11, + nvme_admin_keep_alive = 0x18, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, nvme_admin_security_recv = 0x82, @@ -447,6 +583,7 @@ enum { NVME_FEAT_WRITE_ATOMIC = 0x0a, NVME_FEAT_ASYNC_EVENT = 0x0b, NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_KATO = 0x0f, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, @@ -454,6 +591,7 @@ enum { NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), NVME_FWACT_REPL_ACTV = (1 << 3), @@ -466,8 +604,7 @@ struct nvme_identify { __u16 command_id; __le32 nsid; __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 cns; __u32 rsvd11[5]; }; @@ -478,8 +615,7 @@ struct nvme_features { __u16 command_id; __le32 nsid; __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 fid; __le32 dword11; __u32 rsvd12[4]; @@ -538,8 +674,7 @@ struct nvme_download_firmware { __u8 flags; __u16 command_id; __u32 rsvd1[5]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 numd; __le32 offset; __u32 rsvd12[4]; @@ -561,8 +696,7 @@ struct nvme_get_log_page_command { __u16 command_id; __le32 nsid; __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __u8 lid; __u8 rsvd10; __le16 numdl; @@ -573,6 +707,126 @@ struct nvme_get_log_page_command { __u32 rsvd14[2]; }; +/* + * Fabrics subcommands. + */ +enum nvmf_fabrics_opcode { + nvme_fabrics_command = 0x7f, +}; + +enum nvmf_capsule_command { + nvme_fabrics_type_property_set = 0x00, + nvme_fabrics_type_connect = 0x01, + nvme_fabrics_type_property_get = 0x04, +}; + +struct nvmf_common_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 ts[24]; +}; + +/* + * The legal cntlid range a NVMe Target will provide. + * Note that cntlid of value 0 is considered illegal in the fabrics world. + * Devices based on earlier specs did not have the subsystem concept; + * therefore, those devices had their cntlid value set to 0 as a result. + */ +#define NVME_CNTLID_MIN 1 +#define NVME_CNTLID_MAX 0xffef +#define NVME_CNTLID_DYNAMIC 0xffff + +#define MAX_DISC_LOGS 255 + +/* Discovery log page entry */ +struct nvmf_disc_rsp_page_entry { + __u8 trtype; + __u8 adrfam; + __u8 nqntype; + __u8 treq; + __le16 portid; + __le16 cntlid; + __le16 asqsz; + __u8 resv8[22]; + char trsvcid[NVMF_TRSVCID_SIZE]; + __u8 resv64[192]; + char subnqn[NVMF_NQN_FIELD_LEN]; + char traddr[NVMF_TRADDR_SIZE]; + union tsas { + char common[NVMF_TSAS_SIZE]; + struct rdma { + __u8 qptype; + __u8 prtype; + __u8 cms; + __u8 resv3[5]; + __u16 pkey; + __u8 resv10[246]; + } rdma; + } tsas; +}; + +/* Discovery log page header */ +struct nvmf_disc_rsp_page_hdr { + __le64 genctr; + __le64 numrec; + __le16 recfmt; + __u8 resv14[1006]; + struct nvmf_disc_rsp_page_entry entries[0]; +}; + +struct nvmf_connect_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[19]; + union nvme_data_ptr dptr; + __le16 recfmt; + __le16 qid; + __le16 sqsize; + __u8 cattr; + __u8 resv3; + __le32 kato; + __u8 resv4[12]; +}; + +struct nvmf_connect_data { + uuid_le hostid; + __le16 cntlid; + char resv4[238]; + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; + char resv5[256]; +}; + +struct nvmf_property_set_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __le64 value; + __u8 resv4[8]; +}; + +struct nvmf_property_get_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __u8 resv4[16]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -587,15 +841,29 @@ struct nvme_command { struct nvme_dsm_cmd dsm; struct nvme_abort_cmd abort; struct nvme_get_log_page_command get_log_page; + struct nvmf_common_command fabrics; + struct nvmf_connect_command connect; + struct nvmf_property_set_command prop_set; + struct nvmf_property_get_command prop_get; }; }; static inline bool nvme_is_write(struct nvme_command *cmd) { + /* + * What a mess... + * + * Why can't we simply have a Fabrics In and Fabrics out command? + */ + if (unlikely(cmd->common.opcode == nvme_fabrics_command)) + return cmd->fabrics.opcode & 1; return cmd->common.opcode & 1; } enum { + /* + * Generic Command Status: + */ NVME_SC_SUCCESS = 0x0, NVME_SC_INVALID_OPCODE = 0x1, NVME_SC_INVALID_FIELD = 0x2, @@ -614,10 +882,18 @@ enum { NVME_SC_SGL_INVALID_DATA = 0xf, NVME_SC_SGL_INVALID_METADATA = 0x10, NVME_SC_SGL_INVALID_TYPE = 0x11, + + NVME_SC_SGL_INVALID_OFFSET = 0x16, + NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, NVME_SC_RESERVATION_CONFLICT = 0x83, + + /* + * Command Specific Status: + */ NVME_SC_CQ_INVALID = 0x100, NVME_SC_QID_INVALID = 0x101, NVME_SC_QUEUE_SIZE = 0x102, @@ -635,9 +911,29 @@ enum { NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, NVME_SC_FEATURE_NOT_PER_NS = 0x10f, NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110, + + /* + * I/O Command Set Specific - NVM commands: + */ NVME_SC_BAD_ATTRIBUTES = 0x180, NVME_SC_INVALID_PI = 0x181, NVME_SC_READ_ONLY = 0x182, + + /* + * I/O Command Set Specific - Fabrics commands: + */ + NVME_SC_CONNECT_FORMAT = 0x180, + NVME_SC_CONNECT_CTRL_BUSY = 0x181, + NVME_SC_CONNECT_INVALID_PARAM = 0x182, + NVME_SC_CONNECT_RESTART_DISC = 0x183, + NVME_SC_CONNECT_INVALID_HOST = 0x184, + + NVME_SC_DISCOVERY_RESTART = 0x190, + NVME_SC_AUTH_REQUIRED = 0x191, + + /* + * Media and Data Integrity Errors: + */ NVME_SC_WRITE_FAULT = 0x280, NVME_SC_READ_ERROR = 0x281, NVME_SC_GUARD_CHECK = 0x282, @@ -645,12 +941,19 @@ enum { NVME_SC_REFTAG_CHECK = 0x284, NVME_SC_COMPARE_FAILED = 0x285, NVME_SC_ACCESS_DENIED = 0x286, + NVME_SC_DNR = 0x4000, }; struct nvme_completion { - __le32 result; /* Used by admin commands to return data */ - __u32 rsvd; + /* + * Used by Admin and Fabrics commands to return data: + */ + union { + __le16 result16; + __le32 result; + __le64 result64; + }; __le16 sq_head; /* how much of this queue may be reclaimed */ __le16 sq_id; /* submission queue that generated this entry */ __u16 command_id; /* of the command which completed */ |