diff options
72 files changed, 3761 insertions, 2688 deletions
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl index 4f676838da06..bcdfdb9a9277 100644 --- a/Documentation/DocBook/filesystems.tmpl +++ b/Documentation/DocBook/filesystems.tmpl @@ -62,7 +62,7 @@ !Efs/mpage.c !Efs/namei.c !Efs/buffer.c -!Efs/bio.c +!Eblock/bio.c !Efs/seq_file.c !Efs/filesystems.c !Efs/fs-writeback.c diff --git a/block/Makefile b/block/Makefile index 20645e88fb57..a2ce6ac935ec 100644 --- a/block/Makefile +++ b/block/Makefile @@ -2,13 +2,15 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - genhd.o scsi_ioctl.o partition-generic.o partitions/ + genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + partitions/ +obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o @@ -20,3 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o diff --git a/fs/bio-integrity.c b/block/bio-integrity.c index 1c2ce0c87711..9e241063a616 100644 --- a/fs/bio-integrity.c +++ b/block/bio-integrity.c @@ -617,7 +617,7 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size) if (!bs->bio_integrity_pool) return -1; - bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); + bs->bvec_integrity_pool = biovec_create_pool(pool_size); if (!bs->bvec_integrity_pool) { mempool_destroy(bs->bio_integrity_pool); return -1; diff --git a/fs/bio.c b/block/bio.c index 6f0362b77806..96d28eee8a1e 100644 --- 
a/fs/bio.c +++ b/block/bio.c @@ -305,6 +305,8 @@ static void bio_chain_endio(struct bio *bio, int error) /** * bio_chain - chain bio completions + * @bio: the target bio + * @parent: the @bio's parent bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have @@ -1011,8 +1013,7 @@ static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, bio->bi_private = bmd; } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, - unsigned int iov_count, +static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count, gfp_t gfp_mask) { if (iov_count > UIO_MAXIOV) @@ -1154,7 +1155,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, if (offset) nr_pages++; - bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); + bmd = bio_alloc_map_data(iov_count, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); @@ -1859,7 +1860,7 @@ EXPORT_SYMBOL_GPL(bio_trim); * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. 
*/ -mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) +mempool_t *biovec_create_pool(int pool_entries) { struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; @@ -1922,7 +1923,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - bs->bvec_pool = biovec_create_pool(bs, pool_size); + bs->bvec_pool = biovec_create_pool(pool_size); if (!bs->bvec_pool) goto bad; diff --git a/block/blk-core.c b/block/blk-core.c index a0e3096c4bb5..d87be5b4e554 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -146,8 +146,8 @@ void blk_dump_rq_flags(struct request *rq, char *msg) printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); - printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", - rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); + printk(KERN_INFO " bio %p, biotail %p, len %u\n", + rq->bio, rq->biotail, blk_rq_bytes(rq)); if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { printk(KERN_INFO " cdb: "); @@ -251,8 +251,10 @@ void blk_sync_queue(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) - cancel_delayed_work_sync(&hctx->delayed_work); + queue_for_each_hw_ctx(q, hctx, i) { + cancel_delayed_work_sync(&hctx->run_work); + cancel_delayed_work_sync(&hctx->delay_work); + } } else { cancel_delayed_work_sync(&q->delay_work); } @@ -574,12 +576,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; - if (percpu_counter_init(&q->mq_usage_counter, 0)) - goto fail_q; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_c; + goto fail_q; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; @@ -637,8 +636,6 @@ fail_bdi: bdi_destroy(&q->backing_dev_info); fail_id: ida_simple_remove(&blk_queue_ida, q->id); -fail_c: - percpu_counter_destroy(&q->mq_usage_counter); fail_q: 
kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -846,6 +843,47 @@ static void freed_request(struct request_list *rl, unsigned int flags) __freed_request(rl, sync ^ 1); } +int blk_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct request_list *rl; + + spin_lock_irq(q->queue_lock); + q->nr_requests = nr; + blk_queue_congestion_threshold(q); + + /* congestion isn't cgroup aware and follows root blkcg for now */ + rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_SYNC); + + if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_ASYNC); + + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_SYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_SYNC); + wake_up(&rl->wait[BLK_RW_SYNC]); + } + + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_ASYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_ASYNC); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } + } + + spin_unlock_irq(q->queue_lock); + return 0; +} + /* * Determine if elevator data should be initialized when allocating the * request associated with @bio. 
@@ -1135,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask); + return blk_mq_alloc_request(q, rw, gfp_mask, false); else return blk_old_get_request(q, rw, gfp_mask); } @@ -1231,12 +1269,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq, static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { + int inflight; + if (now == part->stamp) return; - if (part_in_flight(part)) { + inflight = part_in_flight(part); + if (inflight) { __part_stat_add(cpu, part, time_in_queue, - part_in_flight(part) * (now - part->stamp)); + inflight * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1360,7 +1401,6 @@ void blk_add_request_payload(struct request *rq, struct page *page, rq->__data_len = rq->resid_len = len; rq->nr_phys_segments = 1; - rq->buffer = bio_data(bio); } EXPORT_SYMBOL_GPL(blk_add_request_payload); @@ -1402,12 +1442,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, bio->bi_next = req->bio; req->bio = bio; - /* - * may not be valid. if the low level driver said - * it didn't need a bounce buffer then it better - * not touch req->buffer either... - */ - req->buffer = bio_data(bio); req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); @@ -1432,6 +1466,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * added on the elevator at this point. In addition, we don't have * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand. 
*/ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count) @@ -1441,9 +1477,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, bool ret = false; struct list_head *plug_list; - if (blk_queue_nomerges(q)) - goto out; - plug = current->plug; if (!plug) goto out; @@ -1522,7 +1555,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (blk_attempt_plug_merge(q, bio, &request_count)) + if (!blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) return; spin_lock_irq(q->queue_lock); @@ -1654,7 +1688,7 @@ static int __init fail_make_request_debugfs(void) struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); - return IS_ERR(dir) ? PTR_ERR(dir) : 0; + return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); @@ -2434,7 +2468,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) } req->__data_len -= total_bytes; - req->buffer = bio_data(req->bio); /* update sector only for requests with clear definition of sector */ if (req->cmd_type == REQ_TYPE_FS) @@ -2503,7 +2536,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); /* * queue lock must be held */ -static void blk_finish_request(struct request *req, int error) +void blk_finish_request(struct request *req, int error) { if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); @@ -2529,6 +2562,7 @@ static void blk_finish_request(struct request *req, int error) __blk_put_request(req->q, req); } } +EXPORT_SYMBOL(blk_finish_request); /** * blk_end_bidi_request - Complete a bidi request @@ -2752,10 +2786,9 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ rq->cmd_flags |= bio->bi_rw & REQ_WRITE; - if (bio_has_data(bio)) { + if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); - 
rq->buffer = bio_data(bio); - } + rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; @@ -2831,7 +2864,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /* * Copy attributes of the original request to the clone request. - * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. + * The actual data parts (e.g. ->cmd, ->sense) are not copied. */ static void __blk_rq_prep_clone(struct request *dst, struct request *src) { @@ -2857,7 +2890,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src) * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) + * The actual data parts of @rq_src (e.g. ->cmd, ->sense) * are not copied, and copying such parts is the caller's responsibility. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. @@ -2904,19 +2937,26 @@ free_and_out: } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); -int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) +int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); -int kblockd_schedule_delayed_work(struct request_queue *q, - struct delayed_work *dwork, unsigned long delay) +int kblockd_schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) { return queue_delayed_work(kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_schedule_delayed_work); +int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); + #define PLUG_MAGIC 0x91827364 /** diff --git a/block/blk-flush.c b/block/blk-flush.c index 43e6b4755e9a..ef608b35d9be 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,21 +130,13 @@ static void 
blk_flush_restore_request(struct request *rq) blk_clear_rq_complete(rq); } -static void mq_flush_run(struct work_struct *work) -{ - struct request *rq; - - rq = container_of(work, struct request, mq_flush_work); - - memset(&rq->csd, 0, sizeof(rq->csd)); - blk_mq_insert_request(rq, false, true, false); -} - static bool blk_flush_queue_rq(struct request *rq, bool add_front) { if (rq->q->mq_ops) { - INIT_WORK(&rq->mq_flush_work, mq_flush_run); - kblockd_schedule_work(rq->q, &rq->mq_flush_work); + struct request_queue *q = rq->q; + + blk_mq_add_to_requeue_list(rq, add_front); + blk_mq_kick_requeue_list(q); return false; } else { if (add_front) @@ -306,23 +298,9 @@ static bool blk_kick_flush(struct request_queue *q) */ q->flush_pending_idx ^= 1; - if (q->mq_ops) { - struct blk_mq_ctx *ctx = first_rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - - blk_mq_rq_init(hctx, q->flush_rq); - q->flush_rq->mq_ctx = ctx; - - /* - * Reuse the tag value from the fist waiting request, - * with blk-mq the tag is generated during request - * allocation and drivers can rely on it being inside - * the range they asked for. - */ - q->flush_rq->tag = first_rq->tag; - } else { - blk_rq_init(q, q->flush_rq); - } + blk_rq_init(q, q->flush_rq); + if (q->mq_ops) + blk_mq_clone_flush_request(q->flush_rq, first_rq); q->flush_rq->cmd_type = REQ_TYPE_FS; q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index c11d24e379e2..d828b44a404b 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -64,12 +64,12 @@ EXPORT_SYMBOL(__blk_iopoll_complete); * iopoll handler will not be invoked again before blk_iopoll_sched_prep() * is called. 
**/ -void blk_iopoll_complete(struct blk_iopoll *iopoll) +void blk_iopoll_complete(struct blk_iopoll *iop) { unsigned long flags; local_irq_save(flags); - __blk_iopoll_complete(iopoll); + __blk_iopoll_complete(iop); local_irq_restore(flags); } EXPORT_SYMBOL(blk_iopoll_complete); diff --git a/block/blk-lib.c b/block/blk-lib.c index 97a733cf3d5f..8411be3c19d3 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -226,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same); * Generate and issue number of bios with zerofiled pages. */ -int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) +static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask) { int ret; struct bio *bio; diff --git a/block/blk-map.c b/block/blk-map.c index f7b22bc21518..f890d4345b0c 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -155,7 +155,6 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; - rq->buffer = NULL; return 0; unmap_rq: blk_rq_unmap_user(bio); @@ -238,7 +237,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, blk_queue_bounce(q, &bio); bio_get(bio); blk_rq_bio_prep(q, rq, bio); - rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_user_iov); @@ -325,7 +323,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, } blk_queue_bounce(q, &rq->bio); - rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c index 136ef8643bba..bb3ed488f7b5 100644 --- a/block/blk-mq-cpu.c +++ b/block/blk-mq-cpu.c @@ -1,3 +1,8 @@ +/* + * CPU notifier helper code for blk-mq + * + * Copyright (C) 2013-2014 Jens Axboe + */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> @@ -18,14 +23,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self, { unsigned int cpu = 
(unsigned long) hcpu; struct blk_mq_cpu_notifier *notify; + int ret = NOTIFY_OK; raw_spin_lock(&blk_mq_cpu_notify_lock); - list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) - notify->notify(notify->data, action, cpu); + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) { + ret = notify->notify(notify->data, action, cpu); + if (ret != NOTIFY_OK) + break; + } raw_spin_unlock(&blk_mq_cpu_notify_lock); - return NOTIFY_OK; + return ret; } void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) @@ -45,7 +54,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) } void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - void (*fn)(void *, unsigned long, unsigned int), + int (*fn)(void *, unsigned long, unsigned int), void *data) { notifier->notify = fn; diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 097921329619..1065d7c65fa1 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -1,3 +1,8 @@ +/* + * CPU <-> hardware queue mapping helpers + * + * Copyright (C) 2013-2014 Jens Axboe + */ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/module.h> @@ -80,19 +85,35 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) return 0; } -unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) +unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) { unsigned int *map; /* If cpus are offline, map them to first hctx */ map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, - reg->numa_node); + set->numa_node); if (!map) return NULL; - if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) + if (!blk_mq_update_queue_map(map, set->nr_hw_queues)) return map; kfree(map); return NULL; } + +/* + * We have no quick way of doing reverse lookups. This is only used at + * queue init time, so runtime isn't important. 
+ */ +int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) +{ + int i; + + for_each_possible_cpu(i) { + if (index == mq_map[i]) + return cpu_to_node(i); + } + + return NUMA_NO_NODE; +} diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index b0ba264b0522..99a60a829e69 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -203,59 +203,24 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx, return ret; } -static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) -{ - ssize_t ret; - - spin_lock(&hctx->lock); - ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI)); - spin_unlock(&hctx->lock); - - return ret; -} - -static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx, - const char *page, size_t len) +static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { - struct blk_mq_ctx *ctx; - unsigned long ret; - unsigned int i; - - if (kstrtoul(page, 10, &ret)) { - pr_err("blk-mq-sysfs: invalid input '%s'\n", page); - return -EINVAL; - } - - spin_lock(&hctx->lock); - if (ret) - hctx->flags |= BLK_MQ_F_SHOULD_IPI; - else - hctx->flags &= ~BLK_MQ_F_SHOULD_IPI; - spin_unlock(&hctx->lock); - - hctx_for_each_ctx(hctx, ctx, i) - ctx->ipi_redirect = !!ret; - - return len; + return blk_mq_tag_sysfs_show(hctx->tags, page); } -static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) +static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page) { - return blk_mq_tag_sysfs_show(hctx->tags, page); + return sprintf(page, "%u\n", atomic_read(&hctx->nr_active)); } static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { - unsigned int i, queue_num, first = 1; + unsigned int i, first = 1; ssize_t ret = 0; blk_mq_disable_hotplug(); - for_each_online_cpu(i) { - queue_num = hctx->queue->mq_map[i]; - if (queue_num != hctx->queue_num) - continue; - + for_each_cpu(i, hctx->cpumask) { if (first) ret += 
sprintf(ret + page, "%u", i); else @@ -307,15 +272,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_dispatched_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = { + .attr = {.name = "active", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_active_show, +}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { .attr = {.name = "pending", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_rq_list_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = { - .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR}, - .show = blk_mq_hw_sysfs_ipi_show, - .store = blk_mq_hw_sysfs_ipi_store, -}; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { .attr = {.name = "tags", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_tags_show, @@ -330,9 +294,9 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_run.attr, &blk_mq_hw_sysfs_dispatched.attr, &blk_mq_hw_sysfs_pending.attr, - &blk_mq_hw_sysfs_ipi.attr, &blk_mq_hw_sysfs_tags.attr, &blk_mq_hw_sysfs_cpus.attr, + &blk_mq_hw_sysfs_active.attr, NULL, }; diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 83ae96c51a27..d90c4aeb7dd3 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -1,78 +1,345 @@ +/* + * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread + * over multiple cachelines to avoid ping-pong between multiple submitters + * or submitter and completer. Uses rolling wakeups to avoid falling of + * the scaling cliff when we run out of tags and have to start putting + * submitters to sleep. + * + * Uses active queue tracking to support fairer distribution of tags + * between multiple submitters when a shared tag map is used. 
+ * + * Copyright (C) 2013-2014 Jens Axboe + */ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/percpu_ida.h> +#include <linux/random.h> #include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) +{ + int i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + int ret; + + ret = find_first_zero_bit(&bm->word, bm->depth); + if (ret < bm->depth) + return true; + } + + return false; +} + +bool blk_mq_has_free_tags(struct blk_mq_tags *tags) +{ + if (!tags) + return true; + + return bt_has_free_tags(&tags->bitmap_tags); +} + +static inline void bt_index_inc(unsigned int *index) +{ + *index = (*index + 1) & (BT_WAIT_QUEUES - 1); +} + /* - * Per tagged queue (tag address space) map + * If a previously inactive queue goes active, bump the active user count. */ -struct blk_mq_tags { - unsigned int nr_tags; - unsigned int nr_reserved_tags; - unsigned int nr_batch_move; - unsigned int nr_max_cache; +bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + atomic_inc(&hctx->tags->active_queues); - struct percpu_ida free_tags; - struct percpu_ida reserved_tags; -}; + return true; +} -void blk_mq_wait_for_tags(struct blk_mq_tags *tags) +/* + * Wakeup all potentially sleeping on normal (non-reserved) tags + */ +static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags) { - int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); - blk_mq_put_tag(tags, tag); + struct blk_mq_bitmap_tags *bt; + int i, wake_index; + + bt = &tags->bitmap_tags; + wake_index = bt->wake_index; + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) + wake_up(&bs->wait); + + bt_index_inc(&wake_index); + } } -bool blk_mq_has_free_tags(struct blk_mq_tags *tags) +/* + * If a previously busy 
queue goes inactive, potential waiters could now + * be allowed to queue. Wake them up and check. + */ +void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->tags; + + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + + atomic_dec(&tags->active_queues); + + blk_mq_tag_wakeup_all(tags); +} + +/* + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. + */ +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + struct blk_mq_bitmap_tags *bt) +{ + unsigned int depth, users; + + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return true; + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return true; + + /* + * Don't try dividing an ant + */ + if (bt->depth == 1) + return true; + + users = atomic_read(&hctx->tags->active_queues); + if (!users) + return true; + + /* + * Allow at least some tags + */ + depth = max((bt->depth + users - 1) / users, 4U); + return atomic_read(&hctx->nr_active) < depth; +} + +static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) { - return !tags || - percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; + int tag, org_last_tag, end; + + org_last_tag = last_tag; + end = bm->depth; + do { +restart: + tag = find_next_zero_bit(&bm->word, end, last_tag); + if (unlikely(tag >= end)) { + /* + * We started with an offset, start from 0 to + * exhaust the map. + */ + if (org_last_tag && last_tag) { + end = last_tag; + last_tag = 0; + goto restart; + } + return -1; + } + last_tag = tag + 1; + } while (test_and_set_bit_lock(tag, &bm->word)); + + return tag; } -static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) +/* + * Straight forward bitmap tag implementation, where each bit is a tag + * (cleared == free, and set == busy). The small twist is using per-cpu + * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue + * contexts. 
This enables us to drastically limit the space searched, + * without dirtying an extra shared cacheline like we would if we stored + * the cache value inside the shared blk_mq_bitmap_tags structure. On top + * of that, each word of tags is in a separate cacheline. This means that + * multiple users will tend to stick to different cachelines, at least + * until the map is exhausted. + */ +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, + unsigned int *tag_cache) { + unsigned int last_tag, org_last_tag; + int index, i, tag; + + if (!hctx_may_queue(hctx, bt)) + return -1; + + last_tag = org_last_tag = *tag_cache; + index = TAG_TO_INDEX(bt, last_tag); + + for (i = 0; i < bt->map_nr; i++) { + tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag)); + if (tag != -1) { + tag += (index << bt->bits_per_word); + goto done; + } + + last_tag = 0; + if (++index >= bt->map_nr) + index = 0; + } + + *tag_cache = 0; + return -1; + + /* + * Only update the cache from the allocation path, if we ended + * up using the specific cached tag. + */ +done: + if (tag == org_last_tag) { + last_tag = tag + 1; + if (last_tag >= bt->depth - 1) + last_tag = 0; + + *tag_cache = last_tag; + } + + return tag; +} + +static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, + struct blk_mq_hw_ctx *hctx) +{ + struct bt_wait_state *bs; + + if (!hctx) + return &bt->bs[0]; + + bs = &bt->bs[hctx->wait_index]; + bt_index_inc(&hctx->wait_index); + return bs; +} + +static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag, gfp_t gfp) +{ + struct bt_wait_state *bs; + DEFINE_WAIT(wait); int tag; - tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ? 
- TASK_UNINTERRUPTIBLE : TASK_RUNNING); - if (tag < 0) - return BLK_MQ_TAG_FAIL; - return tag + tags->nr_reserved_tags; + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + return tag; + + if (!(gfp & __GFP_WAIT)) + return -1; + + bs = bt_wait_ptr(bt, hctx); + do { + bool was_empty; + + was_empty = list_empty(&wait.task_list); + prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + break; + + if (was_empty) + atomic_set(&bs->wait_cnt, bt->wake_cnt); + + io_schedule(); + } while (1); + + finish_wait(&bs->wait, &wait); + return tag; +} + +static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, + struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag, gfp_t gfp) +{ + int tag; + + tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp); + if (tag >= 0) + return tag + tags->nr_reserved_tags; + + return BLK_MQ_TAG_FAIL; } static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, gfp_t gfp) { - int tag; + int tag, zero = 0; if (unlikely(!tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_TAG_FAIL; } - tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ? 
- TASK_UNINTERRUPTIBLE : TASK_RUNNING); + tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp); if (tag < 0) return BLK_MQ_TAG_FAIL; + return tag; } -unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) +unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, + gfp_t gfp, bool reserved) { if (!reserved) - return __blk_mq_get_tag(tags, gfp); + return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp); - return __blk_mq_get_reserved_tag(tags, gfp); + return __blk_mq_get_reserved_tag(hctx->tags, gfp); +} + +static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) +{ + int i, wake_index; + + wake_index = bt->wake_index; + for (i = 0; i < BT_WAIT_QUEUES; i++) { + struct bt_wait_state *bs = &bt->bs[wake_index]; + + if (waitqueue_active(&bs->wait)) { + if (wake_index != bt->wake_index) + bt->wake_index = wake_index; + + return bs; + } + + bt_index_inc(&wake_index); + } + + return NULL; +} + +static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) +{ + const int index = TAG_TO_INDEX(bt, tag); + struct bt_wait_state *bs; + + /* + * The unlock memory barrier need to order access to req in free + * path and clearing tag bit + */ + clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word); + + bs = bt_wake_ptr(bt); + if (bs && atomic_dec_and_test(&bs->wait_cnt)) { + atomic_set(&bs->wait_cnt, bt->wake_cnt); + bt_index_inc(&bt->wake_index); + wake_up(&bs->wait); + } } static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) { BUG_ON(tag >= tags->nr_tags); - percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); + bt_clear_tag(&tags->bitmap_tags, tag); } static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, @@ -80,22 +347,43 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, { BUG_ON(tag >= tags->nr_reserved_tags); - percpu_ida_free(&tags->reserved_tags, tag); + bt_clear_tag(&tags->breserved_tags, tag); } -void blk_mq_put_tag(struct blk_mq_tags 
*tags, unsigned int tag) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, + unsigned int *last_tag) { - if (tag >= tags->nr_reserved_tags) - __blk_mq_put_tag(tags, tag); - else + struct blk_mq_tags *tags = hctx->tags; + + if (tag >= tags->nr_reserved_tags) { + const int real_tag = tag - tags->nr_reserved_tags; + + __blk_mq_put_tag(tags, real_tag); + *last_tag = real_tag; + } else __blk_mq_put_reserved_tag(tags, tag); } -static int __blk_mq_tag_iter(unsigned id, void *data) +static void bt_for_each_free(struct blk_mq_bitmap_tags *bt, + unsigned long *free_map, unsigned int off) { - unsigned long *tag_map = data; - __set_bit(id, tag_map); - return 0; + int i; + + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + int bit = 0; + + do { + bit = find_next_zero_bit(&bm->word, bm->depth, bit); + if (bit >= bm->depth) + break; + + __set_bit(bit + off, free_map); + bit++; + } while (1); + + off += (1 << bt->bits_per_word); + } } void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, @@ -109,21 +397,128 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, if (!tag_map) return; - percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); + bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags); if (tags->nr_reserved_tags) - percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, - tag_map); + bt_for_each_free(&tags->breserved_tags, tag_map, 0); fn(data, tag_map); kfree(tag_map); } +EXPORT_SYMBOL(blk_mq_tag_busy_iter); + +static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) +{ + unsigned int i, used; + + for (i = 0, used = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + + used += bitmap_weight(&bm->word, bm->depth); + } + + return bt->depth - used; +} + +static void bt_update_count(struct blk_mq_bitmap_tags *bt, + unsigned int depth) +{ + unsigned int tags_per_word = 1U << bt->bits_per_word; + unsigned int map_depth = depth; + + if (depth) { + int i; + + 
for (i = 0; i < bt->map_nr; i++) { + bt->map[i].depth = min(map_depth, tags_per_word); + map_depth -= bt->map[i].depth; + } + } + + bt->wake_cnt = BT_WAIT_BATCH; + if (bt->wake_cnt > depth / 4) + bt->wake_cnt = max(1U, depth / 4); + + bt->depth = depth; +} + +static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, + int node, bool reserved) +{ + int i; + + bt->bits_per_word = ilog2(BITS_PER_LONG); + + /* + * Depth can be zero for reserved tags, that's not a failure + * condition. + */ + if (depth) { + unsigned int nr, tags_per_word; + + tags_per_word = (1 << bt->bits_per_word); + + /* + * If the tag space is small, shrink the number of tags + * per word so we spread over a few cachelines, at least. + * If less than 4 tags, just forget about it, it's not + * going to work optimally anyway. + */ + if (depth >= 4) { + while (tags_per_word * 4 > depth) { + bt->bits_per_word--; + tags_per_word = (1 << bt->bits_per_word); + } + } + + nr = ALIGN(depth, tags_per_word) / tags_per_word; + bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), + GFP_KERNEL, node); + if (!bt->map) + return -ENOMEM; + + bt->map_nr = nr; + } + + bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); + if (!bt->bs) { + kfree(bt->map); + return -ENOMEM; + } + + for (i = 0; i < BT_WAIT_QUEUES; i++) + init_waitqueue_head(&bt->bs[i].wait); + + bt_update_count(bt, depth); + return 0; +} + +static void bt_free(struct blk_mq_bitmap_tags *bt) +{ + kfree(bt->map); + kfree(bt->bs); +} + +static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, + int node) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + + if (bt_alloc(&tags->bitmap_tags, depth, node, false)) + goto enomem; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) + goto enomem; + + return tags; +enomem: + bt_free(&tags->bitmap_tags); + kfree(tags); + return NULL; +} struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, 
int node) { - unsigned int nr_tags, nr_cache; struct blk_mq_tags *tags; - int ret; if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); @@ -134,73 +529,59 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; - nr_tags = total_tags - reserved_tags; - nr_cache = nr_tags / num_possible_cpus(); - - if (nr_cache < BLK_MQ_TAG_CACHE_MIN) - nr_cache = BLK_MQ_TAG_CACHE_MIN; - else if (nr_cache > BLK_MQ_TAG_CACHE_MAX) - nr_cache = BLK_MQ_TAG_CACHE_MAX; - tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; - tags->nr_max_cache = nr_cache; - tags->nr_batch_move = max(1u, nr_cache / 2); - ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - - tags->nr_reserved_tags, - tags->nr_max_cache, - tags->nr_batch_move); - if (ret) - goto err_free_tags; + return blk_mq_init_bitmap_tags(tags, node); +} - if (reserved_tags) { - /* - * With max_cahe and batch set to 1, the allocator fallbacks to - * no cached. It's fine reserved tags allocation is slow. - */ - ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, - 1, 1); - if (ret) - goto err_reserved_tags; - } +void blk_mq_free_tags(struct blk_mq_tags *tags) +{ + bt_free(&tags->bitmap_tags); + bt_free(&tags->breserved_tags); + kfree(tags); +} - return tags; +void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; -err_reserved_tags: - percpu_ida_destroy(&tags->free_tags); -err_free_tags: - kfree(tags); - return NULL; + *tag = prandom_u32() % depth; } -void blk_mq_free_tags(struct blk_mq_tags *tags) +int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) { - percpu_ida_destroy(&tags->free_tags); - percpu_ida_destroy(&tags->reserved_tags); - kfree(tags); + tdepth -= tags->nr_reserved_tags; + if (tdepth > tags->nr_tags) + return -EINVAL; + + /* + * Don't need (or can't) update reserved tags here, they remain + * static and should never need resizing. 
+ */ + bt_update_count(&tags->bitmap_tags, tdepth); + blk_mq_tag_wakeup_all(tags); + return 0; } ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) { char *orig_page = page; - unsigned int cpu; + unsigned int free, res; if (!tags) return 0; - page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," - " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, - tags->nr_batch_move, tags->nr_max_cache); + page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " + "bits_per_word=%u\n", + tags->nr_tags, tags->nr_reserved_tags, + tags->bitmap_tags.bits_per_word); - page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", - percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), - percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids)); + free = bt_unused_tags(&tags->bitmap_tags); + res = bt_unused_tags(&tags->breserved_tags); - for_each_possible_cpu(cpu) { - page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, - percpu_ida_free_tags(&tags->free_tags, cpu)); - } + page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res); + page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues)); return page - orig_page; } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 947ba2c6148e..c959de58d2a5 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -1,17 +1,59 @@ #ifndef INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H -struct blk_mq_tags; +#include "blk-mq.h" + +enum { + BT_WAIT_QUEUES = 8, + BT_WAIT_BATCH = 8, +}; + +struct bt_wait_state { + atomic_t wait_cnt; + wait_queue_head_t wait; +} ____cacheline_aligned_in_smp; + +#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) +#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) + +struct blk_mq_bitmap_tags { + unsigned int depth; + unsigned int wake_cnt; + unsigned int bits_per_word; + + unsigned int map_nr; + struct blk_align_bitmap *map; + + unsigned int wake_index; + struct bt_wait_state *bs; +}; + +/* + * Tag address space map. 
+ */ +struct blk_mq_tags { + unsigned int nr_tags; + unsigned int nr_reserved_tags; + + atomic_t active_queues; + + struct blk_mq_bitmap_tags bitmap_tags; + struct blk_mq_bitmap_tags breserved_tags; + + struct request **rqs; + struct list_head page_list; +}; + extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); extern void blk_mq_free_tags(struct blk_mq_tags *tags); -extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); -extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); -extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag); -extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); +extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); +extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); +extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); enum { BLK_MQ_TAG_CACHE_MIN = 1, @@ -24,4 +66,23 @@ enum { BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, }; +extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); + +static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return false; + + return __blk_mq_tag_busy(hctx); +} + +static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return; + + __blk_mq_tag_idle(hctx); +} + #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index 1d2a9bdbee57..f27fe44230c2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1,3 +1,9 @@ +/* + * Block multiqueue core 
code + * + * Copyright (C) 2013-2014 Jens Axboe + * Copyright (C) 2013-2014 Christoph Hellwig + */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> @@ -56,38 +62,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { unsigned int i; - for (i = 0; i < hctx->nr_ctx_map; i++) - if (hctx->ctx_map[i]) + for (i = 0; i < hctx->ctx_map.map_size; i++) + if (hctx->ctx_map.map[i].word) return true; return false; } +static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; +} + +#define CTX_TO_BIT(hctx, ctx) \ + ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) + /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - if (!test_bit(ctx->index_hw, hctx->ctx_map)) - set_bit(ctx->index_hw, hctx->ctx_map); + struct blk_align_bitmap *bm = get_bm(hctx, ctx); + + if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) + set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } -static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, - gfp_t gfp, bool reserved) +static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) { - struct request *rq; - unsigned int tag; + struct blk_align_bitmap *bm = get_bm(hctx, ctx); - tag = blk_mq_get_tag(hctx->tags, gfp, reserved); - if (tag != BLK_MQ_TAG_FAIL) { - rq = hctx->rqs[tag]; - rq->tag = tag; - - return rq; - } - - return NULL; + clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } static int blk_mq_queue_enter(struct request_queue *q) @@ -186,78 +194,109 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; + INIT_LIST_HEAD(&rq->queuelist); + /* csd/requeue_work/fifo_time is initialized before use */ + rq->q = q; rq->mq_ctx = ctx; - rq->cmd_flags = rw_flags; + 
rq->cmd_flags |= rw_flags; + rq->cmd_type = 0; + /* do not touch atomic flags, it needs atomic ops against the timer */ + rq->cpu = -1; + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = NULL; + rq->biotail = NULL; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + memset(&rq->flush, 0, max(sizeof(rq->flush), sizeof(rq->elv))); + rq->rq_disk = NULL; + rq->part = NULL; rq->start_time = jiffies; +#ifdef CONFIG_BLK_CGROUP + rq->rl = NULL; set_start_time_ns(rq); + rq->io_start_time_ns = 0; +#endif + rq->nr_phys_segments = 0; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + rq->nr_integrity_segments = 0; +#endif + rq->ioprio = 0; + rq->special = NULL; + /* tag was already set */ + rq->errors = 0; + memset(rq->__cmd, 0, sizeof(rq->__cmd)); + rq->cmd = rq->__cmd; + rq->cmd_len = BLK_MAX_CDB; + + rq->extra_len = 0; + rq->sense_len = 0; + rq->resid_len = 0; + rq->sense = NULL; + + rq->deadline = 0; + INIT_LIST_HEAD(&rq->timeout_list); + rq->timeout = 0; + rq->retries = 0; + rq->end_io = NULL; + rq->end_io_data = NULL; + rq->next_rq = NULL; + ctx->rq_dispatched[rw_is_sync(rw_flags)]++; } -static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, - int rw, gfp_t gfp, - bool reserved) +static struct request * +__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved) { struct request *rq; + unsigned int tag; - do { - struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); + tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved); + if (tag != BLK_MQ_TAG_FAIL) { + rq = hctx->tags->rqs[tag]; - rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); - if (rq) { - blk_mq_rq_ctx_init(q, ctx, rq, rw); - break; + rq->cmd_flags = 0; + if (blk_mq_tag_busy(hctx)) { + rq->cmd_flags = REQ_MQ_INFLIGHT; + atomic_inc(&hctx->nr_active); } - blk_mq_put_ctx(ctx); - if (!(gfp & __GFP_WAIT)) - break; - - 
__blk_mq_run_hw_queue(hctx); - blk_mq_wait_for_tags(hctx->tags); - } while (1); + rq->tag = tag; + blk_mq_rq_ctx_init(q, ctx, rq, rw); + return rq; + } - return rq; + return NULL; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, + bool reserved) { + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; struct request *rq; if (blk_mq_queue_enter(q)) return NULL; - rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); - if (rq) - blk_mq_put_ctx(rq->mq_ctx); - return rq; -} - -struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, - gfp_t gfp) -{ - struct request *rq; + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); - if (blk_mq_queue_enter(q)) - return NULL; + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT, + reserved); + if (!rq && (gfp & __GFP_WAIT)) { + __blk_mq_run_hw_queue(hctx); + blk_mq_put_ctx(ctx); - rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); - if (rq) - blk_mq_put_ctx(rq->mq_ctx); + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved); + } + blk_mq_put_ctx(ctx); return rq; } -EXPORT_SYMBOL(blk_mq_alloc_reserved_request); - -/* - * Re-init and set pdu, if we have it - */ -void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) -{ - blk_rq_init(hctx->queue, rq); - - if (hctx->cmd_size) - rq->special = blk_mq_rq_to_pdu(rq); -} +EXPORT_SYMBOL(blk_mq_alloc_request); static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq) @@ -265,9 +304,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; - blk_mq_rq_init(hctx, rq); - blk_mq_put_tag(hctx->tags, tag); + if (rq->cmd_flags & REQ_MQ_INFLIGHT) + atomic_dec(&hctx->nr_active); + clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 
+ blk_mq_put_tag(hctx, tag, &ctx->last_tag); blk_mq_queue_exit(q); } @@ -283,20 +324,47 @@ void blk_mq_free_request(struct request *rq) __blk_mq_free_request(hctx, ctx, rq); } -bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes) +/* + * Clone all relevant state from a request that has been put on hold in + * the flush state machine into the preallocated flush request that hangs + * off the request queue. + * + * For a driver the flush request should be invisible, that's why we are + * impersonating the original request here. + */ +void blk_mq_clone_flush_request(struct request *flush_rq, + struct request *orig_rq) { - if (blk_update_request(rq, error, blk_rq_bytes(rq))) - return true; + struct blk_mq_hw_ctx *hctx = + orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu); + flush_rq->mq_ctx = orig_rq->mq_ctx; + flush_rq->tag = orig_rq->tag; + memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq), + hctx->cmd_size); +} + +inline void __blk_mq_end_io(struct request *rq, int error) +{ blk_account_io_done(rq); - if (rq->end_io) + if (rq->end_io) { rq->end_io(rq, error); - else + } else { + if (unlikely(blk_bidi_rq(rq))) + blk_mq_free_request(rq->next_rq); blk_mq_free_request(rq); - return false; + } } -EXPORT_SYMBOL(blk_mq_end_io_partial); +EXPORT_SYMBOL(__blk_mq_end_io); + +void blk_mq_end_io(struct request *rq, int error) +{ + if (blk_update_request(rq, error, blk_rq_bytes(rq))) + BUG(); + __blk_mq_end_io(rq, error); +} +EXPORT_SYMBOL(blk_mq_end_io); static void __blk_mq_complete_request_remote(void *data) { @@ -308,15 +376,19 @@ static void __blk_mq_complete_request_remote(void *data) void __blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; + bool shared = false; int cpu; - if (!ctx->ipi_redirect) { + if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { rq->q->softirq_done_fn(rq); return; } cpu = get_cpu(); - if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { + if 
(!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) + shared = cpus_share_cache(cpu, ctx->cpu); + + if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; @@ -337,10 +409,16 @@ void __blk_mq_complete_request(struct request *rq) **/ void blk_mq_complete_request(struct request *rq) { - if (unlikely(blk_should_fake_timeout(rq->q))) + struct request_queue *q = rq->q; + + if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) - __blk_mq_complete_request(rq); + if (!blk_mark_rq_complete(rq)) { + if (q->softirq_done_fn) + __blk_mq_complete_request(rq); + else + blk_mq_end_io(rq, rq->errors); + } } EXPORT_SYMBOL(blk_mq_complete_request); @@ -350,13 +428,29 @@ static void blk_mq_start_request(struct request *rq, bool last) trace_block_rq_issue(q, rq); + rq->resid_len = blk_rq_bytes(rq); + if (unlikely(blk_bidi_rq(rq))) + rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + /* * Just mark start time and set the started bit. Due to memory * ordering, we know we'll see the correct deadline as long as - * REQ_ATOMIC_STARTED is seen. + * REQ_ATOMIC_STARTED is seen. Use the default queue timeout, + * unless one has been set in the request. + */ + if (!rq->timeout) + rq->deadline = jiffies + q->rq_timeout; + else + rq->deadline = jiffies + rq->timeout; + + /* + * Mark us as started and clear complete. Complete might have been + * set if requeue raced with timeout, which then marked it as + * complete. So be sure to clear complete again when we start + * the request, otherwise we'll ignore the completion event. 
*/ - rq->deadline = jiffies + q->rq_timeout; set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -378,7 +472,7 @@ static void blk_mq_start_request(struct request *rq, bool last) rq->cmd_flags |= REQ_END; } -static void blk_mq_requeue_request(struct request *rq) +static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; @@ -391,6 +485,80 @@ static void blk_mq_requeue_request(struct request *rq) rq->nr_phys_segments--; } +void blk_mq_requeue_request(struct request *rq) +{ + __blk_mq_requeue_request(rq); + blk_clear_rq_complete(rq); + + BUG_ON(blk_queued_rq(rq)); + blk_mq_add_to_requeue_list(rq, true); +} +EXPORT_SYMBOL(blk_mq_requeue_request); + +static void blk_mq_requeue_work(struct work_struct *work) +{ + struct request_queue *q = + container_of(work, struct request_queue, requeue_work); + LIST_HEAD(rq_list); + struct request *rq, *next; + unsigned long flags; + + spin_lock_irqsave(&q->requeue_lock, flags); + list_splice_init(&q->requeue_list, &rq_list); + spin_unlock_irqrestore(&q->requeue_lock, flags); + + list_for_each_entry_safe(rq, next, &rq_list, queuelist) { + if (!(rq->cmd_flags & REQ_SOFTBARRIER)) + continue; + + rq->cmd_flags &= ~REQ_SOFTBARRIER; + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, true, false, false); + } + + while (!list_empty(&rq_list)) { + rq = list_entry(rq_list.next, struct request, queuelist); + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, false, false, false); + } + + blk_mq_run_queues(q, false); +} + +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) +{ + struct request_queue *q = rq->q; + unsigned long flags; + + /* + * We abuse this flag that is otherwise used by the I/O scheduler to + * request head insertation from the workqueue. 
+ */ + BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); + + spin_lock_irqsave(&q->requeue_lock, flags); + if (at_head) { + rq->cmd_flags |= REQ_SOFTBARRIER; + list_add(&rq->queuelist, &q->requeue_list); + } else { + list_add_tail(&rq->queuelist, &q->requeue_list); + } + spin_unlock_irqrestore(&q->requeue_lock, flags); +} +EXPORT_SYMBOL(blk_mq_add_to_requeue_list); + +void blk_mq_kick_requeue_list(struct request_queue *q) +{ + kblockd_schedule_work(&q->requeue_work); +} +EXPORT_SYMBOL(blk_mq_kick_requeue_list); + +struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) +{ + return tags->rqs[tag]; +} +EXPORT_SYMBOL(blk_mq_tag_to_rq); + struct blk_mq_timeout_data { struct blk_mq_hw_ctx *hctx; unsigned long *next; @@ -412,12 +580,13 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags) do { struct request *rq; - tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); - if (tag >= hctx->queue_depth) + tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag); + if (tag >= hctx->tags->nr_tags) break; - rq = hctx->rqs[tag++]; - + rq = blk_mq_tag_to_rq(hctx->tags, tag++); + if (rq->q != hctx->queue) + continue; if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) continue; @@ -442,6 +611,28 @@ static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx, blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); } +static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq) +{ + struct request_queue *q = rq->q; + + /* + * We know that complete is set at this point. If STARTED isn't set + * anymore, then the request isn't active and the "timeout" should + * just be ignored. This can happen due to the bitflag ordering. + * Timeout first checks if STARTED is set, and if it is, assumes + * the request is active. But if we race with completion, then + * we both flags will get cleared. So check here again, and ignore + * a timeout event with a request that isn't active. 
+ */ + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + return BLK_EH_NOT_HANDLED; + + if (!q->mq_ops->timeout) + return BLK_EH_RESET_TIMER; + + return q->mq_ops->timeout(rq); +} + static void blk_mq_rq_timer(unsigned long data) { struct request_queue *q = (struct request_queue *) data; @@ -449,11 +640,24 @@ static void blk_mq_rq_timer(unsigned long data) unsigned long next = 0; int i, next_set = 0; - queue_for_each_hw_ctx(q, hctx, i) + queue_for_each_hw_ctx(q, hctx, i) { + /* + * If not software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!hctx->nr_ctx || !hctx->tags) + continue; + blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); + } - if (next_set) - mod_timer(&q->timeout, round_jiffies_up(next)); + if (next_set) { + next = blk_rq_timeout(round_jiffies_up(next)); + mod_timer(&q->timeout, next); + } else { + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_tag_idle(hctx); + } } /* @@ -495,9 +699,38 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } -void blk_mq_add_timer(struct request *rq) +/* + * Process software queues that have been marked busy, splicing them + * to the for-dispatch + */ +static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - __blk_add_timer(rq, NULL); + struct blk_mq_ctx *ctx; + int i; + + for (i = 0; i < hctx->ctx_map.map_size; i++) { + struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; + unsigned int off, bit; + + if (!bm->word) + continue; + + bit = 0; + off = i * hctx->ctx_map.bits_per_word; + do { + bit = find_next_bit(&bm->word, bm->depth, bit); + if (bit >= bm->depth) + break; + + ctx = hctx->ctxs[bit + off]; + clear_bit(bit, &bm->word); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, list); + spin_unlock(&ctx->lock); + + bit++; + } while (1); + } } /* @@ -509,10 +742,11 @@ void blk_mq_add_timer(struct request *rq) static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = 
hctx->queue; - struct blk_mq_ctx *ctx; struct request *rq; LIST_HEAD(rq_list); - int bit, queued; + int queued; + + WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return; @@ -522,15 +756,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) /* * Touch any software queue that has pending entries. */ - for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { - clear_bit(bit, hctx->ctx_map); - ctx = hctx->ctxs[bit]; - BUG_ON(bit != ctx->index_hw); - - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, &rq_list); - spin_unlock(&ctx->lock); - } + flush_busy_ctxs(hctx, &rq_list); /* * If we have previous entries on our dispatch list, grab them @@ -544,13 +770,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } /* - * Delete and return all entries from our dispatch list - */ - queued = 0; - - /* * Now process all the entries, sending them to the driver. */ + queued = 0; while (!list_empty(&rq_list)) { int ret; @@ -565,13 +787,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) queued++; continue; case BLK_MQ_RQ_QUEUE_BUSY: - /* - * FIXME: we should have a mechanism to stop the queue - * like blk_stop_queue, otherwise we will waste cpu - * time - */ list_add(&rq->queuelist, &rq_list); - blk_mq_requeue_request(rq); + __blk_mq_requeue_request(rq); break; default: pr_err("blk-mq: bad return on queue: %d\n", ret); @@ -601,17 +818,44 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } } +/* + * It'd be great if the workqueue API had a way to pass + * in a mask and had some smarts for more clever placement. + * For now we just round-robin here, switching for every + * BLK_MQ_CPU_WORK_BATCH queued items. 
+ */ +static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) +{ + int cpu = hctx->next_cpu; + + if (--hctx->next_cpu_batch <= 0) { + int next_cpu; + + next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(hctx->cpumask); + + hctx->next_cpu = next_cpu; + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } + + return cpu; +} + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return; - if (!async) + if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) __blk_mq_run_hw_queue(hctx); + else if (hctx->queue->nr_hw_queues == 1) + kblockd_schedule_delayed_work(&hctx->run_work, 0); else { - struct request_queue *q = hctx->queue; + unsigned int cpu; - kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); + cpu = blk_mq_hctx_next_cpu(hctx); + kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); } } @@ -626,14 +870,17 @@ void blk_mq_run_queues(struct request_queue *q, bool async) test_bit(BLK_MQ_S_STOPPED, &hctx->state)) continue; + preempt_disable(); blk_mq_run_hw_queue(hctx, async); + preempt_enable(); } } EXPORT_SYMBOL(blk_mq_run_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - cancel_delayed_work(&hctx->delayed_work); + cancel_delayed_work(&hctx->run_work); + cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } EXPORT_SYMBOL(blk_mq_stop_hw_queue); @@ -651,11 +898,25 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queues); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + + preempt_disable(); __blk_mq_run_hw_queue(hctx); + preempt_enable(); } EXPORT_SYMBOL(blk_mq_start_hw_queue); -void blk_mq_start_stopped_hw_queues(struct request_queue *q) +void blk_mq_start_hw_queues(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_start_hw_queue(hctx); +} 
+EXPORT_SYMBOL(blk_mq_start_hw_queues); + + +void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; int i; @@ -665,19 +926,47 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q) continue; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, true); + preempt_disable(); + blk_mq_run_hw_queue(hctx, async); + preempt_enable(); } } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); -static void blk_mq_work_fn(struct work_struct *work) +static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; - hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); + hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + __blk_mq_run_hw_queue(hctx); } +static void blk_mq_delay_work_fn(struct work_struct *work) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); + + if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) + __blk_mq_run_hw_queue(hctx); +} + +void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) +{ + unsigned long tmo = msecs_to_jiffies(msecs); + + if (hctx->queue->nr_hw_queues == 1) + kblockd_schedule_delayed_work(&hctx->delay_work, tmo); + else { + unsigned int cpu; + + cpu = blk_mq_hctx_next_cpu(hctx); + kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); + } +} +EXPORT_SYMBOL(blk_mq_delay_queue); + static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { @@ -689,12 +978,13 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, list_add(&rq->queuelist, &ctx->rq_list); else list_add_tail(&rq->queuelist, &ctx->rq_list); + blk_mq_hctx_mark_pending(hctx, ctx); /* * We do this early, to ensure we are on the right CPU. 
*/ - blk_mq_add_timer(rq); + blk_add_timer(rq); } void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, @@ -719,10 +1009,10 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, spin_unlock(&ctx->lock); } - blk_mq_put_ctx(current_ctx); - if (run_queue) blk_mq_run_hw_queue(hctx, async); + + blk_mq_put_ctx(current_ctx); } static void blk_mq_insert_requests(struct request_queue *q, @@ -758,9 +1048,8 @@ static void blk_mq_insert_requests(struct request_queue *q, } spin_unlock(&ctx->lock); - blk_mq_put_ctx(current_ctx); - blk_mq_run_hw_queue(hctx, from_schedule); + blk_mq_put_ctx(current_ctx); } static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -826,21 +1115,161 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, 1); } -static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, struct bio *bio) { + struct request_queue *q = hctx->queue; + + if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { + blk_mq_bio_to_request(rq, bio); + spin_lock(&ctx->lock); +insert_rq: + __blk_mq_insert_request(hctx, rq, false); + spin_unlock(&ctx->lock); + return false; + } else { + spin_lock(&ctx->lock); + if (!blk_mq_attempt_merge(q, ctx, bio)) { + blk_mq_bio_to_request(rq, bio); + goto insert_rq; + } + + spin_unlock(&ctx->lock); + __blk_mq_free_request(hctx, ctx, rq); + return true; + } +} + +struct blk_map_ctx { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; +}; + +static struct request *blk_mq_map_request(struct request_queue *q, + struct bio *bio, + struct blk_map_ctx *data) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + struct request *rq; + int rw = bio_data_dir(bio); + + if (unlikely(blk_mq_queue_enter(q))) { + bio_endio(bio, -EIO); + return NULL; + } + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, 
ctx->cpu); + + if (rw_is_sync(bio->bi_rw)) + rw |= REQ_SYNC; + + trace_block_getrq(q, bio, rw); + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false); + if (unlikely(!rq)) { + __blk_mq_run_hw_queue(hctx); + blk_mq_put_ctx(ctx); + trace_block_sleeprq(q, bio, rw); + + ctx = blk_mq_get_ctx(q); + hctx = q->mq_ops->map_queue(q, ctx->cpu); + rq = __blk_mq_alloc_request(q, hctx, ctx, rw, + __GFP_WAIT|GFP_ATOMIC, false); + } + + hctx->queued++; + data->hctx = hctx; + data->ctx = ctx; + return rq; +} + +/* + * Multiple hardware queue variant. This will not use per-process plugs, + * but will attempt to bypass the hctx queueing if we can go straight to + * hardware for SYNC IO. + */ +static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +{ const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - int rw = bio_data_dir(bio); + struct blk_map_ctx data; struct request *rq; + + blk_queue_bounce(q, &bio); + + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_endio(bio, -EIO); + return; + } + + rq = blk_mq_map_request(q, bio, &data); + if (unlikely(!rq)) + return; + + if (unlikely(is_flush_fua)) { + blk_mq_bio_to_request(rq, bio); + blk_insert_flush(rq); + goto run_queue; + } + + if (is_sync) { + int ret; + + blk_mq_bio_to_request(rq, bio); + blk_mq_start_request(rq, true); + + /* + * For OK queue, we are done. For error, kill it. Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(data.hctx, rq); + if (ret == BLK_MQ_RQ_QUEUE_OK) + goto done; + else { + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + rq->errors = -EIO; + blk_mq_end_io(rq, rq->errors); + goto done; + } + } + } + + if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* + * For a SYNC request, send it to the hardware immediately. For + * an ASYNC request, just ensure that we run it later on. 
The + * latter allows for merging opportunities and more efficient + * dispatching. + */ +run_queue: + blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); + } +done: + blk_mq_put_ctx(data.ctx); +} + +/* + * Single hardware queue variant. This will attempt to use any per-process + * plug for merging and IO deferral. + */ +static void blk_sq_make_request(struct request_queue *q, struct bio *bio) +{ + const int is_sync = rw_is_sync(bio->bi_rw); + const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); unsigned int use_plug, request_count = 0; + struct blk_map_ctx data; + struct request *rq; /* * If we have multiple hardware queues, just go directly to * one of those for sync IO. */ - use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); + use_plug = !is_flush_fua && !is_sync; blk_queue_bounce(q, &bio); @@ -849,37 +1278,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) return; } - if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) - return; - - if (blk_mq_queue_enter(q)) { - bio_endio(bio, -EIO); + if (use_plug && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) return; - } - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - - if (is_sync) - rw |= REQ_SYNC; - trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); - if (likely(rq)) - blk_mq_rq_ctx_init(q, ctx, rq, rw); - else { - blk_mq_put_ctx(ctx); - trace_block_sleeprq(q, bio, rw); - rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, - false); - ctx = rq->mq_ctx; - hctx = q->mq_ops->map_queue(q, ctx->cpu); - } - - hctx->queued++; + rq = blk_mq_map_request(q, bio, &data); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); - blk_mq_put_ctx(ctx); blk_insert_flush(rq); goto run_queue; } @@ -901,31 +1307,23 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_plug(q); } list_add_tail(&rq->queuelist, &plug->mq_list); - 
blk_mq_put_ctx(ctx); + blk_mq_put_ctx(data.ctx); return; } } - spin_lock(&ctx->lock); - - if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - blk_mq_attempt_merge(q, ctx, bio)) - __blk_mq_free_request(hctx, ctx, rq); - else { - blk_mq_bio_to_request(rq, bio); - __blk_mq_insert_request(hctx, rq, false); + if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* + * For a SYNC request, send it to the hardware immediately. For + * an ASYNC request, just ensure that we run it later on. The + * latter allows for merging opportunities and more efficient + * dispatching. + */ +run_queue: + blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } - spin_unlock(&ctx->lock); - blk_mq_put_ctx(ctx); - - /* - * For a SYNC request, send it to the hardware immediately. For an - * ASYNC request, just ensure that we run it later on. The latter - * allows for merging opportunities and more efficient dispatching. - */ -run_queue: - blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua); + blk_mq_put_ctx(data.ctx); } /* @@ -937,32 +1335,153 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) } EXPORT_SYMBOL(blk_mq_map_queue); -struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, - unsigned int hctx_index) +static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, unsigned int hctx_idx) { - return kmalloc_node(sizeof(struct blk_mq_hw_ctx), - GFP_KERNEL | __GFP_ZERO, reg->numa_node); + struct page *page; + + if (tags->rqs && set->ops->exit_request) { + int i; + + for (i = 0; i < tags->nr_tags; i++) { + if (!tags->rqs[i]) + continue; + set->ops->exit_request(set->driver_data, tags->rqs[i], + hctx_idx, i); + } + } + + while (!list_empty(&tags->page_list)) { + page = list_first_entry(&tags->page_list, struct page, lru); + list_del_init(&page->lru); + __free_pages(page, page->private); + } + + kfree(tags->rqs); + + blk_mq_free_tags(tags); } -EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue); -void 
blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, - unsigned int hctx_index) +static size_t order_to_size(unsigned int order) { - kfree(hctx); + return (size_t)PAGE_SIZE << order; } -EXPORT_SYMBOL(blk_mq_free_single_hw_queue); -static void blk_mq_hctx_notify(void *data, unsigned long action, - unsigned int cpu) +static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + struct blk_mq_tags *tags; + unsigned int i, j, entries_per_page, max_order = 4; + size_t rq_size, left; + + tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, + set->numa_node); + if (!tags) + return NULL; + + INIT_LIST_HEAD(&tags->page_list); + + tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *), + GFP_KERNEL, set->numa_node); + if (!tags->rqs) { + blk_mq_free_tags(tags); + return NULL; + } + + /* + * rq_size is the size of the request plus driver payload, rounded + * to the cacheline size + */ + rq_size = round_up(sizeof(struct request) + set->cmd_size, + cache_line_size()); + left = rq_size * set->queue_depth; + + for (i = 0; i < set->queue_depth; ) { + int this_order = max_order; + struct page *page; + int to_do; + void *p; + + while (left < order_to_size(this_order - 1) && this_order) + this_order--; + + do { + page = alloc_pages_node(set->numa_node, GFP_KERNEL, + this_order); + if (page) + break; + if (!this_order--) + break; + if (order_to_size(this_order) < rq_size) + break; + } while (1); + + if (!page) + goto fail; + + page->private = this_order; + list_add_tail(&page->lru, &tags->page_list); + + p = page_address(page); + entries_per_page = order_to_size(this_order) / rq_size; + to_do = min(entries_per_page, set->queue_depth - i); + left -= to_do * rq_size; + for (j = 0; j < to_do; j++) { + tags->rqs[i] = p; + if (set->ops->init_request) { + if (set->ops->init_request(set->driver_data, + tags->rqs[i], hctx_idx, i, + set->numa_node)) + goto fail; + } + + p += rq_size; + i++; + } + } + + return tags; + +fail: + 
pr_warn("%s: failed to allocate requests\n", __func__); + blk_mq_free_rq_map(set, tags, hctx_idx); + return NULL; +} + +static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) +{ + kfree(bitmap->map); +} + +static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) +{ + unsigned int bpw = 8, total, num_maps, i; + + bitmap->bits_per_word = bpw; + + num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; + bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), + GFP_KERNEL, node); + if (!bitmap->map) + return -ENOMEM; + + bitmap->map_size = num_maps; + + total = nr_cpu_ids; + for (i = 0; i < num_maps; i++) { + bitmap->map[i].depth = min(total, bitmap->bits_per_word); + total -= bitmap->map[i].depth; + } + + return 0; +} + +static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) { - struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return; - /* * Move ctx entries to new CPU, if this one is going away. 
*/ @@ -971,12 +1490,12 @@ static void blk_mq_hctx_notify(void *data, unsigned long action, spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_list)) { list_splice_init(&ctx->rq_list, &tmp); - clear_bit(ctx->index_hw, hctx->ctx_map); + blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); if (list_empty(&tmp)) - return; + return NOTIFY_OK; ctx = blk_mq_get_ctx(q); spin_lock(&ctx->lock); @@ -993,210 +1512,103 @@ static void blk_mq_hctx_notify(void *data, unsigned long action, blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); - blk_mq_put_ctx(ctx); blk_mq_run_hw_queue(hctx, true); + blk_mq_put_ctx(ctx); + return NOTIFY_OK; } -static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, - int (*init)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) +static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) { - unsigned int i; - int ret = 0; - - for (i = 0; i < hctx->queue_depth; i++) { - struct request *rq = hctx->rqs[i]; - - ret = init(data, hctx, rq, i); - if (ret) - break; - } - - return ret; -} + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; -int blk_mq_init_commands(struct request_queue *q, - int (*init)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) -{ - struct blk_mq_hw_ctx *hctx; - unsigned int i; - int ret = 0; + if (set->tags[hctx->queue_num]) + return NOTIFY_OK; - queue_for_each_hw_ctx(q, hctx, i) { - ret = blk_mq_init_hw_commands(hctx, init, data); - if (ret) - break; - } + set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); + if (!set->tags[hctx->queue_num]) + return NOTIFY_STOP; - return ret; + hctx->tags = set->tags[hctx->queue_num]; + return NOTIFY_OK; } -EXPORT_SYMBOL(blk_mq_init_commands); -static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx, - void (*free)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) +static int blk_mq_hctx_notify(void *data, unsigned 
long action, + unsigned int cpu) { - unsigned int i; + struct blk_mq_hw_ctx *hctx = data; - for (i = 0; i < hctx->queue_depth; i++) { - struct request *rq = hctx->rqs[i]; + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) + return blk_mq_hctx_cpu_offline(hctx, cpu); + else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + return blk_mq_hctx_cpu_online(hctx, cpu); - free(data, hctx, rq, i); - } + return NOTIFY_OK; } -void blk_mq_free_commands(struct request_queue *q, - void (*free)(void *, struct blk_mq_hw_ctx *, - struct request *, unsigned int), - void *data) +static void blk_mq_exit_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; unsigned int i; - queue_for_each_hw_ctx(q, hctx, i) - blk_mq_free_hw_commands(hctx, free, data); -} -EXPORT_SYMBOL(blk_mq_free_commands); + queue_for_each_hw_ctx(q, hctx, i) { + if (i == nr_queue) + break; -static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) -{ - struct page *page; + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, i); - while (!list_empty(&hctx->page_list)) { - page = list_first_entry(&hctx->page_list, struct page, lru); - list_del_init(&page->lru); - __free_pages(page, page->private); + blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + kfree(hctx->ctxs); + blk_mq_free_bitmap(&hctx->ctx_map); } - kfree(hctx->rqs); - - if (hctx->tags) - blk_mq_free_tags(hctx->tags); -} - -static size_t order_to_size(unsigned int order) -{ - size_t ret = PAGE_SIZE; - - while (order--) - ret *= 2; - - return ret; } -static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, - unsigned int reserved_tags, int node) +static void blk_mq_free_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set) { - unsigned int i, j, entries_per_page, max_order = 4; - size_t rq_size, left; - - INIT_LIST_HEAD(&hctx->page_list); - - hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *), - GFP_KERNEL, node); - if (!hctx->rqs) - return -ENOMEM; - - 
/* - * rq_size is the size of the request plus driver payload, rounded - * to the cacheline size - */ - rq_size = round_up(sizeof(struct request) + hctx->cmd_size, - cache_line_size()); - left = rq_size * hctx->queue_depth; - - for (i = 0; i < hctx->queue_depth;) { - int this_order = max_order; - struct page *page; - int to_do; - void *p; - - while (left < order_to_size(this_order - 1) && this_order) - this_order--; - - do { - page = alloc_pages_node(node, GFP_KERNEL, this_order); - if (page) - break; - if (!this_order--) - break; - if (order_to_size(this_order) < rq_size) - break; - } while (1); - - if (!page) - break; - - page->private = this_order; - list_add_tail(&page->lru, &hctx->page_list); - - p = page_address(page); - entries_per_page = order_to_size(this_order) / rq_size; - to_do = min(entries_per_page, hctx->queue_depth - i); - left -= to_do * rq_size; - for (j = 0; j < to_do; j++) { - hctx->rqs[i] = p; - blk_mq_rq_init(hctx, hctx->rqs[i]); - p += rq_size; - i++; - } - } - - if (i < (reserved_tags + BLK_MQ_TAG_MIN)) - goto err_rq_map; - else if (i != hctx->queue_depth) { - hctx->queue_depth = i; - pr_warn("%s: queue depth set to %u because of low memory\n", - __func__, i); - } + struct blk_mq_hw_ctx *hctx; + unsigned int i; - hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); - if (!hctx->tags) { -err_rq_map: - blk_mq_free_rq_map(hctx); - return -ENOMEM; + queue_for_each_hw_ctx(q, hctx, i) { + free_cpumask_var(hctx->cpumask); + kfree(hctx); } - - return 0; } static int blk_mq_init_hw_queues(struct request_queue *q, - struct blk_mq_reg *reg, void *driver_data) + struct blk_mq_tag_set *set) { struct blk_mq_hw_ctx *hctx; - unsigned int i, j; + unsigned int i; /* * Initialize hardware queues */ queue_for_each_hw_ctx(q, hctx, i) { - unsigned int num_maps; int node; node = hctx->numa_node; if (node == NUMA_NO_NODE) - node = hctx->numa_node = reg->numa_node; + node = hctx->numa_node = set->numa_node; - INIT_DELAYED_WORK(&hctx->delayed_work, 
blk_mq_work_fn); + INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; hctx->queue_num = i; - hctx->flags = reg->flags; - hctx->queue_depth = reg->queue_depth; - hctx->cmd_size = reg->cmd_size; + hctx->flags = set->flags; + hctx->cmd_size = set->cmd_size; blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); blk_mq_register_cpu_notifier(&hctx->cpu_notifier); - if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) - break; + hctx->tags = set->tags[i]; /* * Allocate space for all possible cpus to avoid allocation in @@ -1207,17 +1619,13 @@ static int blk_mq_init_hw_queues(struct request_queue *q, if (!hctx->ctxs) break; - num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; - hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long), - GFP_KERNEL, node); - if (!hctx->ctx_map) + if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) break; - hctx->nr_ctx_map = num_maps; hctx->nr_ctx = 0; - if (reg->ops->init_hctx && - reg->ops->init_hctx(hctx, driver_data, i)) + if (set->ops->init_hctx && + set->ops->init_hctx(hctx, set->driver_data, i)) break; } @@ -1227,17 +1635,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, /* * Init failed */ - queue_for_each_hw_ctx(q, hctx, j) { - if (i == j) - break; - - if (reg->ops->exit_hctx) - reg->ops->exit_hctx(hctx, j); - - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - blk_mq_free_rq_map(hctx); - kfree(hctx->ctxs); - } + blk_mq_exit_hw_queues(q, set, i); return 1; } @@ -1258,12 +1656,13 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, __ctx->queue = q; /* If the cpu isn't online, the cpu is mapped to first hctx */ - hctx = q->mq_ops->map_queue(q, i); - hctx->nr_ctx++; - if (!cpu_online(i)) continue; + hctx = q->mq_ops->map_queue(q, i); + cpumask_set_cpu(i, hctx->cpumask); + hctx->nr_ctx++; + /* * Set local node, IFF we have more than 
one hw queue. If * not, we remain on the home node of the device @@ -1280,6 +1679,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) struct blk_mq_ctx *ctx; queue_for_each_hw_ctx(q, hctx, i) { + cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; } @@ -1288,115 +1688,205 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ queue_for_each_ctx(q, ctx, i) { /* If the cpu isn't online, the cpu is mapped to first hctx */ + if (!cpu_online(i)) + continue; + hctx = q->mq_ops->map_queue(q, i); + cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; } + + queue_for_each_hw_ctx(q, hctx, i) { + /* + * If not software queues are mapped to this hardware queue, + * disable it and free the request entries + */ + if (!hctx->nr_ctx) { + struct blk_mq_tag_set *set = q->tag_set; + + if (set->tags[i]) { + blk_mq_free_rq_map(set, set->tags[i], i); + set->tags[i] = NULL; + hctx->tags = NULL; + } + continue; + } + + /* + * Initialize batch roundrobin counts + */ + hctx->next_cpu = cpumask_first(hctx->cpumask); + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } } -struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, - void *driver_data) +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) { - struct blk_mq_hw_ctx **hctxs; - struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; struct request_queue *q; + bool shared; int i; - if (!reg->nr_hw_queues || - !reg->ops->queue_rq || !reg->ops->map_queue || - !reg->ops->alloc_hctx || !reg->ops->free_hctx) - return ERR_PTR(-EINVAL); + if (set->tag_list.next == set->tag_list.prev) + shared = false; + else + shared = true; - if (!reg->queue_depth) - reg->queue_depth = BLK_MQ_MAX_DEPTH; - else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { - pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); - reg->queue_depth = BLK_MQ_MAX_DEPTH; + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_freeze_queue(q); + + queue_for_each_hw_ctx(q, hctx, i) { + if 
(shared) + hctx->flags |= BLK_MQ_F_TAG_SHARED; + else + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } + blk_mq_unfreeze_queue(q); } +} + +static void blk_mq_del_queue_tag_set(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + blk_mq_freeze_queue(q); + + mutex_lock(&set->tag_list_lock); + list_del_init(&q->tag_set_list); + blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); + + blk_mq_unfreeze_queue(q); +} + +static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + q->tag_set = set; + + mutex_lock(&set->tag_list_lock); + list_add_tail(&q->tag_set_list, &set->tag_list); + blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); +} - if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) - return ERR_PTR(-EINVAL); +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +{ + struct blk_mq_hw_ctx **hctxs; + struct blk_mq_ctx *ctx; + struct request_queue *q; + unsigned int *map; + int i; ctx = alloc_percpu(struct blk_mq_ctx); if (!ctx) return ERR_PTR(-ENOMEM); - hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, - reg->numa_node); + hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, + set->numa_node); if (!hctxs) goto err_percpu; - for (i = 0; i < reg->nr_hw_queues; i++) { - hctxs[i] = reg->ops->alloc_hctx(reg, i); + map = blk_mq_make_queue_map(set); + if (!map) + goto err_map; + + for (i = 0; i < set->nr_hw_queues; i++) { + int node = blk_mq_hw_queue_to_node(map, i); + + hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), + GFP_KERNEL, node); if (!hctxs[i]) goto err_hctxs; - hctxs[i]->numa_node = NUMA_NO_NODE; + if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL)) + goto err_hctxs; + + atomic_set(&hctxs[i]->nr_active, 0); + hctxs[i]->numa_node = node; hctxs[i]->queue_num = i; } - q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); + q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); if (!q) goto err_hctxs; - 
q->mq_map = blk_mq_make_queue_map(reg); - if (!q->mq_map) + if (percpu_counter_init(&q->mq_usage_counter, 0)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); blk_queue_rq_timeout(q, 30000); q->nr_queues = nr_cpu_ids; - q->nr_hw_queues = reg->nr_hw_queues; + q->nr_hw_queues = set->nr_hw_queues; + q->mq_map = map; q->queue_ctx = ctx; q->queue_hw_ctx = hctxs; - q->mq_ops = reg->ops; + q->mq_ops = set->ops; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; q->sg_reserved_size = INT_MAX; - blk_queue_make_request(q, blk_mq_make_request); - blk_queue_rq_timed_out(q, reg->ops->timeout); - if (reg->timeout) - blk_queue_rq_timeout(q, reg->timeout); + INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_LIST_HEAD(&q->requeue_list); + spin_lock_init(&q->requeue_lock); + + if (q->nr_hw_queues > 1) + blk_queue_make_request(q, blk_mq_make_request); + else + blk_queue_make_request(q, blk_sq_make_request); + + blk_queue_rq_timed_out(q, blk_mq_rq_timed_out); + if (set->timeout) + blk_queue_rq_timeout(q, set->timeout); - if (reg->ops->complete) - blk_queue_softirq_done(q, reg->ops->complete); + /* + * Do this after blk_queue_make_request() overrides it... 
+ */ + q->nr_requests = set->queue_depth; + + if (set->ops->complete) + blk_queue_softirq_done(q, set->ops->complete); blk_mq_init_flush(q); - blk_mq_init_cpu_queues(q, reg->nr_hw_queues); + blk_mq_init_cpu_queues(q, set->nr_hw_queues); - q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size, - cache_line_size()), GFP_KERNEL); + q->flush_rq = kzalloc(round_up(sizeof(struct request) + + set->cmd_size, cache_line_size()), + GFP_KERNEL); if (!q->flush_rq) goto err_hw; - if (blk_mq_init_hw_queues(q, reg, driver_data)) + if (blk_mq_init_hw_queues(q, set)) goto err_flush_rq; - blk_mq_map_swqueue(q); - mutex_lock(&all_q_mutex); list_add_tail(&q->all_q_node, &all_q_list); mutex_unlock(&all_q_mutex); + blk_mq_add_queue_tag_set(set, q); + + blk_mq_map_swqueue(q); + return q; err_flush_rq: kfree(q->flush_rq); err_hw: - kfree(q->mq_map); -err_map: blk_cleanup_queue(q); err_hctxs: - for (i = 0; i < reg->nr_hw_queues; i++) { + kfree(map); + for (i = 0; i < set->nr_hw_queues; i++) { if (!hctxs[i]) break; - reg->ops->free_hctx(hctxs[i], i); + free_cpumask_var(hctxs[i]->cpumask); + kfree(hctxs[i]); } +err_map: kfree(hctxs); err_percpu: free_percpu(ctx); @@ -1406,18 +1896,14 @@ EXPORT_SYMBOL(blk_mq_init_queue); void blk_mq_free_queue(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - int i; + struct blk_mq_tag_set *set = q->tag_set; - queue_for_each_hw_ctx(q, hctx, i) { - kfree(hctx->ctx_map); - kfree(hctx->ctxs); - blk_mq_free_rq_map(hctx); - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - if (q->mq_ops->exit_hctx) - q->mq_ops->exit_hctx(hctx, i); - q->mq_ops->free_hctx(hctx, i); - } + blk_mq_del_queue_tag_set(q); + + blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); + blk_mq_free_hw_queues(q, set); + + percpu_counter_destroy(&q->mq_usage_counter); free_percpu(q->queue_ctx); kfree(q->queue_hw_ctx); @@ -1456,10 +1942,10 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, struct request_queue *q; /* - * Before new mapping is established, 
hotadded cpu might already start - * handling requests. This doesn't break anything as we map offline - * CPUs to first hardware queue. We will re-init queue below to get - * optimal settings. + * Before new mappings are established, hotadded cpu might already + * start handling requests. This doesn't break anything as we map + * offline CPUs to first hardware queue. We will re-init the queue + * below to get optimal settings. */ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) @@ -1472,6 +1958,81 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, return NOTIFY_OK; } +int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) +{ + int i; + + if (!set->nr_hw_queues) + return -EINVAL; + if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH) + return -EINVAL; + if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) + return -EINVAL; + + if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) + return -EINVAL; + + + set->tags = kmalloc_node(set->nr_hw_queues * + sizeof(struct blk_mq_tags *), + GFP_KERNEL, set->numa_node); + if (!set->tags) + goto out; + + for (i = 0; i < set->nr_hw_queues; i++) { + set->tags[i] = blk_mq_init_rq_map(set, i); + if (!set->tags[i]) + goto out_unwind; + } + + mutex_init(&set->tag_list_lock); + INIT_LIST_HEAD(&set->tag_list); + + return 0; + +out_unwind: + while (--i >= 0) + blk_mq_free_rq_map(set, set->tags[i], i); +out: + return -ENOMEM; +} +EXPORT_SYMBOL(blk_mq_alloc_tag_set); + +void blk_mq_free_tag_set(struct blk_mq_tag_set *set) +{ + int i; + + for (i = 0; i < set->nr_hw_queues; i++) { + if (set->tags[i]) + blk_mq_free_rq_map(set, set->tags[i], i); + } + + kfree(set->tags); +} +EXPORT_SYMBOL(blk_mq_free_tag_set); + +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int i, ret; + + if (!set || nr > set->queue_depth) + return -EINVAL; + + ret = 
0; + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_tag_update_depth(hctx->tags, nr); + if (ret) + break; + } + + if (!ret) + q->nr_requests = nr; + + return ret; +} + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); diff --git a/block/blk-mq.h b/block/blk-mq.h index ebbe6bac9d61..ff5e6bf0f691 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +struct blk_mq_tag_set; + struct blk_mq_ctx { struct { spinlock_t lock; @@ -9,7 +11,8 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; - unsigned int ipi_redirect; + + unsigned int last_tag ____cacheline_aligned_in_smp; /* incremented at dispatch time */ unsigned long rq_dispatched[2]; @@ -20,21 +23,23 @@ struct blk_mq_ctx { struct request_queue *queue; struct kobject kobj; -}; +} ____cacheline_aligned_in_smp; void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_init_flush(struct request_queue *q); void blk_mq_drain_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); -void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq); +void blk_mq_clone_flush_request(struct request *flush_rq, + struct request *orig_rq); +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); /* * CPU hotplug helpers */ struct blk_mq_cpu_notifier; void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, - void (*fn)(void *, unsigned long, unsigned int), + int (*fn)(void *, unsigned long, unsigned int), void *data); void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); @@ -45,10 +50,17 @@ void blk_mq_disable_hotplug(void); /* * CPU -> queue mappings */ -struct blk_mq_reg; -extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg); +extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); extern int 
blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); +extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); -void blk_mq_add_timer(struct request *rq); +/* + * Basic implementation of sparser bitmap, allowing the user to spread + * the bits over more cachelines. + */ +struct blk_align_bitmap { + unsigned long word; + unsigned long depth; +} ____cacheline_aligned_in_smp; #endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7500f876dae4..23321fbab293 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) static ssize_t queue_requests_store(struct request_queue *q, const char *page, size_t count) { - struct request_list *rl; unsigned long nr; - int ret; + int ret, err; - if (!q->request_fn) + if (!q->request_fn && !q->mq_ops) return -EINVAL; ret = queue_var_store(&nr, page, count); @@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - spin_lock_irq(q->queue_lock); - q->nr_requests = nr; - blk_queue_congestion_threshold(q); - - /* congestion isn't cgroup aware and follows root blkcg for now */ - rl = &q->root_rl; - - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_SYNC); - - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_ASYNC); - - blk_queue_for_each_rl(rl, q) { - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_SYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_SYNC); - wake_up(&rl->wait[BLK_RW_SYNC]); - } - - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { - blk_set_rl_full(rl, 
BLK_RW_ASYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_ASYNC); - wake_up(&rl->wait[BLK_RW_ASYNC]); - } - } + if (q->request_fn) + err = blk_update_nr_requests(q, nr); + else + err = blk_mq_update_nr_requests(q, nr); + + if (err) + return err; - spin_unlock_irq(q->queue_lock); return ret; } @@ -544,8 +517,6 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - percpu_counter_destroy(&q->mq_usage_counter); - if (q->mq_ops) blk_mq_free_queue(q); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 033745cd7fba..9353b4683359 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -744,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) - return 0; + return false; return 1; } @@ -842,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, if (tg->io_disp[rw] + 1 <= io_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ @@ -880,7 +880,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { if (wait) *wait = 0; - return 1; + return true; } /* Calc approx time to dispatch */ @@ -923,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { if (wait) *wait = 0; - return 1; + return true; } /* @@ -1258,7 +1258,7 @@ out_unlock: * of throtl_data->service_queue. Those bio's are ready and issued by this * function. 
*/ -void blk_throtl_dispatch_work_fn(struct work_struct *work) +static void blk_throtl_dispatch_work_fn(struct work_struct *work) { struct throtl_data *td = container_of(work, struct throtl_data, dispatch_work); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index d96f7061c6fd..43e8b515806f 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -96,11 +96,7 @@ static void blk_rq_timed_out(struct request *req) __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: - if (q->mq_ops) - blk_mq_add_timer(req); - else - blk_add_timer(req); - + blk_add_timer(req); blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: @@ -170,7 +166,26 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); -void __blk_add_timer(struct request *req, struct list_head *timeout_list) +unsigned long blk_rq_timeout(unsigned long timeout) +{ + unsigned long maxt; + + maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); + if (time_after(timeout, maxt)) + timeout = maxt; + + return timeout; +} + +/** + * blk_add_timer - Start timeout timer for a single request + * @req: request that is about to start running. + * + * Notes: + * Each request has its own timer, and as it is added to the queue, we + * set up the timer. When the request completes, we cancel the timer. + */ +void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; @@ -188,32 +203,29 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; - if (timeout_list) - list_add_tail(&req->timeout_list, timeout_list); + if (!q->mq_ops) + list_add_tail(&req->timeout_list, &req->q->timeout_list); /* * If the timer isn't already pending or this timeout is earlier * than an existing one, modify the timer. Round up to next nearest * second. 
*/ - expiry = round_jiffies_up(req->deadline); + expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); if (!timer_pending(&q->timeout) || - time_before(expiry, q->timeout.expires)) - mod_timer(&q->timeout, expiry); + time_before(expiry, q->timeout.expires)) { + unsigned long diff = q->timeout.expires - expiry; -} + /* + * Due to added timer slack to group timers, the timer + * will often be a little in front of what we asked for. + * So apply some tolerance here too, otherwise we keep + * modifying the timer because expires for value X + * will be X + something. + */ + if (diff >= HZ / 2) + mod_timer(&q->timeout, expiry); + } -/** - * blk_add_timer - Start timeout timer for a single request - * @req: request that is about to start running. - * - * Notes: - * Each request has its own timer, and as it is added to the queue, we - * set up the timer. When the request completes, we cancel the timer. - */ -void blk_add_timer(struct request *req) -{ - __blk_add_timer(req, &req->q->timeout_list); } - diff --git a/block/blk.h b/block/blk.h index 1d880f1f957f..45385e9abf6f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -9,6 +9,9 @@ /* Number of requests a "batching" process may submit */ #define BLK_BATCH_REQ 32 +/* Max future timer expiry for timeouts */ +#define BLK_MAX_TIMEOUT (5 * HZ) + extern struct kmem_cache *blk_requestq_cachep; extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; @@ -37,9 +40,9 @@ bool __blk_end_bidi_request(struct request *rq, int error, void blk_rq_timed_out_timer(unsigned long data); void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set); -void __blk_add_timer(struct request *req, struct list_head *timeout_list); +unsigned long blk_rq_timeout(unsigned long timeout); +void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); -void blk_add_timer(struct request *); bool bio_attempt_front_merge(struct request_queue *q, struct request *req, @@ -185,6 
+188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +extern int blk_update_nr_requests(struct request_queue *, unsigned int); + /* * Contribute to IO statistics IFF: * diff --git a/mm/bounce.c b/block/bounce.c index 523918b8c6dc..523918b8c6dc 100644 --- a/mm/bounce.c +++ b/block/bounce.c diff --git a/block/bsg.c b/block/bsg.c index 420a5a9f1b23..e5214c148096 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -1008,7 +1008,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, /* * we need a proper transport to send commands, not a stacked device */ - if (!q->request_fn) + if (!queue_is_rq_based(q)) return 0; bcd = &q->bsg_dev; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e0985f1955e7..22dffebc7c73 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -908,7 +908,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) { if (cfqd->busy_queues) { cfq_log(cfqd, "schedule dispatch"); - kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); + kblockd_schedule_work(&cfqd->unplug_work); } } @@ -4460,7 +4460,7 @@ out_free: static ssize_t cfq_var_show(unsigned int var, char *page) { - return sprintf(page, "%d\n", var); + return sprintf(page, "%u\n", var); } static ssize_t diff --git a/fs/ioprio.c b/block/ioprio.c index e50170ca7c33..e50170ca7c33 100644 --- a/fs/ioprio.c +++ b/block/ioprio.c diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 748dea4f34dc..758da2287d9a 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1406,7 +1406,7 @@ next_segment: track = block / (floppy->dtype->sects * floppy->type->sect_mult); sector = block % (floppy->dtype->sects * floppy->type->sect_mult); - data = rq->buffer + 512 * cnt; + data = bio_data(rq->bio) + 512 * cnt; #ifdef DEBUG printk("access to track %d, sector %d, with buffer at " "0x%08lx\n", track, sector, data); diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 
96b629e1f0c9..7e8a55f8917c 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1484,7 +1484,7 @@ repeat: ReqCnt = 0; ReqCmd = rq_data_dir(fd_request); ReqBlock = blk_rq_pos(fd_request); - ReqBuffer = fd_request->buffer; + ReqBuffer = bio_data(fd_request->bio); setup_req_params( drive ); do_fd_action( drive ); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 73894ca33956..4595c22f33f7 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4080,7 +4080,7 @@ static void cciss_interrupt_mode(ctlr_info_t *h) goto default_int_mode; if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) { - err = pci_enable_msix(h->pdev, cciss_msix_entries, 4); + err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4); if (!err) { h->intr[0] = cciss_msix_entries[0].vector; h->intr[1] = cciss_msix_entries[1].vector; @@ -4088,10 +4088,6 @@ static void cciss_interrupt_mode(ctlr_info_t *h) h->intr[3] = cciss_msix_entries[3].vector; h->msix_vector = 1; return; - } - if (err > 0) { - dev_warn(&h->pdev->dev, - "only %d MSI-X vectors available\n", err); } else { dev_warn(&h->pdev->dev, "MSI-X init failed %d\n", err); diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 90ae4ba8f9ee..05a1780ffa85 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -29,7 +29,6 @@ #include <linux/drbd_limits.h> #include <linux/dynamic_debug.h> #include "drbd_int.h" -#include "drbd_wrappers.h" enum al_transaction_types { @@ -204,7 +203,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd BUG_ON(!bdev->md_bdev); - drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", + dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", current->comm, current->pid, __func__, (unsigned long long)sector, (rw & WRITE) ? 
"WRITE" : "READ", (void*)_RET_IP_ ); @@ -276,7 +275,6 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval return _al_get(device, first, true); } -static bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i) { /* for bios crossing activity log extent boundaries, @@ -846,7 +844,7 @@ void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size, int wake_up = 0; unsigned long flags; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; @@ -920,7 +918,7 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size if (size == 0) return 0; - if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "sector: %llus, size: %d\n", (unsigned long long)sector, size); return 0; @@ -1023,8 +1021,7 @@ int drbd_rs_begin_io(struct drbd_device *device, sector_t sector) unsigned int enr = BM_SECT_TO_EXT(sector); struct bm_extent *bm_ext; int i, sig; - int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait. - 200 times -> 20 seconds. */ + bool sa; retry: sig = wait_event_interruptible(device->al_wait, @@ -1035,12 +1032,15 @@ retry: if (test_bit(BME_LOCKED, &bm_ext->flags)) return 0; + /* step aside only while we are above c-min-rate; unless disabled. 
*/ + sa = drbd_rs_c_min_rate_throttle(device); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { sig = wait_event_interruptible(device->al_wait, !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || - test_bit(BME_PRIORITY, &bm_ext->flags)); + (sa && test_bit(BME_PRIORITY, &bm_ext->flags))); - if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) { + if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) { spin_lock_irq(&device->al_lock); if (lc_put(device->resync, &bm_ext->lce) == 0) { bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ @@ -1052,9 +1052,6 @@ retry: return -EINTR; if (schedule_timeout_interruptible(HZ/10)) return -EINTR; - if (sa && --sa == 0) - drbd_warn(device, "drbd_rs_begin_io() stepped aside for 20sec." - "Resync stalled?\n"); goto retry; } } @@ -1288,7 +1285,7 @@ void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size) sector_t esector, nr_sectors; int wake_up = 0; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e7093d4291f1..a76ceb344d64 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -382,6 +382,12 @@ enum { __EE_CALL_AL_COMPLETE_IO, __EE_MAY_SET_IN_SYNC, + /* is this a TRIM aka REQ_DISCARD? */ + __EE_IS_TRIM, + /* our lower level cannot handle trim, + * and we want to fall back to zeroout instead */ + __EE_IS_TRIM_USE_ZEROOUT, + /* In case a barrier failed, * we need to resubmit without the barrier flag. 
*/ __EE_RESUBMITTED, @@ -405,7 +411,9 @@ enum { }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) -#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) +#define EE_IS_TRIM (1<<__EE_IS_TRIM) +#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT) +#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) #define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) @@ -579,6 +587,7 @@ struct drbd_resource { struct list_head resources; struct res_opts res_opts; struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */ + struct mutex adm_mutex; /* mutex to serialize administrative requests */ spinlock_t req_lock; unsigned susp:1; /* IO suspended by user */ @@ -609,6 +618,7 @@ struct drbd_connection { struct drbd_socket data; /* data/barrier/cstate/parameter packets */ struct drbd_socket meta; /* ping/ack (metadata) packets */ int agreed_pro_version; /* actually used protocol version */ + u32 agreed_features; unsigned long last_received; /* in jiffies, either socket */ unsigned int ko_count; @@ -814,6 +824,28 @@ struct drbd_device { struct submit_worker submit; }; +struct drbd_config_context { + /* assigned from drbd_genlmsghdr */ + unsigned int minor; + /* assigned from request attributes, if present */ + unsigned int volume; +#define VOLUME_UNSPECIFIED (-1U) + /* pointer into the request skb, + * limited lifetime! 
*/ + char *resource_name; + struct nlattr *my_addr; + struct nlattr *peer_addr; + + /* reply buffer */ + struct sk_buff *reply_skb; + /* pointer into reply buffer */ + struct drbd_genlmsghdr *reply_dh; + /* resolved from attributes, if possible */ + struct drbd_device *device; + struct drbd_resource *resource; + struct drbd_connection *connection; +}; + static inline struct drbd_device *minor_to_device(unsigned int minor) { return (struct drbd_device *)idr_find(&drbd_devices, minor); @@ -821,7 +853,7 @@ static inline struct drbd_device *minor_to_device(unsigned int minor) static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device) { - return list_first_entry(&device->peer_devices, struct drbd_peer_device, peer_devices); + return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); } #define for_each_resource(resource, _resources) \ @@ -1139,6 +1171,12 @@ struct bm_extent { #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ #define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ +/* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. 
We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ +#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE +#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9) + extern int drbd_bm_init(struct drbd_device *device); extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); extern void drbd_bm_cleanup(struct drbd_device *device); @@ -1229,9 +1267,9 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); extern rwlock_t global_state_lock; extern int conn_lowest_minor(struct drbd_connection *connection); -enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr); +extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); extern void drbd_destroy_device(struct kref *kref); -extern void drbd_delete_device(struct drbd_device *mdev); +extern void drbd_delete_device(struct drbd_device *device); extern struct drbd_resource *drbd_create_resource(const char *name); extern void drbd_free_resource(struct drbd_resource *resource); @@ -1257,7 +1295,7 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ -extern int drbd_msg_put_info(const char *info); +extern int drbd_msg_put_info(struct sk_buff *skb, const char *info); extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device); extern char *ppsize(char *buf, unsigned long long size); @@ -1283,6 +1321,10 @@ extern void conn_try_outdate_peer_async(struct drbd_connection *connection); extern int drbd_khelper(struct drbd_device *device, char *cmd); /* drbd_worker.c */ +/* bi_end_io handlers */ +extern void drbd_md_io_complete(struct bio *bio, int error); +extern void drbd_peer_request_endio(struct bio *bio, int error); +extern void drbd_request_endio(struct bio *bio, int error); extern int drbd_worker(struct drbd_thread *thi); enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int 
o_minor); void drbd_resync_after_changed(struct drbd_device *device); @@ -1332,16 +1374,20 @@ extern int w_start_resync(struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); extern void start_resync_timer_fn(unsigned long data); +extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); + /* drbd_receiver.c */ extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_asender(struct drbd_thread *thi); -extern int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); +extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); +extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); extern int drbd_submit_peer_request(struct drbd_device *, struct drbd_peer_request *, const unsigned, const int); extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, sector_t, unsigned int, + bool, gfp_t) __must_hold(local); extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *, int); @@ -1401,6 +1447,37 @@ static inline void drbd_tcp_quickack(struct socket *sock) (char*)&val, sizeof(val)); } +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(struct drbd_device *device, + sector_t size) +{ + /* set_capacity(device->this_bdev->bd_disk, size); */ + set_capacity(device->vdisk, size); + device->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(struct drbd_device *device, + int fault_type, struct bio *bio) +{ + __release(local); + if (!bio->bi_bdev) { + printk(KERN_ERR "drbd%d: drbd_generic_make_request: " + "bio->bi_bdev == NULL\n", + device_to_minor(device)); + dump_stack(); + bio_endio(bio, -ENODEV); + return; + } + + if (drbd_insert_fault(device, fault_type)) + bio_endio(bio, -EIO); + else + 
generic_make_request(bio); +} + void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo); /* drbd_proc.c */ @@ -1410,6 +1487,7 @@ extern const char *drbd_conn_str(enum drbd_conns s); extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ +extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i); extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate); extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i); @@ -2144,7 +2222,7 @@ static inline void drbd_md_flush(struct drbd_device *device) static inline struct drbd_connection *first_connection(struct drbd_resource *resource) { - return list_first_entry(&resource->connections, + return list_first_entry_or_null(&resource->connections, struct drbd_connection, connections); } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 331e5cc1227d..960645c26e6f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1607,8 +1607,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long b return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; } -/* Used to send write requests - * R_PRIMARY -> Peer (P_DATA) +/* Used to send write or TRIM aka REQ_DISCARD requests + * R_PRIMARY -> Peer (P_DATA, P_TRIM) */ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req) { @@ -1640,6 +1640,16 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * dp_flags |= DP_SEND_WRITE_ACK; } p->dp_flags = cpu_to_be32(dp_flags); + + if (dp_flags & DP_DISCARD) { + struct p_trim *t = (struct p_trim*)p; + t->size = cpu_to_be32(req->i.size); + err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); + goto out; + } + + /* our digest is still only over the payload. 
+ * TRIM does not carry any payload. */ if (dgs) drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size); @@ -1675,6 +1685,7 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * ... Be noisy about digest too large ... } */ } +out: mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ return err; @@ -2570,6 +2581,7 @@ struct drbd_resource *drbd_create_resource(const char *name) INIT_LIST_HEAD(&resource->connections); list_add_tail_rcu(&resource->resources, &drbd_resources); mutex_init(&resource->conf_update); + mutex_init(&resource->adm_mutex); spin_lock_init(&resource->req_lock); return resource; @@ -2687,14 +2699,16 @@ static int init_submitter(struct drbd_device *device) return 0; } -enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr) +enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor) { + struct drbd_resource *resource = adm_ctx->resource; struct drbd_connection *connection; struct drbd_device *device; struct drbd_peer_device *peer_device, *tmp_peer_device; struct gendisk *disk; struct request_queue *q; int id; + int vnr = adm_ctx->volume; enum drbd_ret_code err = ERR_NOMEM; device = minor_to_device(minor); @@ -2763,7 +2777,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_MINOR_EXISTS; - drbd_msg_put_info("requested minor exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); } goto out_no_minor_idr; } @@ -2773,7 +2787,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_MINOR_EXISTS; - drbd_msg_put_info("requested minor exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists 
already"); } goto out_idr_remove_minor; } @@ -2794,7 +2808,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_INVALID_REQUEST; - drbd_msg_put_info("requested volume exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already"); } goto out_idr_remove_from_resource; } @@ -2803,7 +2817,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (init_submitter(device)) { err = ERR_NOMEM; - drbd_msg_put_info("unable to create submit workqueue"); + drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue"); goto out_idr_remove_vol; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 526414bc2cab..1b35c45c92b7 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -34,7 +34,6 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" -#include "drbd_wrappers.h" #include <asm/unaligned.h> #include <linux/drbd_limits.h> #include <linux/kthread.h> @@ -82,32 +81,6 @@ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); /* used blkdev_get_by_path, to claim our meta data device(s) */ static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; -/* Configuration is strictly serialized, because generic netlink message - * processing is strictly serialized by the genl_lock(). - * Which means we can use one static global drbd_config_context struct. - */ -static struct drbd_config_context { - /* assigned from drbd_genlmsghdr */ - unsigned int minor; - /* assigned from request attributes, if present */ - unsigned int volume; -#define VOLUME_UNSPECIFIED (-1U) - /* pointer into the request skb, - * limited lifetime! 
*/ - char *resource_name; - struct nlattr *my_addr; - struct nlattr *peer_addr; - - /* reply buffer */ - struct sk_buff *reply_skb; - /* pointer into reply buffer */ - struct drbd_genlmsghdr *reply_dh; - /* resolved from attributes, if possible */ - struct drbd_device *device; - struct drbd_resource *resource; - struct drbd_connection *connection; -} adm_ctx; - static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) { genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); @@ -117,9 +90,8 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only * reason it could fail was no space in skb, and there are 4k available. */ -int drbd_msg_put_info(const char *info) +int drbd_msg_put_info(struct sk_buff *skb, const char *info) { - struct sk_buff *skb = adm_ctx.reply_skb; struct nlattr *nla; int err = -EMSGSIZE; @@ -143,42 +115,46 @@ int drbd_msg_put_info(const char *info) * and per-family private info->pointers. * But we need to stay compatible with older kernels. * If it returns successfully, adm_ctx members are valid. + * + * At this point, we still rely on the global genl_lock(). + * If we want to avoid that, and allow "genl_family.parallel_ops", we may need + * to add additional synchronization against object destruction/modification. 
*/ #define DRBD_ADM_NEED_MINOR 1 #define DRBD_ADM_NEED_RESOURCE 2 #define DRBD_ADM_NEED_CONNECTION 4 -static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, - unsigned flags) +static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, + struct sk_buff *skb, struct genl_info *info, unsigned flags) { struct drbd_genlmsghdr *d_in = info->userhdr; const u8 cmd = info->genlhdr->cmd; int err; - memset(&adm_ctx, 0, sizeof(adm_ctx)); + memset(adm_ctx, 0, sizeof(*adm_ctx)); /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) return -EPERM; - adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); - if (!adm_ctx.reply_skb) { + adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!adm_ctx->reply_skb) { err = -ENOMEM; goto fail; } - adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb, + adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb, info, &drbd_genl_family, 0, cmd); /* put of a few bytes into a fresh skb of >= 4k will always succeed. * but anyways */ - if (!adm_ctx.reply_dh) { + if (!adm_ctx->reply_dh) { err = -ENOMEM; goto fail; } - adm_ctx.reply_dh->minor = d_in->minor; - adm_ctx.reply_dh->ret_code = NO_ERROR; + adm_ctx->reply_dh->minor = d_in->minor; + adm_ctx->reply_dh->ret_code = NO_ERROR; - adm_ctx.volume = VOLUME_UNSPECIFIED; + adm_ctx->volume = VOLUME_UNSPECIFIED; if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { struct nlattr *nla; /* parse and validate only */ @@ -188,111 +164,131 @@ static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, /* It was present, and valid, * copy it over to the reply skb. 
*/ - err = nla_put_nohdr(adm_ctx.reply_skb, + err = nla_put_nohdr(adm_ctx->reply_skb, info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, info->attrs[DRBD_NLA_CFG_CONTEXT]); if (err) goto fail; - /* and assign stuff to the global adm_ctx */ + /* and assign stuff to the adm_ctx */ nla = nested_attr_tb[__nla_type(T_ctx_volume)]; if (nla) - adm_ctx.volume = nla_get_u32(nla); + adm_ctx->volume = nla_get_u32(nla); nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; if (nla) - adm_ctx.resource_name = nla_data(nla); - adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; - adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; - if ((adm_ctx.my_addr && - nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.connection->my_addr)) || - (adm_ctx.peer_addr && - nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.connection->peer_addr))) { + adm_ctx->resource_name = nla_data(nla); + adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; + adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; + if ((adm_ctx->my_addr && + nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) || + (adm_ctx->peer_addr && + nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) { err = -EINVAL; goto fail; } } - adm_ctx.minor = d_in->minor; - adm_ctx.device = minor_to_device(d_in->minor); - if (adm_ctx.resource_name) { - adm_ctx.resource = drbd_find_resource(adm_ctx.resource_name); + adm_ctx->minor = d_in->minor; + adm_ctx->device = minor_to_device(d_in->minor); + + /* We are protected by the global genl_lock(). + * But we may explicitly drop it/retake it in drbd_adm_set_role(), + * so make sure this object stays around. 
*/ + if (adm_ctx->device) + kref_get(&adm_ctx->device->kref); + + if (adm_ctx->resource_name) { + adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name); } - if (!adm_ctx.device && (flags & DRBD_ADM_NEED_MINOR)) { - drbd_msg_put_info("unknown minor"); + if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor"); return ERR_MINOR_INVALID; } - if (!adm_ctx.resource && (flags & DRBD_ADM_NEED_RESOURCE)) { - drbd_msg_put_info("unknown resource"); - if (adm_ctx.resource_name) + if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource"); + if (adm_ctx->resource_name) return ERR_RES_NOT_KNOWN; return ERR_INVALID_REQUEST; } if (flags & DRBD_ADM_NEED_CONNECTION) { - if (adm_ctx.resource) { - drbd_msg_put_info("no resource name expected"); + if (adm_ctx->resource) { + drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected"); return ERR_INVALID_REQUEST; } - if (adm_ctx.device) { - drbd_msg_put_info("no minor number expected"); + if (adm_ctx->device) { + drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected"); return ERR_INVALID_REQUEST; } - if (adm_ctx.my_addr && adm_ctx.peer_addr) - adm_ctx.connection = conn_get_by_addrs(nla_data(adm_ctx.my_addr), - nla_len(adm_ctx.my_addr), - nla_data(adm_ctx.peer_addr), - nla_len(adm_ctx.peer_addr)); - if (!adm_ctx.connection) { - drbd_msg_put_info("unknown connection"); + if (adm_ctx->my_addr && adm_ctx->peer_addr) + adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr), + nla_len(adm_ctx->my_addr), + nla_data(adm_ctx->peer_addr), + nla_len(adm_ctx->peer_addr)); + if (!adm_ctx->connection) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection"); return ERR_INVALID_REQUEST; } } /* some more paranoia, if the request was over-determined */ - if (adm_ctx.device && adm_ctx.resource && - adm_ctx.device->resource != adm_ctx.resource) { + if (adm_ctx->device && adm_ctx->resource && + 
adm_ctx->device->resource != adm_ctx->resource) { pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n", - adm_ctx.minor, adm_ctx.resource->name, - adm_ctx.device->resource->name); - drbd_msg_put_info("minor exists in different resource"); + adm_ctx->minor, adm_ctx->resource->name, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource"); return ERR_INVALID_REQUEST; } - if (adm_ctx.device && - adm_ctx.volume != VOLUME_UNSPECIFIED && - adm_ctx.volume != adm_ctx.device->vnr) { + if (adm_ctx->device && + adm_ctx->volume != VOLUME_UNSPECIFIED && + adm_ctx->volume != adm_ctx->device->vnr) { pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", - adm_ctx.minor, adm_ctx.volume, - adm_ctx.device->vnr, - adm_ctx.device->resource->name); - drbd_msg_put_info("minor exists as different volume"); + adm_ctx->minor, adm_ctx->volume, + adm_ctx->device->vnr, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume"); return ERR_INVALID_REQUEST; } + /* still, provide adm_ctx->resource always, if possible. */ + if (!adm_ctx->resource) { + adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource + : adm_ctx->connection ? 
adm_ctx->connection->resource : NULL; + if (adm_ctx->resource) + kref_get(&adm_ctx->resource->kref); + } + return NO_ERROR; fail: - nlmsg_free(adm_ctx.reply_skb); - adm_ctx.reply_skb = NULL; + nlmsg_free(adm_ctx->reply_skb); + adm_ctx->reply_skb = NULL; return err; } -static int drbd_adm_finish(struct genl_info *info, int retcode) +static int drbd_adm_finish(struct drbd_config_context *adm_ctx, + struct genl_info *info, int retcode) { - if (adm_ctx.connection) { - kref_put(&adm_ctx.connection->kref, drbd_destroy_connection); - adm_ctx.connection = NULL; + if (adm_ctx->device) { + kref_put(&adm_ctx->device->kref, drbd_destroy_device); + adm_ctx->device = NULL; } - if (adm_ctx.resource) { - kref_put(&adm_ctx.resource->kref, drbd_destroy_resource); - adm_ctx.resource = NULL; + if (adm_ctx->connection) { + kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection); + adm_ctx->connection = NULL; + } + if (adm_ctx->resource) { + kref_put(&adm_ctx->resource->kref, drbd_destroy_resource); + adm_ctx->resource = NULL; } - if (!adm_ctx.reply_skb) + if (!adm_ctx->reply_skb) return -ENOMEM; - adm_ctx.reply_dh->ret_code = retcode; - drbd_adm_send_reply(adm_ctx.reply_skb, info); + adm_ctx->reply_dh->ret_code = retcode; + drbd_adm_send_reply(adm_ctx->reply_skb, info); return 0; } @@ -426,6 +422,14 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec } rcu_read_unlock(); + if (fp == FP_NOT_AVAIL) { + /* IO Suspending works on the whole resource. + Do it only for one device. 
*/ + vnr = 0; + peer_device = idr_get_next(&connection->peer_devices, &vnr); + drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0)); + } + return fp; } @@ -438,12 +442,13 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) char *ex_to_string; int r; + spin_lock_irq(&connection->resource->req_lock); if (connection->cstate >= C_WF_REPORT_PARAMS) { drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n"); + spin_unlock_irq(&connection->resource->req_lock); return false; } - spin_lock_irq(&connection->resource->req_lock); connect_cnt = connection->connect_cnt; spin_unlock_irq(&connection->resource->req_lock); @@ -654,11 +659,11 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) put_ldev(device); } } else { - mutex_lock(&device->resource->conf_update); + /* Called from drbd_adm_set_role only. + * We are still holding the conf_update mutex. */ nc = first_peer_device(device)->connection->net_conf; if (nc) nc->discard_my_data = 0; /* without copy; single bit op is atomic */ - mutex_unlock(&device->resource->conf_update); set_disk_ro(device->vdisk, false); if (get_ldev(device)) { @@ -700,11 +705,12 @@ static const char *from_attrs_err_to_txt(int err) int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct set_role_parms parms; int err; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -715,17 +721,22 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) err = set_role_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + genl_unlock(); + mutex_lock(&adm_ctx.resource->adm_mutex); if (info->genlhdr->cmd 
== DRBD_ADM_PRIMARY) retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); else retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); + + mutex_unlock(&adm_ctx.resource->adm_mutex); + genl_lock(); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -1104,15 +1115,18 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_ struct request_queue * const q = device->rq_queue; unsigned int max_hw_sectors = max_bio_size >> 9; unsigned int max_segments = 0; + struct request_queue *b = NULL; if (get_ldev_if_state(device, D_ATTACHING)) { - struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; + b = device->ldev->backing_bdev->bd_disk->queue; max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); rcu_read_lock(); max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; rcu_read_unlock(); - put_ldev(device); + + blk_set_stacking_limits(&q->limits); + blk_queue_max_write_same_sectors(q, 0); } blk_queue_logical_block_size(q, 512); @@ -1121,8 +1135,25 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); - if (get_ldev_if_state(device, D_ATTACHING)) { - struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; + if (b) { + struct drbd_connection *connection = first_peer_device(device)->connection; + + if (blk_queue_discard(b) && + (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { + /* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ + q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS; + + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + /* REALLY? Is stacking secdiscard "legal"? 
*/ + if (blk_queue_secdiscard(b)) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); + } else { + q->limits.max_discard_sectors = 0; + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); + } blk_queue_stack_limits(q, b); @@ -1164,8 +1195,14 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device) peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ else peer = DRBD_MAX_BIO_SIZE; - } + /* We may later detach and re-attach on a disconnected Primary. + * Avoid this setting to jump back in that case. + * We want to store what we know the peer DRBD can handle, + * not what the peer IO backend can handle. */ + if (peer > device->peer_max_bio_size) + device->peer_max_bio_size = peer; + } new = min(local, peer); if (device->state.role == R_PRIMARY && new < now) @@ -1258,19 +1295,21 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct drbd_device *device; struct disk_conf *new_disk_conf, *old_disk_conf; struct fifo_buffer *old_plan = NULL, *new_plan = NULL; int err, fifo_size; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); /* we also need a disk * to change the options on */ @@ -1294,7 +1333,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs_for_change(new_disk_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail_unlock; } @@ -1385,12 +1424,15 @@ fail_unlock: success: put_ldev(device); out: - 
drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; int err; enum drbd_ret_code retcode; @@ -1406,13 +1448,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) enum drbd_state_rv rv; struct net_conf *nc; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto finish; device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); conn_reconfig_start(first_peer_device(device)->connection); /* if you want to reconfigure, please tear down first */ @@ -1455,7 +1498,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs(new_disk_conf, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -1619,7 +1662,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } if (device->state.conn < C_CONNECTED && - device->state.role == R_PRIMARY && + device->state.role == R_PRIMARY && device->ed_uuid && (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { drbd_err(device, "Can only attach to data with current UUID=%016llX\n", (unsigned long long)device->ed_uuid); @@ -1797,7 +1840,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); put_ldev(device); conn_reconfig_done(first_peer_device(device)->connection); - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; force_diskless_dec: @@ -1819,9 +1863,9 @@ int drbd_adm_attach(struct sk_buff 
*skb, struct genl_info *info) kfree(new_disk_conf); lc_destroy(resync_lru); kfree(new_plan); - + mutex_unlock(&adm_ctx.resource->adm_mutex); finish: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -1860,11 +1904,12 @@ out: * Only then we have finally detached. */ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct detach_parms parms = { }; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -1874,14 +1919,16 @@ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) err = detach_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = adm_detach(adm_ctx.device, parms.force_detach); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2055,6 +2102,7 @@ static void free_crypto(struct crypto *crypto) int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct drbd_connection *connection; struct net_conf *old_net_conf, *new_net_conf = NULL; @@ -2063,13 +2111,14 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) int rsr; /* re-sync running */ struct crypto crypto = { }; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; connection = adm_ctx.connection; + mutex_lock(&adm_ctx.resource->adm_mutex); new_net_conf 
= kzalloc(sizeof(struct net_conf), GFP_KERNEL); if (!new_net_conf) { @@ -2084,7 +2133,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) old_net_conf = connection->net_conf; if (!old_net_conf) { - drbd_msg_put_info("net conf missing, try connect"); + drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect"); retcode = ERR_INVALID_REQUEST; goto fail; } @@ -2096,7 +2145,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs_for_change(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2167,12 +2216,15 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) done: conn_reconfig_done(connection); out: - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_peer_device *peer_device; struct net_conf *old_net_conf, *new_net_conf = NULL; struct crypto crypto = { }; @@ -2182,14 +2234,14 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) int i; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { - drbd_msg_put_info("connection endpoint(s) missing"); + drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing"); retcode = ERR_INVALID_REQUEST; goto out; } @@ -2215,6 +2267,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) } } + mutex_lock(&adm_ctx.resource->adm_mutex); connection = first_connection(adm_ctx.resource); conn_reconfig_start(connection); @@ -2235,7 
+2288,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2284,7 +2337,8 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); conn_reconfig_done(connection); - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; fail: @@ -2292,8 +2346,9 @@ fail: kfree(new_net_conf); conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2356,13 +2411,14 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct disconnect_parms parms; struct drbd_connection *connection; enum drbd_state_rv rv; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2374,18 +2430,20 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) err = disconnect_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } } + mutex_lock(&adm_ctx.resource->adm_mutex); rv = conn_try_disconnect(connection, parms.force_disconnect); if (rv < SS_SUCCESS) retcode = rv; /* FIXME: Type mismatch. 
*/ else retcode = NO_ERROR; + mutex_unlock(&adm_ctx.resource->adm_mutex); fail: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2407,6 +2465,7 @@ void resync_after_online_grow(struct drbd_device *device) int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct disk_conf *old_disk_conf, *new_disk_conf = NULL; struct resize_parms rs; struct drbd_device *device; @@ -2417,12 +2476,13 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) sector_t u_size; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto fail; + goto finish; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (!get_ldev(device)) { retcode = ERR_NO_DISK; @@ -2436,7 +2496,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) err = resize_parms_from_attrs(&rs, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail_ldev; } } @@ -2482,7 +2542,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) goto fail_ldev; } - if (device->state.conn != C_CONNECTED) { + if (device->state.conn != C_CONNECTED && !rs.resize_force) { retcode = ERR_MD_LAYOUT_CONNECTED; goto fail_ldev; } @@ -2528,7 +2588,9 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) } fail: - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; fail_ldev: @@ -2538,11 +2600,12 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; 
int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2555,33 +2618,37 @@ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } + mutex_lock(&adm_ctx.resource->adm_mutex); err = set_resource_options(adm_ctx.resource, &res_opts); if (err) { retcode = ERR_INVALID_REQUEST; if (err == -ENOMEM) retcode = ERR_NOMEM; } + mutex_unlock(&adm_ctx.resource->adm_mutex); fail: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. 
enum drbd_state_rv */ - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; /* If there is still bitmap IO pending, probably because of a previous @@ -2605,26 +2672,29 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, union drbd_state mask, union drbd_state val) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = drbd_request_state(adm_ctx.device, mask, val); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2639,15 +2709,17 @@ static int drbd_bmio_set_susp_al(struct drbd_device *device) int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; int retcode; /* drbd_ret_code, drbd_state_rv */ struct drbd_device *device; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; /* If there is still bitmap IO pending, probably because of a previous @@ -2674,40 +2746,45 
@@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) retcode = ERR_PAUSE_IS_SET; + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; union drbd_dev_state s; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { s = adm_ctx.device->state; if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { @@ -2717,9 +2794,9 @@ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) retcode = ERR_PAUSE_IS_CLEAR; } } - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2730,15 +2807,17 @@ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context 
adm_ctx; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (test_bit(NEW_CUR_UUID, &device->flags)) { drbd_uuid_new_current(device); @@ -2753,9 +2832,9 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO); } drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2931,10 +3010,11 @@ nla_put_failure: int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2946,7 +3026,7 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) return err; } out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -3133,11 +3213,12 @@ dump: int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct timeout_parms tp; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3154,17 +3235,18 @@ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) return err; } out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int 
drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; enum drbd_ret_code retcode; struct start_ov_parms parms; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3179,10 +3261,12 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) int err = start_ov_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + mutex_lock(&adm_ctx.resource->adm_mutex); + /* w_make_ov_request expects position to be aligned */ device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); device->ov_stop_sector = parms.ov_stop_sector; @@ -3193,21 +3277,24 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); retcode = drbd_request_state(device, NS(conn, C_VERIFY_S)); drbd_resume_io(device); + + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; enum drbd_ret_code retcode; int skip_initial_sync = 0; int err; struct new_c_uuid_parms args; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3219,11 +3306,12 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) err = new_c_uuid_parms_from_attrs(&args, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + 
drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out_nolock; } } + mutex_lock(&adm_ctx.resource->adm_mutex); mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */ if (!get_ldev(device)) { @@ -3268,22 +3356,24 @@ out_dec: put_ldev(device); out: mutex_unlock(device->state_mutex); + mutex_unlock(&adm_ctx.resource->adm_mutex); out_nolock: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } static enum drbd_ret_code -drbd_check_resource_name(const char *name) +drbd_check_resource_name(struct drbd_config_context *adm_ctx) { + const char *name = adm_ctx->resource_name; if (!name || !name[0]) { - drbd_msg_put_info("resource name missing"); + drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing"); return ERR_MANDATORY_TAG; } /* if we want to use these in sysfs/configfs/debugfs some day, * we must not allow slashes */ if (strchr(name, '/')) { - drbd_msg_put_info("invalid resource name"); + drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name"); return ERR_INVALID_REQUEST; } return NO_ERROR; @@ -3291,11 +3381,12 @@ drbd_check_resource_name(const char *name) int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; int err; - retcode = drbd_adm_prepare(skb, info, 0); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3305,48 +3396,50 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } - retcode = drbd_check_resource_name(adm_ctx.resource_name); + retcode = drbd_check_resource_name(&adm_ctx); if (retcode != NO_ERROR) goto out; if (adm_ctx.resource) 
{ if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { retcode = ERR_INVALID_REQUEST; - drbd_msg_put_info("resource exists"); + drbd_msg_put_info(adm_ctx.reply_skb, "resource exists"); } /* else: still NO_ERROR */ goto out; } + /* not yet safe for genl_family.parallel_ops */ if (!conn_create(adm_ctx.resource_name, &res_opts)) retcode = ERR_NOMEM; out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_genlmsghdr *dh = info->userhdr; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; if (dh->minor > MINORMASK) { - drbd_msg_put_info("requested minor out of range"); + drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range"); retcode = ERR_INVALID_REQUEST; goto out; } if (adm_ctx.volume > DRBD_VOLUME_MAX) { - drbd_msg_put_info("requested volume id out of range"); + drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range"); retcode = ERR_INVALID_REQUEST; goto out; } @@ -3360,9 +3453,11 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) goto out; } - retcode = drbd_create_device(adm_ctx.resource, dh->minor, adm_ctx.volume); + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = drbd_create_device(&adm_ctx, dh->minor); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -3383,35 +3478,40 @@ static enum drbd_ret_code adm_del_minor(struct drbd_device *device) int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, 
info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = adm_del_minor(adm_ctx.device); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_resource *resource; struct drbd_connection *connection; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ unsigned i; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; resource = adm_ctx.resource; + mutex_lock(&resource->adm_mutex); /* demote */ for_each_connection(connection, resource) { struct drbd_peer_device *peer_device; @@ -3419,14 +3519,14 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&connection->peer_devices, peer_device, i) { retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info("failed to demote"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote"); goto out; } } retcode = conn_try_disconnect(connection, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info("failed to disconnect"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect"); goto out; } } @@ -3435,7 +3535,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&resource->devices, device, i) { retcode = adm_detach(device, 0); if (retcode < SS_SUCCESS || retcode > NO_ERROR) { - drbd_msg_put_info("failed to detach"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach"); goto out; } } @@ -3453,7 +3553,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) retcode = adm_del_minor(device); 
if (retcode != NO_ERROR) { /* "can not happen" */ - drbd_msg_put_info("failed to delete volume"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume"); goto out; } } @@ -3462,25 +3562,28 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) synchronize_rcu(); drbd_free_resource(resource); retcode = NO_ERROR; - out: - drbd_adm_finish(info, retcode); + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_resource *resource; struct drbd_connection *connection; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; resource = adm_ctx.resource; + mutex_lock(&resource->adm_mutex); for_each_connection(connection, resource) { if (connection->cstate > C_STANDALONE) { retcode = ERR_NET_CONFIGURED; @@ -3499,7 +3602,9 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) drbd_free_resource(resource); retcode = NO_ERROR; out: - drbd_adm_finish(info, retcode); + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c index fa672b6df8d6..b2d4791498a6 100644 --- a/drivers/block/drbd/drbd_nla.c +++ b/drivers/block/drbd/drbd_nla.c @@ -1,4 +1,3 @@ -#include "drbd_wrappers.h" #include <linux/kernel.h> #include <net/netlink.h> #include <linux/drbd_genl_api.h> diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 2f26e8ffa45b..89736bdbbc70 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -116,7 +116,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se /* 
------------------------ ~18s average ------------------------ */ i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS; dt = (jiffies - device->rs_mark_time[i]) / HZ; - if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS)) + if (dt > 180) stalled = 1; if (!dt) diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index 3c04ec0ea333..2da9104a3851 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -54,6 +54,11 @@ enum drbd_packet { P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ + /* 0x2e to 0x30 reserved, used in drbd 9 */ + + /* REQ_DISCARD. We used "discard" in different contexts before, + * which is why I chose TRIM here, to disambiguate. */ + P_TRIM = 0x31, P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAX_OPT_CMD = 0x101, @@ -119,6 +124,11 @@ struct p_data { u32 dp_flags; } __packed; +struct p_trim { + struct p_data p_data; + u32 size; /* == bio->bi_size */ +} __packed; + /* * commands which share a struct: * p_block_ack: @@ -150,6 +160,8 @@ struct p_block_req { * ReportParams */ +#define FF_TRIM 1 + struct p_connection_features { u32 protocol_min; u32 feature_flags; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 68e3992e8838..b6c8aaf4931b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -46,9 +46,10 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" - #include "drbd_vli.h" +#define PRO_FEATURES (FF_TRIM) + struct packet_info { enum drbd_packet cmd; unsigned int size; @@ -65,7 +66,7 @@ enum finish_epoch { static int drbd_do_features(struct drbd_connection *connection); static int drbd_do_auth(struct drbd_connection *connection); static int drbd_disconnected(struct drbd_peer_device *); - 
+static void conn_wait_active_ee_empty(struct drbd_connection *connection); static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event); static int e_end_block(struct drbd_work *, int); @@ -234,9 +235,17 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) * @retry: whether to retry, if not enough pages are available right now * * Tries to allocate number pages, first from our own page pool, then from - * the kernel, unless this allocation would exceed the max_buffers setting. + * the kernel. * Possibly retry until DRBD frees sufficient pages somewhere else. * + * If this allocation would exceed the max_buffers setting, we throttle + * allocation (schedule_timeout) to give the system some room to breathe. + * + * We do not use max-buffers as hard limit, because it could lead to + * congestion and further to a distributed deadlock during online-verify or + * (checksum based) resync, if the max-buffers, socket buffer sizes and + * resync-rate settings are mis-configured. + * * Returns a page chain linked via page->private. */ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number, @@ -246,10 +255,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int struct page *page = NULL; struct net_conf *nc; DEFINE_WAIT(wait); - int mxb; + unsigned int mxb; - /* Yes, we may run up to @number over max_buffers. If we - * follow it strictly, the admin will get it wrong anyways. */ rcu_read_lock(); nc = rcu_dereference(peer_device->connection->net_conf); mxb = nc ? 
nc->max_buffers : 1000000; @@ -277,7 +284,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int break; } - schedule(); + if (schedule_timeout(HZ/10) == 0) + mxb = UINT_MAX; } finish_wait(&drbd_pp_wait, &wait); @@ -331,7 +339,7 @@ You must not have the req_lock: struct drbd_peer_request * drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, - unsigned int data_size, gfp_t gfp_mask) __must_hold(local) + unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local) { struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; @@ -348,7 +356,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto return NULL; } - if (data_size) { + if (has_payload && data_size) { page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); if (!page) goto fail; @@ -1026,24 +1034,27 @@ randomize: if (drbd_send_protocol(connection) == -EOPNOTSUPP) return -1; + /* Prevent a race between resync-handshake and + * being promoted to Primary. + * + * Grab and release the state mutex, so we know that any current + * drbd_set_role() is finished, and any incoming drbd_set_role + * will see the STATE_SENT flag, and wait for it to be cleared. + */ + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_lock(peer_device->device->state_mutex); + set_bit(STATE_SENT, &connection->flags); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_unlock(peer_device->device->state_mutex); + rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; kref_get(&device->kref); rcu_read_unlock(); - /* Prevent a race between resync-handshake and - * being promoted to Primary. 
- * - * Grab and release the state mutex, so we know that any current - * drbd_set_role() is finished, and any incoming drbd_set_role - * will see the STATE_SENT flag, and wait for it to be cleared. - */ - mutex_lock(device->state_mutex); - mutex_unlock(device->state_mutex); - if (discard_my_data) set_bit(DISCARD_MY_DATA, &device->flags); else @@ -1315,6 +1326,20 @@ int drbd_submit_peer_request(struct drbd_device *device, unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; int err = -ENOMEM; + if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { + /* wait for all pending IO completions, before we start + * zeroing things out. */ + conn_wait_active_ee_empty(first_peer_device(device)->connection); + if (blkdev_issue_zeroout(device->ldev->backing_bdev, + sector, ds >> 9, GFP_NOIO)) + peer_req->flags |= EE_WAS_ERROR; + drbd_endio_write_sec_final(peer_req); + return 0; + } + + if (peer_req->flags & EE_IS_TRIM) + nr_pages = 0; /* discards don't have any payload. */ + /* In most cases, we will only need one bio. 
But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the @@ -1326,7 +1351,7 @@ int drbd_submit_peer_request(struct drbd_device *device, next_bio: bio = bio_alloc(GFP_NOIO, nr_pages); if (!bio) { - drbd_err(device, "submit_ee: Allocation of a bio failed\n"); + drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); goto fail; } /* > peer_req->i.sector, unless this is the first bio */ @@ -1340,6 +1365,11 @@ next_bio: bios = bio; ++n_bios; + if (rw & REQ_DISCARD) { + bio->bi_iter.bi_size = ds; + goto submit; + } + page_chain_for_each(page) { unsigned len = min_t(unsigned, ds, PAGE_SIZE); if (!bio_add_page(bio, page, len, 0)) { @@ -1360,8 +1390,9 @@ next_bio: sector += len >> 9; --nr_pages; } - D_ASSERT(device, page == NULL); D_ASSERT(device, ds == 0); +submit: + D_ASSERT(device, page == NULL); atomic_set(&peer_req->pending_bios, n_bios); do { @@ -1490,19 +1521,21 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf * and from receive_Data */ static struct drbd_peer_request * read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, - int data_size) __must_hold(local) + struct packet_info *pi) __must_hold(local) { struct drbd_device *device = peer_device->device; const sector_t capacity = drbd_get_capacity(device->this_bdev); struct drbd_peer_request *peer_req; struct page *page; int dgs, ds, err; + int data_size = pi->size; void *dig_in = peer_device->connection->int_dig_in; void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; + struct p_trim *trim = (pi->cmd == P_TRIM) ? 
pi->data : NULL; dgs = 0; - if (peer_device->connection->peer_integrity_tfm) { + if (!trim && peer_device->connection->peer_integrity_tfm) { dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); /* * FIXME: Receive the incoming digest into the receive buffer @@ -1514,9 +1547,15 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, data_size -= dgs; } + if (trim) { + D_ASSERT(peer_device, data_size == 0); + data_size = be32_to_cpu(trim->size); + } + if (!expect(IS_ALIGNED(data_size, 512))) return NULL; - if (!expect(data_size <= DRBD_MAX_BIO_SIZE)) + /* prepare for larger trim requests. */ + if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) return NULL; /* even though we trust out peer, @@ -1532,11 +1571,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. 
*/ - peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, GFP_NOIO); + peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO); if (!peer_req) return NULL; - if (!data_size) + if (trim) return peer_req; ds = data_size; @@ -1676,12 +1715,12 @@ static int e_end_resync_block(struct drbd_work *w, int unused) } static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector, - int data_size) __releases(local) + struct packet_info *pi) __releases(local) { struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; - peer_req = read_in_block(peer_device, ID_SYNCER, sector, data_size); + peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi); if (!peer_req) goto fail; @@ -1697,7 +1736,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto list_add(&peer_req->w.list, &device->sync_ee); spin_unlock_irq(&device->resource->req_lock); - atomic_add(data_size >> 9, &device->rs_sect_ev); + atomic_add(pi->size >> 9, &device->rs_sect_ev); if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) return 0; @@ -1785,7 +1824,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet /* data is submitted to disk within recv_resync_read. * corresponding put_ldev done below on error, * or in drbd_peer_request_endio. 
*/ - err = recv_resync_read(peer_device, sector, pi->size); + err = recv_resync_read(peer_device, sector, pi); } else { if (__ratelimit(&drbd_ratelimit_state)) drbd_err(device, "Can not write resync data to local disk.\n"); @@ -2196,7 +2235,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * */ sector = be64_to_cpu(p->sector); - peer_req = read_in_block(peer_device, p->block_id, sector, pi->size); + peer_req = read_in_block(peer_device, p->block_id, sector, pi); if (!peer_req) { put_ldev(device); return -EIO; @@ -2206,7 +2245,15 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * dp_flags = be32_to_cpu(p->dp_flags); rw |= wire_flags_to_bio(dp_flags); - if (peer_req->pages == NULL) { + if (pi->cmd == P_TRIM) { + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + peer_req->flags |= EE_IS_TRIM; + if (!blk_queue_discard(q)) + peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT; + D_ASSERT(peer_device, peer_req->i.size > 0); + D_ASSERT(peer_device, rw & REQ_DISCARD); + D_ASSERT(peer_device, peer_req->pages == NULL); + } else if (peer_req->pages == NULL) { D_ASSERT(device, peer_req->i.size == 0); D_ASSERT(device, dp_flags & DP_FLUSH); } @@ -2242,7 +2289,12 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, peer_seq); spin_lock_irq(&device->resource->req_lock); } - list_add(&peer_req->w.list, &device->active_ee); + /* if we use the zeroout fallback code, we process synchronously + * and we wait for all pending requests, respectively wait for + * active_ee to become empty in drbd_submit_peer_request(); + * better not add ourselves here. 
*/ + if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) + list_add(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); if (device->state.conn == C_SYNC_TARGET) @@ -2313,39 +2365,45 @@ out_interrupted: * The current sync rate used here uses only the most recent two step marks, * to have a short time average so we can react faster. */ -int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) +bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) { - struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; - unsigned long db, dt, dbdt; struct lc_element *tmp; - int curr_events; - int throttle = 0; - unsigned int c_min_rate; - - rcu_read_lock(); - c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; - rcu_read_unlock(); + bool throttle = true; - /* feature disabled? */ - if (c_min_rate == 0) - return 0; + if (!drbd_rs_c_min_rate_throttle(device)) + return false; spin_lock_irq(&device->al_lock); tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); if (tmp) { struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); - if (test_bit(BME_PRIORITY, &bm_ext->flags)) { - spin_unlock_irq(&device->al_lock); - return 0; - } + if (test_bit(BME_PRIORITY, &bm_ext->flags)) + throttle = false; /* Do not slow down if app IO is already waiting for this extent */ } spin_unlock_irq(&device->al_lock); + return throttle; +} + +bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) +{ + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; + unsigned long db, dt, dbdt; + unsigned int c_min_rate; + int curr_events; + + rcu_read_lock(); + c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; + rcu_read_unlock(); + + /* feature disabled? 
*/ + if (c_min_rate == 0) + return false; + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + (int)part_stat_read(&disk->part0, sectors[1]) - atomic_read(&device->rs_sect_ev); - if (!device->rs_last_events || curr_events - device->rs_last_events > 64) { unsigned long rs_left; int i; @@ -2368,12 +2426,11 @@ int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) dbdt = Bit2KB(db/dt); if (dbdt > c_min_rate) - throttle = 1; + return true; } - return throttle; + return false; } - static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi) { struct drbd_peer_device *peer_device; @@ -2436,7 +2493,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, GFP_NOIO); + peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, + true /* has real payload */, GFP_NOIO); if (!peer_req) { put_ldev(device); return -ENOMEM; @@ -3648,6 +3706,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info put_ldev(device); } + device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); + drbd_reconsider_max_bio_size(device); + /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). + In case we cleared the QUEUE_FLAG_DISCARD from our queue in + drbd_reconsider_max_bio_size(), we can be sure that after + drbd_determine_dev_size() no REQ_DISCARDs are in the queue. 
*/ + ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(device)) { dd = drbd_determine_dev_size(device, ddsf, NULL); @@ -3660,9 +3725,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info drbd_set_my_capacity(device, p_size); } - device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); - drbd_reconsider_max_bio_size(device); - if (get_ldev(device)) { if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) { device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); @@ -4423,6 +4485,7 @@ static struct data_cmd drbd_cmd_handler[] = { [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, }; static void drbdd(struct drbd_connection *connection) @@ -4630,6 +4693,7 @@ static int drbd_send_features(struct drbd_connection *connection) memset(p, 0, sizeof(*p)); p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); + p->feature_flags = cpu_to_be32(PRO_FEATURES); return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); } @@ -4683,10 +4747,14 @@ static int drbd_do_features(struct drbd_connection *connection) goto incompat; connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); + connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags); drbd_info(connection, "Handshake successful: " "Agreed network protocol version %d\n", connection->agreed_pro_version); + drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n", + connection->agreed_features & FF_TRIM ? 
" " : " not "); + return 1; incompat: @@ -4778,6 +4846,12 @@ static int drbd_do_auth(struct drbd_connection *connection) goto fail; } + if (pi.size < CHALLENGE_LEN) { + drbd_err(connection, "AuthChallenge payload too small.\n"); + rv = -1; + goto fail; + } + peers_ch = kmalloc(pi.size, GFP_NOIO); if (peers_ch == NULL) { drbd_err(connection, "kmalloc of peers_ch failed\n"); @@ -4791,6 +4865,12 @@ static int drbd_do_auth(struct drbd_connection *connection) goto fail; } + if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) { + drbd_err(connection, "Peer presented the same challenge!\n"); + rv = -1; + goto fail; + } + resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm); response = kmalloc(resp_size, GFP_NOIO); if (response == NULL) { diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3779c8d2875b..09803d0d5207 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -522,6 +522,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); break; + case DISCARD_COMPLETED_NOTSUPP: + case DISCARD_COMPLETED_WITH_ERROR: + /* I'd rather not detach from local disk just because it + * failed a REQ_DISCARD. */ + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + case QUEUE_FOR_NET_READ: /* READ or READA, and * no local disk, @@ -1235,6 +1242,7 @@ void do_submit(struct work_struct *ws) if (list_empty(&incoming)) break; +skip_fast_path: wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending)); /* Maybe more was queued, while we prepared the transaction? * Try to stuff them into this transaction as well. @@ -1273,6 +1281,25 @@ void do_submit(struct work_struct *ws) list_del_init(&req->tl_requests); drbd_send_and_submit(device, req); } + + /* If all currently hot activity log extents are kept busy by + * incoming requests, we still must not totally starve new + * requests to cold extents. 
In that case, prepare one request + * in blocking mode. */ + list_for_each_entry_safe(req, tmp, &incoming, tl_requests) { + list_del_init(&req->tl_requests); + req->rq_state |= RQ_IN_ACT_LOG; + if (!drbd_al_begin_io_prepare(device, &req->i)) { + /* Corresponding extent was hot after all? */ + drbd_send_and_submit(device, req); + } else { + /* Found a request to a cold extent. + * Put on "pending" list, + * and try to cumulate with more. */ + list_add(&req->tl_requests, &pending); + goto skip_fast_path; + } + } } } @@ -1326,23 +1353,35 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct return limit; } -static struct drbd_request *find_oldest_request(struct drbd_connection *connection) +static void find_oldest_requests( + struct drbd_connection *connection, + struct drbd_device *device, + struct drbd_request **oldest_req_waiting_for_peer, + struct drbd_request **oldest_req_waiting_for_disk) { - /* Walk the transfer log, - * and find the oldest not yet completed request */ struct drbd_request *r; + *oldest_req_waiting_for_peer = NULL; + *oldest_req_waiting_for_disk = NULL; list_for_each_entry(r, &connection->transfer_log, tl_requests) { - if (atomic_read(&r->completion_ref)) - return r; + const unsigned s = r->rq_state; + if (!*oldest_req_waiting_for_peer + && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) + *oldest_req_waiting_for_peer = r; + + if (!*oldest_req_waiting_for_disk + && (s & RQ_LOCAL_PENDING) && r->device == device) + *oldest_req_waiting_for_disk = r; + + if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk) + break; } - return NULL; } void request_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; struct drbd_connection *connection = first_peer_device(device)->connection; - struct drbd_request *req; /* oldest request */ + struct drbd_request *req_disk, *req_peer; /* oldest request */ struct net_conf *nc; unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * 
timeout */ unsigned long now; @@ -1366,8 +1405,8 @@ void request_timer_fn(unsigned long data) now = jiffies; spin_lock_irq(&device->resource->req_lock); - req = find_oldest_request(connection); - if (!req) { + find_oldest_requests(connection, device, &req_peer, &req_disk); + if (req_peer == NULL && req_disk == NULL) { spin_unlock_irq(&device->resource->req_lock); mod_timer(&device->request_timer, now + et); return; @@ -1389,19 +1428,26 @@ void request_timer_fn(unsigned long data) * ~198 days with 250 HZ, we have a window where the timeout would need * to expire twice (worst case) to become effective. Good enough. */ - if (ent && req->rq_state & RQ_NET_PENDING && - time_after(now, req->start_time + ent) && + if (ent && req_peer && + time_after(now, req_peer->start_time + ent) && !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); } - if (dt && req->rq_state & RQ_LOCAL_PENDING && req->device == device && - time_after(now, req->start_time + dt) && + if (dt && req_disk && + time_after(now, req_disk->start_time + dt) && !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); __drbd_chk_io_error(device, DRBD_FORCE_DETACH); } - nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; + + /* Reschedule timer for the nearest not already expired timeout. + * Fallback to now + min(effective network timeout, disk timeout). */ + ent = (ent && req_peer && time_before(now, req_peer->start_time + ent)) + ? req_peer->start_time + ent : now + et; + dt = (dt && req_disk && time_before(now, req_disk->start_time + dt)) + ? req_disk->start_time + dt : now + et; + nt = time_before(ent, dt) ? 
ent : dt; spin_unlock_irq(&connection->resource->req_lock); mod_timer(&device->request_timer, nt); } diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index c684c963538e..8566cd5866b4 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -30,7 +30,6 @@ #include <linux/slab.h> #include <linux/drbd.h> #include "drbd_int.h" -#include "drbd_wrappers.h" /* The request callbacks will be called in irq context by the IDE drivers, and in Softirqs/Tasklets/BH context by the SCSI drivers, @@ -111,11 +110,14 @@ enum drbd_req_event { BARRIER_ACKED, /* in protocol A and B */ DATA_RECEIVED, /* (remote read) */ + COMPLETED_OK, READ_COMPLETED_WITH_ERROR, READ_AHEAD_COMPLETED_WITH_ERROR, WRITE_COMPLETED_WITH_ERROR, + DISCARD_COMPLETED_NOTSUPP, + DISCARD_COMPLETED_WITH_ERROR, + ABORT_DISK_IO, - COMPLETED_OK, RESEND, FAIL_FROZEN_DISK_IO, RESTART_FROZEN_DISK_IO, diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 1a84345a3868..a5d8aae00e04 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -54,8 +54,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); -static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns, - enum sanitize_state_warnings *warn); +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn); static inline bool is_susp(union drbd_state s) { @@ -287,7 +287,7 @@ _req_st_cond(struct drbd_device *device, union drbd_state mask, spin_lock_irqsave(&device->resource->req_lock, flags); os = drbd_read_state(device); - ns = 
sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); rv = is_valid_transition(os, ns); if (rv >= SS_SUCCESS) rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ @@ -333,7 +333,7 @@ drbd_req_state(struct drbd_device *device, union drbd_state mask, spin_lock_irqsave(&device->resource->req_lock, flags); os = drbd_read_state(device); - ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); rv = is_valid_transition(os, ns); if (rv < SS_SUCCESS) { spin_unlock_irqrestore(&device->resource->req_lock, flags); @@ -740,8 +740,8 @@ static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_st * When we loose connection, we have to set the state of the peers disk (pdsk) * to D_UNKNOWN. This rule and many more along those lines are in this function. */ -static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns, - enum sanitize_state_warnings *warn) +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn) { enum drbd_fencing_p fp; enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; @@ -882,11 +882,13 @@ static union drbd_state sanitize_state(struct drbd_device *device, union drbd_st } if (fp == FP_STONITH && - (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED)) + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && + !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO && - (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && + !(os.role == R_PRIMARY && os.disk < 
D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { @@ -958,7 +960,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, os = drbd_read_state(device); - ns = sanitize_state(device, ns, &ssw); + ns = sanitize_state(device, os, ns, &ssw); if (ns.i == os.i) return SS_NOTHING_TO_DO; @@ -1656,7 +1658,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; os = drbd_read_state(device); - ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; @@ -1718,7 +1720,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union number_of_volumes++; os = drbd_read_state(device); ns = apply_mask_val(os, mask, val); - ns = sanitize_state(device, ns, NULL); + ns = sanitize_state(device, os, ns, NULL); if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; @@ -1763,19 +1765,19 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union static enum drbd_state_rv _conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) { - enum drbd_state_rv rv; + enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */; if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags)) - return SS_CW_SUCCESS; + rv = SS_CW_SUCCESS; if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags)) - return SS_CW_FAILED_BY_PEER; + rv = SS_CW_FAILED_BY_PEER; - rv = conn_is_valid_transition(connection, mask, val, 0); - if (rv == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) - rv = SS_UNKNOWN_ERROR; /* 
continue waiting */ + err = conn_is_valid_transition(connection, mask, val, 0); + if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) + return rv; - return rv; + return err; } enum drbd_state_rv diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 2c4ce42c3657..d8f57b6305cd 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -118,7 +118,7 @@ static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __rele /* writes on behalf of the partner, or resync writes, * "submitted" by the receiver, final stage. */ -static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) +void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) { unsigned long flags = 0; struct drbd_peer_device *peer_device = peer_req->peer_device; @@ -150,7 +150,9 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee); - if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) + /* FIXME do we want to detach for failed REQ_DISCARD? + * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ + if (peer_req->flags & EE_WAS_ERROR) __drbd_chk_io_error(device, DRBD_WRITE_ERROR); spin_unlock_irqrestore(&device->resource->req_lock, flags); @@ -176,10 +178,12 @@ void drbd_peer_request_endio(struct bio *bio, int error) struct drbd_device *device = peer_req->peer_device->device; int uptodate = bio_flagged(bio, BIO_UPTODATE); int is_write = bio_data_dir(bio) == WRITE; + int is_discard = !!(bio->bi_rw & REQ_DISCARD); if (error && __ratelimit(&drbd_ratelimit_state)) drbd_warn(device, "%s: error=%d s=%llus\n", - is_write ? "write" : "read", error, + is_write ? (is_discard ? 
"discard" : "write") + : "read", error, (unsigned long long)peer_req->i.sector); if (!error && !uptodate) { if (__ratelimit(&drbd_ratelimit_state)) @@ -263,7 +267,12 @@ void drbd_request_endio(struct bio *bio, int error) /* to avoid recursion in __req_mod */ if (unlikely(error)) { - what = (bio_data_dir(bio) == WRITE) + if (bio->bi_rw & REQ_DISCARD) + what = (error == -EOPNOTSUPP) + ? DISCARD_COMPLETED_NOTSUPP + : DISCARD_COMPLETED_WITH_ERROR; + else + what = (bio_data_dir(bio) == WRITE) ? WRITE_COMPLETED_WITH_ERROR : (bio_rw(bio) == READ) ? READ_COMPLETED_WITH_ERROR @@ -395,7 +404,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, - size, GFP_TRY); + size, true /* has real payload */, GFP_TRY); if (!peer_req) goto defer; @@ -492,10 +501,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size) return fb; } -static int drbd_rs_controller(struct drbd_device *device) +static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) { struct disk_conf *dc; - unsigned int sect_in; /* Number of sectors that came in since the last turn */ unsigned int want; /* The number of sectors we want in the proxy */ int req_sect; /* Number of sectors to request in this turn */ int correction; /* Number of sectors more we need in the proxy*/ @@ -505,9 +513,6 @@ static int drbd_rs_controller(struct drbd_device *device) int max_sect; struct fifo_buffer *plan; - sect_in = atomic_xchg(&device->rs_sect_in, 0); /* Number of sectors that came in */ - device->rs_in_flight -= sect_in; - dc = rcu_dereference(device->ldev->disk_conf); plan = rcu_dereference(device->rs_plan_s); @@ -550,11 +555,16 @@ static int drbd_rs_controller(struct drbd_device *device) static int drbd_rs_number_requests(struct drbd_device *device) { - int number; + 
unsigned int sect_in; /* Number of sectors that came in since the last turn */ + int number, mxb; + + sect_in = atomic_xchg(&device->rs_sect_in, 0); + device->rs_in_flight -= sect_in; rcu_read_lock(); + mxb = drbd_get_max_buffers(device) / 2; if (rcu_dereference(device->rs_plan_s)->size) { - number = drbd_rs_controller(device) >> (BM_BLOCK_SHIFT - 9); + number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9); device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; } else { device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate; @@ -562,8 +572,14 @@ static int drbd_rs_number_requests(struct drbd_device *device) } rcu_read_unlock(); - /* ignore the amount of pending requests, the resync controller should - * throttle down to incoming reply rate soon enough anyways. */ + /* Don't have more than "max-buffers"/2 in-flight. + * Otherwise we may cause the remote site to stall on drbd_alloc_pages(), + * potentially causing a distributed deadlock on congestion during + * online-verify or (checksum-based) resync, if max-buffers, + * socket buffer sizes and resync rate settings are mis-configured. 
*/ + if (mxb - device->rs_in_flight < number) + number = mxb - device->rs_in_flight; + return number; } @@ -597,7 +613,7 @@ static int make_resync_request(struct drbd_device *device, int cancel) max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; number = drbd_rs_number_requests(device); - if (number == 0) + if (number <= 0) goto requeue; for (i = 0; i < number; i++) { @@ -647,7 +663,7 @@ next_sector: */ align = 1; rollback_i = i; - for (;;) { + while (i < number) { if (size + BM_BLOCK_SIZE > max_bio_size) break; @@ -1670,11 +1686,15 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } clear_bit(B_RS_H_DONE, &device->flags); - write_lock_irq(&global_state_lock); + /* req_lock: serialize with drbd_send_and_submit() and others + * global_state_lock: for stable sync-after dependencies */ + spin_lock_irq(&device->resource->req_lock); + write_lock(&global_state_lock); /* Did some connection breakage or IO error race with us? */ if (device->state.conn < C_CONNECTED || !get_ldev_if_state(device, D_NEGOTIATING)) { - write_unlock_irq(&global_state_lock); + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); mutex_unlock(device->state_mutex); return; } @@ -1714,7 +1734,8 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } _drbd_pause_after(device); } - write_unlock_irq(&global_state_lock); + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); if (r == SS_SUCCESS) { /* reset rs_last_bcast when a resync or verify is started, @@ -1778,34 +1799,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) mutex_unlock(device->state_mutex); } -/* If the resource already closed the current epoch, but we did not - * (because we have not yet seen new requests), we should send the - * corresponding barrier now. Must be checked within the same spinlock - * that is used to check for new requests. 
*/ -static bool need_to_send_barrier(struct drbd_connection *connection) -{ - if (!connection->send.seen_any_write_yet) - return false; - - /* Skip barriers that do not contain any writes. - * This may happen during AHEAD mode. */ - if (!connection->send.current_epoch_writes) - return false; - - /* ->req_lock is held when requests are queued on - * connection->sender_work, and put into ->transfer_log. - * It is also held when ->current_tle_nr is increased. - * So either there are already new requests queued, - * and corresponding barriers will be send there. - * Or nothing new is queued yet, so the difference will be 1. - */ - if (atomic_read(&connection->current_tle_nr) != - connection->send.current_epoch_nr + 1) - return false; - - return true; -} - static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) { spin_lock_irq(&queue->q_lock); @@ -1864,12 +1857,22 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * spin_unlock_irq(&connection->resource->req_lock); break; } - send_barrier = need_to_send_barrier(connection); + + /* We found nothing new to do, no to-be-communicated request, + * no other work item. We may still need to close the last + * epoch. Next incoming request epoch will be connection -> + * current transfer log epoch number. If that is different + * from the epoch of the last request we communicated, it is + * safe to send the epoch separating barrier now. + */ + send_barrier = + atomic_read(&connection->current_tle_nr) != + connection->send.current_epoch_nr; spin_unlock_irq(&connection->resource->req_lock); - if (send_barrier) { - drbd_send_barrier(connection); - connection->send.current_epoch_nr++; - } + + if (send_barrier) + maybe_send_barrier(connection, + connection->send.current_epoch_nr + 1); schedule(); /* may be woken up for other things but new work, too, * e.g. if the current epoch got closed. 
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h deleted file mode 100644 index 3db9ebaf64f6..000000000000 --- a/drivers/block/drbd/drbd_wrappers.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef _DRBD_WRAPPERS_H -#define _DRBD_WRAPPERS_H - -#include <linux/ctype.h> -#include <linux/mm.h> -#include "drbd_int.h" - -/* see get_sb_bdev and bd_claim */ -extern char *drbd_sec_holder; - -/* sets the number of 512 byte sectors of our virtual device */ -static inline void drbd_set_my_capacity(struct drbd_device *device, - sector_t size) -{ - /* set_capacity(device->this_bdev->bd_disk, size); */ - set_capacity(device->vdisk, size); - device->this_bdev->bd_inode->i_size = (loff_t)size << 9; -} - -#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) - -/* bi_end_io handlers */ -extern void drbd_md_io_complete(struct bio *bio, int error); -extern void drbd_peer_request_endio(struct bio *bio, int error); -extern void drbd_request_endio(struct bio *bio, int error); - -/* - * used to submit our private bio - */ -static inline void drbd_generic_make_request(struct drbd_device *device, - int fault_type, struct bio *bio) -{ - __release(local); - if (!bio->bi_bdev) { - printk(KERN_ERR "drbd%d: drbd_generic_make_request: " - "bio->bi_bdev == NULL\n", - device_to_minor(device)); - dump_stack(); - bio_endio(bio, -ENODEV); - return; - } - - if (drbd_insert_fault(device, fault_type)) - bio_endio(bio, -EIO); - else - generic_make_request(bio); -} - -#ifndef __CHECKER__ -# undef __cond_lock -# define __cond_lock(x,c) (c) -#endif - -#endif diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 8f5565bf34cd..8e767bb7995e 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2351,7 +2351,7 @@ static void rw_interrupt(void) } if (CT(COMMAND) != FD_READ || - raw_cmd->kernel_data == current_req->buffer) { + raw_cmd->kernel_data == bio_data(current_req->bio)) { /* transfer directly from buffer */ cont->done(1); } else if (CT(COMMAND) 
== FD_READ) { @@ -2640,7 +2640,7 @@ static int make_raw_rw_request(void) raw_cmd->flags &= ~FD_RAW_WRITE; raw_cmd->flags |= FD_RAW_READ; COMMAND = FM_MODE(_floppy, FD_READ); - } else if ((unsigned long)current_req->buffer < MAX_DMA_ADDRESS) { + } else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) { unsigned long dma_limit; int direct, indirect; @@ -2654,13 +2654,13 @@ static int make_raw_rw_request(void) */ max_size = buffer_chain_size(); dma_limit = (MAX_DMA_ADDRESS - - ((unsigned long)current_req->buffer)) >> 9; + ((unsigned long)bio_data(current_req->bio))) >> 9; if ((unsigned long)max_size > dma_limit) max_size = dma_limit; /* 64 kb boundaries */ - if (CROSS_64KB(current_req->buffer, max_size << 9)) + if (CROSS_64KB(bio_data(current_req->bio), max_size << 9)) max_size = (K_64 - - ((unsigned long)current_req->buffer) % + ((unsigned long)bio_data(current_req->bio)) % K_64) >> 9; direct = transfer_size(ssize, max_sector, max_size) - fsector_t; /* @@ -2677,7 +2677,7 @@ static int make_raw_rw_request(void) (DP->read_track & (1 << DRS->probed_format)))))) { max_size = blk_rq_sectors(current_req); } else { - raw_cmd->kernel_data = current_req->buffer; + raw_cmd->kernel_data = bio_data(current_req->bio); raw_cmd->length = current_count_sectors << 9; if (raw_cmd->length == 0) { DPRINT("%s: zero dma transfer attempted\n", __func__); @@ -2731,7 +2731,7 @@ static int make_raw_rw_request(void) raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1; raw_cmd->length <<= 9; if ((raw_cmd->length < current_count_sectors << 9) || - (raw_cmd->kernel_data != current_req->buffer && + (raw_cmd->kernel_data != bio_data(current_req->bio) && CT(COMMAND) == FD_WRITE && (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max || aligned_sector_t < buffer_min)) || @@ -2739,7 +2739,7 @@ static int make_raw_rw_request(void) raw_cmd->length <= 0 || current_count_sectors <= 0) { DPRINT("fractionary current count b=%lx s=%lx\n", raw_cmd->length, current_count_sectors); - 
if (raw_cmd->kernel_data != current_req->buffer) + if (raw_cmd->kernel_data != bio_data(current_req->bio)) pr_info("addr=%d, length=%ld\n", (int)((raw_cmd->kernel_data - floppy_track_buffer) >> 9), @@ -2756,7 +2756,7 @@ static int make_raw_rw_request(void) return 0; } - if (raw_cmd->kernel_data != current_req->buffer) { + if (raw_cmd->kernel_data != bio_data(current_req->bio)) { if (raw_cmd->kernel_data < floppy_track_buffer || current_count_sectors < 0 || raw_cmd->length < 0 || @@ -3809,7 +3809,7 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive) bio.bi_iter.bi_size = size; bio.bi_bdev = bdev; bio.bi_iter.bi_sector = 0; - bio.bi_flags = (1 << BIO_QUIET); + bio.bi_flags |= (1 << BIO_QUIET); bio.bi_private = &cbdata; bio.bi_end_io = floppy_rb0_cb; diff --git a/drivers/block/hd.c b/drivers/block/hd.c index bf397bf108b7..8a290c08262f 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -464,11 +464,11 @@ static void read_intr(void) ok_to_read: req = hd_req; - insw(HD_DATA, req->buffer, 256); + insw(HD_DATA, bio_data(req->bio), 256); #ifdef DEBUG printk("%s: read: sector %ld, remaining = %u, buffer=%p\n", req->rq_disk->disk_name, blk_rq_pos(req) + 1, - blk_rq_sectors(req) - 1, req->buffer+512); + blk_rq_sectors(req) - 1, bio_data(req->bio)+512); #endif if (hd_end_request(0, 512)) { SET_HANDLER(&read_intr); @@ -505,7 +505,7 @@ static void write_intr(void) ok_to_write: if (hd_end_request(0, 512)) { SET_HANDLER(&write_intr); - outsw(HD_DATA, req->buffer, 256); + outsw(HD_DATA, bio_data(req->bio), 256); return; } @@ -624,7 +624,7 @@ repeat: printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n", req->rq_disk->disk_name, req_data_dir(req) == READ ? 
"read" : "writ", - cyl, head, sec, nsect, req->buffer); + cyl, head, sec, nsect, bio_data(req->bio)); #endif if (req->cmd_type == REQ_TYPE_FS) { switch (rq_data_dir(req)) { @@ -643,7 +643,7 @@ repeat: bad_rw_intr(); goto repeat; } - outsw(HD_DATA, req->buffer, 256); + outsw(HD_DATA, bio_data(req->bio), 256); break; default: printk("unknown hd-command\n"); diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index eb59b1241366..e352cac707e8 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -479,7 +479,7 @@ static unsigned int mg_out(struct mg_host *host, static void mg_read_one(struct mg_host *host, struct request *req) { - u16 *buff = (u16 *)req->buffer; + u16 *buff = (u16 *)bio_data(req->bio); u32 i; for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) @@ -496,7 +496,7 @@ static void mg_read(struct request *req) mg_bad_rw_intr(host); MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - blk_rq_sectors(req), blk_rq_pos(req), req->buffer); + blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio)); do { if (mg_wait(host, ATA_DRQ, @@ -514,7 +514,7 @@ static void mg_read(struct request *req) static void mg_write_one(struct mg_host *host, struct request *req) { - u16 *buff = (u16 *)req->buffer; + u16 *buff = (u16 *)bio_data(req->bio); u32 i; for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) @@ -534,7 +534,7 @@ static void mg_write(struct request *req) } MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - rem, blk_rq_pos(req), req->buffer); + rem, blk_rq_pos(req), bio_data(req->bio)); if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { @@ -585,7 +585,7 @@ ok_to_read: mg_read_one(host, req); MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer); + blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio)); /* send read confirm */ outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); @@ -624,7 +624,7 @@ ok_to_write: /* write 1 sector and set handler if remains */ 
mg_write_one(host, req); MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - blk_rq_pos(req), blk_rq_sectors(req), req->buffer); + blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio)); host->mg_do_intr = mg_write_intr; mod_timer(&host->timer, jiffies + 3 * HZ); } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 59c5abe32f06..74abd49fabdc 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -31,6 +31,7 @@ #include <linux/module.h> #include <linux/genhd.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/bio.h> #include <linux/dma-mapping.h> #include <linux/idr.h> @@ -173,60 +174,36 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev) return false; /* device present */ } -/* - * Obtain an empty command slot. - * - * This function needs to be reentrant since it could be called - * at the same time on multiple CPUs. The allocation of the - * command slot must be atomic. - * - * @port Pointer to the port data structure. - * - * return value - * >= 0 Index of command slot obtained. - * -1 No command slots available. - */ -static int get_slot(struct mtip_port *port) +static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { - int slot, i; - unsigned int num_command_slots = port->dd->slot_groups * 32; + struct request *rq; - /* - * Try 10 times, because there is a small race here. - * that's ok, because it's still cheaper than a lock. - * - * Race: Since this section is not protected by lock, same bit - * could be chosen by different process contexts running in - * different processor. So instead of costly lock, we are going - * with loop. 
- */ - for (i = 0; i < 10; i++) { - slot = find_next_zero_bit(port->allocated, - num_command_slots, 1); - if ((slot < num_command_slots) && - (!test_and_set_bit(slot, port->allocated))) - return slot; - } - dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n"); + rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true); + return blk_mq_rq_to_pdu(rq); +} - mtip_check_surprise_removal(port->dd->pdev); - return -1; +static void mtip_put_int_command(struct driver_data *dd, struct mtip_cmd *cmd) +{ + blk_put_request(blk_mq_rq_from_pdu(cmd)); } /* - * Release a command slot. - * - * @port Pointer to the port data structure. - * @tag Tag of command to release - * - * return value - * None + * Once we add support for one hctx per mtip group, this will change a bit */ -static inline void release_slot(struct mtip_port *port, int tag) +static struct request *mtip_rq_from_tag(struct driver_data *dd, + unsigned int tag) +{ + struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0]; + + return blk_mq_tag_to_rq(hctx->tags, tag); +} + +static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd, + unsigned int tag) { - smp_mb__before_clear_bit(); - clear_bit(tag, port->allocated); - smp_mb__after_clear_bit(); + struct request *rq = mtip_rq_from_tag(dd, tag); + + return blk_mq_rq_to_pdu(rq); } /* @@ -248,93 +225,28 @@ static inline void release_slot(struct mtip_port *port, int tag) * None */ static void mtip_async_complete(struct mtip_port *port, - int tag, - void *data, - int status) + int tag, struct mtip_cmd *cmd, int status) { - struct mtip_cmd *cmd; - struct driver_data *dd = data; - int unaligned, cb_status = status ? 
-EIO : 0; - void (*func)(void *, int); + struct driver_data *dd = port->dd; + struct request *rq; if (unlikely(!dd) || unlikely(!port)) return; - cmd = &port->commands[tag]; - if (unlikely(status == PORT_IRQ_TF_ERR)) { dev_warn(&port->dd->pdev->dev, "Command tag %d failed due to TFE\n", tag); } - /* Clear the active flag */ - atomic_set(&port->commands[tag].active, 0); - - /* Upper layer callback */ - func = cmd->async_callback; - if (likely(func && cmpxchg(&cmd->async_callback, func, 0) == func)) { + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents, cmd->direction); - /* Unmap the DMA scatter list entries */ - dma_unmap_sg(&dd->pdev->dev, - cmd->sg, - cmd->scatter_ents, - cmd->direction); + rq = mtip_rq_from_tag(dd, tag); - func(cmd->async_data, cb_status); - unaligned = cmd->unaligned; + if (unlikely(cmd->unaligned)) + up(&port->cmd_slot_unal); - /* Clear the allocated bit for the command */ - release_slot(port, tag); - - if (unlikely(unaligned)) - up(&port->cmd_slot_unal); - else - up(&port->cmd_slot); - } -} - -/* - * This function is called for clean the pending command in the - * command slot during the surprise removal of device and return - * error to the upper layer. - * - * @dd Pointer to the DRIVER_DATA structure. 
- * - * return value - * None - */ -static void mtip_command_cleanup(struct driver_data *dd) -{ - int tag = 0; - struct mtip_cmd *cmd; - struct mtip_port *port = dd->port; - unsigned int num_cmd_slots = dd->slot_groups * 32; - - if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) - return; - - if (!port) - return; - - cmd = &port->commands[MTIP_TAG_INTERNAL]; - if (atomic_read(&cmd->active)) - if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & - (1 << MTIP_TAG_INTERNAL)) - if (cmd->comp_func) - cmd->comp_func(port, MTIP_TAG_INTERNAL, - cmd->comp_data, -ENODEV); - - while (1) { - tag = find_next_bit(port->allocated, num_cmd_slots, tag); - if (tag >= num_cmd_slots) - break; - - cmd = &port->commands[tag]; - if (atomic_read(&cmd->active)) - mtip_async_complete(port, tag, dd, -ENODEV); - } - - set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); + blk_mq_end_io(rq, status ? -EIO : 0); } /* @@ -388,8 +300,6 @@ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) { int group = tag >> 5; - atomic_set(&port->commands[tag].active, 1); - /* guard SACT and CI registers */ spin_lock(&port->cmd_issue_lock[group]); writel((1 << MTIP_TAG_BIT(tag)), @@ -397,10 +307,6 @@ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) writel((1 << MTIP_TAG_BIT(tag)), port->cmd_issue[MTIP_TAG_INDEX(tag)]); spin_unlock(&port->cmd_issue_lock[group]); - - /* Set the command's timeout value.*/ - port->commands[tag].comp_time = jiffies + msecs_to_jiffies( - MTIP_NCQ_COMMAND_TIMEOUT_MS); } /* @@ -648,132 +554,13 @@ static void print_tags(struct driver_data *dd, memset(tagmap, 0, sizeof(tagmap)); for (group = SLOTBITS_IN_LONGS; group > 0; group--) - tagmap_len = sprintf(tagmap + tagmap_len, "%016lX ", + tagmap_len += sprintf(tagmap + tagmap_len, "%016lX ", tagbits[group-1]); dev_warn(&dd->pdev->dev, "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap); } /* - * Called periodically to see if any read/write commands are - * taking too long to complete. 
- * - * @data Pointer to the PORT data structure. - * - * return value - * None - */ -static void mtip_timeout_function(unsigned long int data) -{ - struct mtip_port *port = (struct mtip_port *) data; - struct host_to_dev_fis *fis; - struct mtip_cmd *cmd; - int unaligned, tag, cmdto_cnt = 0; - unsigned int bit, group; - unsigned int num_command_slots; - unsigned long to, tagaccum[SLOTBITS_IN_LONGS]; - void (*func)(void *, int); - - if (unlikely(!port)) - return; - - if (unlikely(port->dd->sr)) - return; - - if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) { - mod_timer(&port->cmd_timer, - jiffies + msecs_to_jiffies(30000)); - return; - } - /* clear the tag accumulator */ - memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); - num_command_slots = port->dd->slot_groups * 32; - - for (tag = 0; tag < num_command_slots; tag++) { - /* - * Skip internal command slot as it has - * its own timeout mechanism - */ - if (tag == MTIP_TAG_INTERNAL) - continue; - - if (atomic_read(&port->commands[tag].active) && - (time_after(jiffies, port->commands[tag].comp_time))) { - group = tag >> 5; - bit = tag & 0x1F; - - cmd = &port->commands[tag]; - fis = (struct host_to_dev_fis *) cmd->command; - - set_bit(tag, tagaccum); - cmdto_cnt++; - if (cmdto_cnt == 1) - set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - - /* - * Clear the completed bit. This should prevent - * any interrupt handlers from trying to retire - * the command. - */ - writel(1 << bit, port->completed[group]); - - /* Clear the active flag for the command */ - atomic_set(&port->commands[tag].active, 0); - - func = cmd->async_callback; - if (func && - cmpxchg(&cmd->async_callback, func, 0) == func) { - - /* Unmap the DMA scatter list entries */ - dma_unmap_sg(&port->dd->pdev->dev, - cmd->sg, - cmd->scatter_ents, - cmd->direction); - - func(cmd->async_data, -EIO); - unaligned = cmd->unaligned; - - /* Clear the allocated bit for the command. 
*/ - release_slot(port, tag); - - if (unaligned) - up(&port->cmd_slot_unal); - else - up(&port->cmd_slot); - } - } - } - - if (cmdto_cnt) { - print_tags(port->dd, "timed out", tagaccum, cmdto_cnt); - if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { - mtip_device_reset(port->dd); - wake_up_interruptible(&port->svc_wait); - } - clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - } - - if (port->ic_pause_timer) { - to = port->ic_pause_timer + msecs_to_jiffies(1000); - if (time_after(jiffies, to)) { - if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { - port->ic_pause_timer = 0; - clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); - clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); - clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); - wake_up_interruptible(&port->svc_wait); - } - - - } - } - - /* Restart the timer */ - mod_timer(&port->cmd_timer, - jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); -} - -/* * Internal command completion callback function. * * This function is normally called by the driver ISR when an internal @@ -789,28 +576,19 @@ static void mtip_timeout_function(unsigned long int data) * None */ static void mtip_completion(struct mtip_port *port, - int tag, - void *data, - int status) + int tag, struct mtip_cmd *command, int status) { - struct mtip_cmd *command = &port->commands[tag]; - struct completion *waiting = data; + struct completion *waiting = command->comp_data; if (unlikely(status == PORT_IRQ_TF_ERR)) dev_warn(&port->dd->pdev->dev, "Internal command %d completed with TFE\n", tag); - command->async_callback = NULL; - command->comp_func = NULL; - complete(waiting); } static void mtip_null_completion(struct mtip_port *port, - int tag, - void *data, - int status) + int tag, struct mtip_cmd *command, int status) { - return; } static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, @@ -842,19 +620,16 @@ static void mtip_handle_tfe(struct driver_data *dd) port = dd->port; - /* Stop the timer to prevent command timeouts. 
*/ - del_timer(&port->cmd_timer); set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && test_bit(MTIP_TAG_INTERNAL, port->allocated)) { - cmd = &port->commands[MTIP_TAG_INTERNAL]; + cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); - atomic_inc(&cmd->active); /* active > 1 indicates error */ if (cmd->comp_data && cmd->comp_func) { cmd->comp_func(port, MTIP_TAG_INTERNAL, - cmd->comp_data, PORT_IRQ_TF_ERR); + cmd, PORT_IRQ_TF_ERR); } goto handle_tfe_exit; } @@ -866,6 +641,8 @@ static void mtip_handle_tfe(struct driver_data *dd) for (group = 0; group < dd->slot_groups; group++) { completed = readl(port->completed[group]); + dev_warn(&dd->pdev->dev, "g=%u, comp=%x\n", group, completed); + /* clear completed status register in the hardware.*/ writel(completed, port->completed[group]); @@ -879,15 +656,11 @@ static void mtip_handle_tfe(struct driver_data *dd) if (tag == MTIP_TAG_INTERNAL) continue; - cmd = &port->commands[tag]; + cmd = mtip_cmd_from_tag(dd, tag); if (likely(cmd->comp_func)) { set_bit(tag, tagaccum); cmd_cnt++; - atomic_set(&cmd->active, 0); - cmd->comp_func(port, - tag, - cmd->comp_data, - 0); + cmd->comp_func(port, tag, cmd, 0); } else { dev_err(&port->dd->pdev->dev, "Missing completion func for tag %d", @@ -947,11 +720,7 @@ static void mtip_handle_tfe(struct driver_data *dd) for (bit = 0; bit < 32; bit++) { reissue = 1; tag = (group << 5) + bit; - cmd = &port->commands[tag]; - - /* If the active bit is set re-issue the command */ - if (atomic_read(&cmd->active) == 0) - continue; + cmd = mtip_cmd_from_tag(dd, tag); fis = (struct host_to_dev_fis *)cmd->command; @@ -970,11 +739,9 @@ static void mtip_handle_tfe(struct driver_data *dd) tag, fail_reason != NULL ? 
fail_reason : "unknown"); - atomic_set(&cmd->active, 0); if (cmd->comp_func) { cmd->comp_func(port, tag, - cmd->comp_data, - -ENODATA); + cmd, -ENODATA); } continue; } @@ -997,14 +764,9 @@ static void mtip_handle_tfe(struct driver_data *dd) /* Retire a command that will not be reissued */ dev_warn(&port->dd->pdev->dev, "retiring tag %d\n", tag); - atomic_set(&cmd->active, 0); if (cmd->comp_func) - cmd->comp_func( - port, - tag, - cmd->comp_data, - PORT_IRQ_TF_ERR); + cmd->comp_func(port, tag, cmd, PORT_IRQ_TF_ERR); else dev_warn(&port->dd->pdev->dev, "Bad completion for tag %d\n", @@ -1017,9 +779,6 @@ handle_tfe_exit: /* clear eh_active */ clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); wake_up_interruptible(&port->svc_wait); - - mod_timer(&port->cmd_timer, - jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); } /* @@ -1048,15 +807,10 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, if (unlikely(tag == MTIP_TAG_INTERNAL)) continue; - command = &port->commands[tag]; - /* make internal callback */ - if (likely(command->comp_func)) { - command->comp_func( - port, - tag, - command->comp_data, - 0); - } else { + command = mtip_cmd_from_tag(dd, tag); + if (likely(command->comp_func)) + command->comp_func(port, tag, command, 0); + else { dev_dbg(&dd->pdev->dev, "Null completion for tag %d", tag); @@ -1081,16 +835,13 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) { struct mtip_port *port = dd->port; - struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL]; + struct mtip_cmd *cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & (1 << MTIP_TAG_INTERNAL))) { if (cmd->comp_func) { - cmd->comp_func(port, - MTIP_TAG_INTERNAL, - cmd->comp_data, - 0); + cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd, 0); return; } } @@ -1103,8 +854,6 @@ 
static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) */ static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) { - if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) - mtip_handle_tfe(dd); if (unlikely(port_stat & PORT_IRQ_CONNECT)) { dev_warn(&dd->pdev->dev, @@ -1122,6 +871,12 @@ static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) dev_warn(&dd->pdev->dev, "Port stat errors %x unhandled\n", (port_stat & ~PORT_IRQ_HANDLED)); + if (mtip_check_surprise_removal(dd->pdev)) + return; + } + if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) { + set_bit(MTIP_PF_EH_ACTIVE_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); } } @@ -1222,7 +977,6 @@ static irqreturn_t mtip_irq_handler(int irq, void *instance) static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) { - atomic_set(&port->commands[tag].active, 1); writel(1 << MTIP_TAG_BIT(tag), port->cmd_issue[MTIP_TAG_INDEX(tag)]); } @@ -1280,6 +1034,8 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) unsigned int n; unsigned int active = 1; + blk_mq_stop_hw_queues(port->dd->queue); + to = jiffies + msecs_to_jiffies(timeout); do { if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) && @@ -1287,8 +1043,13 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) msleep(20); continue; /* svc thd is actively issuing commands */ } + + msleep(100); + if (mtip_check_surprise_removal(port->dd->pdev)) + goto err_fault; if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) - return -EFAULT; + goto err_fault; + /* * Ignore s_active bit 0 of array element 0. * This bit will always be set @@ -1299,11 +1060,13 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) if (!active) break; - - msleep(20); } while (time_before(jiffies, to)); + blk_mq_start_stopped_hw_queues(port->dd->queue, true); return active ? 
-EBUSY : 0; +err_fault: + blk_mq_start_stopped_hw_queues(port->dd->queue, true); + return -EFAULT; } /* @@ -1335,10 +1098,9 @@ static int mtip_exec_internal_command(struct mtip_port *port, { struct mtip_cmd_sg *command_sg; DECLARE_COMPLETION_ONSTACK(wait); - int rv = 0, ready2go = 1; - struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL]; - unsigned long to; + struct mtip_cmd *int_cmd; struct driver_data *dd = port->dd; + int rv = 0; /* Make sure the buffer is 8 byte aligned. This is asic specific. */ if (buffer & 0x00000007) { @@ -1346,19 +1108,8 @@ static int mtip_exec_internal_command(struct mtip_port *port, return -EFAULT; } - to = jiffies + msecs_to_jiffies(timeout); - do { - ready2go = !test_and_set_bit(MTIP_TAG_INTERNAL, - port->allocated); - if (ready2go) - break; - mdelay(100); - } while (time_before(jiffies, to)); - if (!ready2go) { - dev_warn(&dd->pdev->dev, - "Internal cmd active. new cmd [%02X]\n", fis->command); - return -EBUSY; - } + int_cmd = mtip_get_int_command(dd); + set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); port->ic_pause_timer = 0; @@ -1368,10 +1119,11 @@ static int mtip_exec_internal_command(struct mtip_port *port, if (atomic == GFP_KERNEL) { if (fis->command != ATA_CMD_STANDBYNOW1) { /* wait for io to complete if non atomic */ - if (mtip_quiesce_io(port, 5000) < 0) { + if (mtip_quiesce_io(port, + MTIP_QUIESCE_IO_TIMEOUT_MS) < 0) { dev_warn(&dd->pdev->dev, "Failed to quiesce IO\n"); - release_slot(port, MTIP_TAG_INTERNAL); + mtip_put_int_command(dd, int_cmd); clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); wake_up_interruptible(&port->svc_wait); return -EBUSY; @@ -1416,9 +1168,9 @@ static int mtip_exec_internal_command(struct mtip_port *port, if (atomic == GFP_KERNEL) { /* Wait for the command to complete or timeout. 
*/ - if (wait_for_completion_interruptible_timeout( + if ((rv = wait_for_completion_interruptible_timeout( &wait, - msecs_to_jiffies(timeout)) <= 0) { + msecs_to_jiffies(timeout))) <= 0) { if (rv == -ERESTARTSYS) { /* interrupted */ dev_err(&dd->pdev->dev, "Internal command [%02X] was interrupted after %lu ms\n", @@ -1497,8 +1249,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, } exec_ic_exit: /* Clear the allocated and active bits for the internal command. */ - atomic_set(&int_cmd->active, 0); - release_slot(port, MTIP_TAG_INTERNAL); + mtip_put_int_command(dd, int_cmd); if (rv >= 0 && mtip_pause_ncq(port, fis)) { /* NCQ paused */ return rv; @@ -1529,6 +1280,37 @@ static inline void ata_swap_string(u16 *buf, unsigned int len) be16_to_cpus(&buf[i]); } +static void mtip_set_timeout(struct driver_data *dd, + struct host_to_dev_fis *fis, + unsigned int *timeout, u8 erasemode) +{ + switch (fis->command) { + case ATA_CMD_DOWNLOAD_MICRO: + *timeout = 120000; /* 2 minutes */ + break; + case ATA_CMD_SEC_ERASE_UNIT: + case 0xFC: + if (erasemode) + *timeout = ((*(dd->port->identify + 90) * 2) * 60000); + else + *timeout = ((*(dd->port->identify + 89) * 2) * 60000); + break; + case ATA_CMD_STANDBYNOW1: + *timeout = 120000; /* 2 minutes */ + break; + case 0xF7: + case 0xFA: + *timeout = 60000; /* 60 seconds */ + break; + case ATA_CMD_SMART: + *timeout = 15000; /* 15 seconds */ + break; + default: + *timeout = MTIP_IOCTL_CMD_TIMEOUT_MS; + break; + } +} + /* * Request the device identity information. * @@ -1576,7 +1358,7 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) sizeof(u16) * ATA_ID_WORDS, 0, GFP_KERNEL, - MTIP_INTERNAL_COMMAND_TIMEOUT_MS) + MTIP_INT_CMD_TIMEOUT_MS) < 0) { rv = -1; goto out; @@ -1644,6 +1426,7 @@ static int mtip_standby_immediate(struct mtip_port *port) int rv; struct host_to_dev_fis fis; unsigned long start; + unsigned int timeout; /* Build the FIS. 
*/ memset(&fis, 0, sizeof(struct host_to_dev_fis)); @@ -1651,6 +1434,8 @@ static int mtip_standby_immediate(struct mtip_port *port) fis.opts = 1 << 7; fis.command = ATA_CMD_STANDBYNOW1; + mtip_set_timeout(port->dd, &fis, &timeout, 0); + start = jiffies; rv = mtip_exec_internal_command(port, &fis, @@ -1659,7 +1444,7 @@ static int mtip_standby_immediate(struct mtip_port *port) 0, 0, GFP_ATOMIC, - 15000); + timeout); dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n", jiffies_to_msecs(jiffies - start)); if (rv) @@ -1705,7 +1490,7 @@ static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, sectors * ATA_SECT_SIZE, 0, GFP_ATOMIC, - MTIP_INTERNAL_COMMAND_TIMEOUT_MS); + MTIP_INT_CMD_TIMEOUT_MS); } /* @@ -1998,6 +1783,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) { struct host_to_dev_fis fis; struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); + unsigned int to; /* Build the FIS. */ memset(&fis, 0, sizeof(struct host_to_dev_fis)); @@ -2011,6 +1797,8 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) fis.cyl_hi = command[5]; fis.device = command[6] & ~0x10; /* Clear the dev bit*/ + mtip_set_timeout(port->dd, &fis, &to, 0); + dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n", __func__, command[0], @@ -2029,7 +1817,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) 0, 0, GFP_KERNEL, - MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) { + to) < 0) { return -1; } @@ -2069,6 +1857,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, u8 *buf = NULL; dma_addr_t dma_addr = 0; int rv = 0, xfer_sz = command[3]; + unsigned int to; if (xfer_sz) { if (!user_buffer) @@ -2100,6 +1889,8 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, fis.cyl_hi = 0xC2; } + mtip_set_timeout(port->dd, &fis, &to, 0); + if (xfer_sz) reply = (port->rxfis + RX_FIS_PIO_SETUP); else @@ -2122,7 +1913,7 @@ static int 
exec_drive_command(struct mtip_port *port, u8 *command, (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0), 0, GFP_KERNEL, - MTIP_IOCTL_COMMAND_TIMEOUT_MS) + to) < 0) { rv = -EFAULT; goto exit_drive_command; @@ -2202,36 +1993,6 @@ static unsigned int implicit_sector(unsigned char command, } return rv; } -static void mtip_set_timeout(struct driver_data *dd, - struct host_to_dev_fis *fis, - unsigned int *timeout, u8 erasemode) -{ - switch (fis->command) { - case ATA_CMD_DOWNLOAD_MICRO: - *timeout = 120000; /* 2 minutes */ - break; - case ATA_CMD_SEC_ERASE_UNIT: - case 0xFC: - if (erasemode) - *timeout = ((*(dd->port->identify + 90) * 2) * 60000); - else - *timeout = ((*(dd->port->identify + 89) * 2) * 60000); - break; - case ATA_CMD_STANDBYNOW1: - *timeout = 120000; /* 2 minutes */ - break; - case 0xF7: - case 0xFA: - *timeout = 60000; /* 60 seconds */ - break; - case ATA_CMD_SMART: - *timeout = 15000; /* 15 seconds */ - break; - default: - *timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS; - break; - } -} /* * Executes a taskfile @@ -2606,22 +2367,21 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * return value * None */ -static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, - int nsect, int nents, int tag, void *callback, - void *data, int dir, int unaligned) +static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, + struct mtip_cmd *command, int nents, + struct blk_mq_hw_ctx *hctx) { struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; - struct mtip_cmd *command = &port->commands[tag]; - int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - u64 start = sector; + int dma_dir = rq_data_dir(rq) == READ ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; + u64 start = blk_rq_pos(rq); + unsigned int nsect = blk_rq_sectors(rq); /* Map the scatter list for DMA access */ nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); command->scatter_ents = nents; - command->unaligned = unaligned; /* * The number of retries for this command before it is * reported as a failure to the upper layers. @@ -2632,8 +2392,10 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, fis = command->command; fis->type = 0x27; fis->opts = 1 << 7; - fis->command = - (dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE); + if (rq_data_dir(rq) == READ) + fis->command = ATA_CMD_FPDMA_READ; + else + fis->command = ATA_CMD_FPDMA_WRITE; fis->lba_low = start & 0xFF; fis->lba_mid = (start >> 8) & 0xFF; fis->lba_hi = (start >> 16) & 0xFF; @@ -2643,14 +2405,14 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, fis->device = 1 << 6; fis->features = nsect & 0xFF; fis->features_ex = (nsect >> 8) & 0xFF; - fis->sect_count = ((tag << 3) | (tag >> 5)); + fis->sect_count = ((rq->tag << 3) | (rq->tag >> 5)); fis->sect_cnt_ex = 0; fis->control = 0; fis->res2 = 0; fis->res3 = 0; fill_command_sg(dd, command, nents); - if (unaligned) + if (command->unaligned) fis->device |= 1 << 7; /* Populate the command header */ @@ -2668,81 +2430,17 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, command->direction = dma_dir; /* - * Set the completion function and data for the command passed - * from the upper layer. - */ - command->async_data = data; - command->async_callback = callback; - - /* * To prevent this command from being issued * if an internal command is in progress or error handling is active. 
*/ if (port->flags & MTIP_PF_PAUSE_IO) { - set_bit(tag, port->cmds_to_issue); + set_bit(rq->tag, port->cmds_to_issue); set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); return; } /* Issue the command to the hardware */ - mtip_issue_ncq_command(port, tag); - - return; -} - -/* - * Release a command slot. - * - * @dd Pointer to the driver data structure. - * @tag Slot tag - * - * return value - * None - */ -static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag, - int unaligned) -{ - struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : - &dd->port->cmd_slot; - release_slot(dd->port, tag); - up(sem); -} - -/* - * Obtain a command slot and return its associated scatter list. - * - * @dd Pointer to the driver data structure. - * @tag Pointer to an int that will receive the allocated command - * slot tag. - * - * return value - * Pointer to the scatter list for the allocated command slot - * or NULL if no command slots are available. - */ -static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, - int *tag, int unaligned) -{ - struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : - &dd->port->cmd_slot; - - /* - * It is possible that, even with this semaphore, a thread - * may think that no command slots are available. Therefore, we - * need to make an attempt to get_slot(). 
- */ - down(sem); - *tag = get_slot(dd->port); - - if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { - up(sem); - return NULL; - } - if (unlikely(*tag < 0)) { - up(sem); - return NULL; - } - - return dd->port->commands[*tag].sg; + mtip_issue_ncq_command(port, rq->tag); } /* @@ -3113,6 +2811,7 @@ static int mtip_free_orphan(struct driver_data *dd) if (dd->queue) { dd->queue->queuedata = NULL; blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); dd->queue = NULL; } } @@ -3270,6 +2969,11 @@ static int mtip_service_thread(void *data) int ret; while (1) { + if (kthread_should_stop() || + test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) + goto st_out; + clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + /* * the condition is to check neither an internal command is * is in progress nor error handling is active @@ -3277,11 +2981,12 @@ static int mtip_service_thread(void *data) wait_event_interruptible(port->svc_wait, (port->flags) && !(port->flags & MTIP_PF_PAUSE_IO)); - if (kthread_should_stop()) - goto st_out; - set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + if (kthread_should_stop() || + test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) + goto st_out; + /* If I am an orphan, start self cleanup */ if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags)) break; @@ -3290,6 +2995,16 @@ static int mtip_service_thread(void *data) &dd->dd_flag))) goto st_out; +restart_eh: + /* Demux bits: start with error handling */ + if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) { + mtip_handle_tfe(dd); + clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); + } + + if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) + goto restart_eh; + if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { slot = 1; /* used to restrict the loop to one iteration */ @@ -3319,16 +3034,14 @@ static int mtip_service_thread(void *data) } clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); - } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { + } + + if 
(test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { if (mtip_ftl_rebuild_poll(dd) < 0) set_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag); clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); } - clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); - - if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) - goto st_out; } /* wait for pci remove to exit */ @@ -3365,7 +3078,6 @@ st_out: */ static void mtip_dma_free(struct driver_data *dd) { - int i; struct mtip_port *port = dd->port; if (port->block1) @@ -3376,13 +3088,6 @@ static void mtip_dma_free(struct driver_data *dd) dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, port->command_list, port->command_list_dma); } - - for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { - if (port->commands[i].command) - dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, - port->commands[i].command, - port->commands[i].command_dma); - } } /* @@ -3396,8 +3101,6 @@ static void mtip_dma_free(struct driver_data *dd) static int mtip_dma_alloc(struct driver_data *dd) { struct mtip_port *port = dd->port; - int i, rv = 0; - u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ port->block1 = @@ -3430,41 +3133,63 @@ static int mtip_dma_alloc(struct driver_data *dd) port->smart_buf = port->block1 + AHCI_SMARTBUF_OFFSET; port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET; - /* Setup per command SGL DMA region */ - - /* Point the command headers at the command tables */ - for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { - port->commands[i].command = - dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, - &port->commands[i].command_dma, GFP_KERNEL); - if (!port->commands[i].command) { - rv = -ENOMEM; - mtip_dma_free(dd); - return rv; - } - memset(port->commands[i].command, 0, CMD_DMA_ALLOC_SZ); - - port->commands[i].command_header = port->command_list + - (sizeof(struct mtip_cmd_hdr) * i); - port->commands[i].command_header_dma = - dd->port->command_list_dma + - 
(sizeof(struct mtip_cmd_hdr) * i); + return 0; +} - if (host_cap_64) - port->commands[i].command_header->ctbau = - __force_bit2int cpu_to_le32( - (port->commands[i].command_dma >> 16) >> 16); +static int mtip_hw_get_identify(struct driver_data *dd) +{ + struct smart_attr attr242; + unsigned char *buf; + int rv; - port->commands[i].command_header->ctba = - __force_bit2int cpu_to_le32( - port->commands[i].command_dma & 0xFFFFFFFF); + if (mtip_get_identify(dd->port, NULL) < 0) + return -EFAULT; - sg_init_table(port->commands[i].sg, MTIP_MAX_SG); + if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == + MTIP_FTL_REBUILD_MAGIC) { + set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); + return MTIP_FTL_REBUILD_MAGIC; + } + mtip_dump_identify(dd->port); - /* Mark command as currently inactive */ - atomic_set(&dd->port->commands[i].active, 0); + /* check write protect, over temp and rebuild statuses */ + rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, + dd->port->log_buf, + dd->port->log_buf_dma, 1); + if (rv) { + dev_warn(&dd->pdev->dev, + "Error in READ LOG EXT (10h) command\n"); + /* non-critical error, don't fail the load */ + } else { + buf = (unsigned char *)dd->port->log_buf; + if (buf[259] & 0x1) { + dev_info(&dd->pdev->dev, + "Write protect bit is set.\n"); + set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); + } + if (buf[288] == 0xF7) { + dev_info(&dd->pdev->dev, + "Exceeded Tmax, drive in thermal shutdown.\n"); + set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); + } + if (buf[288] == 0xBF) { + dev_info(&dd->pdev->dev, + "Drive indicates rebuild has failed.\n"); + /* TODO */ + } } - return 0; + + /* get write protect progess */ + memset(&attr242, 0, sizeof(struct smart_attr)); + if (mtip_get_smart_attr(dd->port, 242, &attr242)) + dev_warn(&dd->pdev->dev, + "Unable to check write protect progress\n"); + else + dev_info(&dd->pdev->dev, + "Write protect progress: %u%% (%u blocks)\n", + attr242.cur, le32_to_cpu(attr242.data)); + + return rv; } /* @@ -3481,8 +3206,6 @@ 
static int mtip_hw_init(struct driver_data *dd) int rv; unsigned int num_command_slots; unsigned long timeout, timetaken; - unsigned char *buf; - struct smart_attr attr242; dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; @@ -3513,8 +3236,6 @@ static int mtip_hw_init(struct driver_data *dd) else dd->unal_qdepth = 0; - /* Counting semaphore to track command slot usage */ - sema_init(&dd->port->cmd_slot, num_command_slots - 1 - dd->unal_qdepth); sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth); /* Spinlock to prevent concurrent issue */ @@ -3599,73 +3320,16 @@ static int mtip_hw_init(struct driver_data *dd) writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, dd->mmio + HOST_CTL); - init_timer(&dd->port->cmd_timer); init_waitqueue_head(&dd->port->svc_wait); - dd->port->cmd_timer.data = (unsigned long int) dd->port; - dd->port->cmd_timer.function = mtip_timeout_function; - mod_timer(&dd->port->cmd_timer, - jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); - - if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { rv = -EFAULT; goto out3; } - if (mtip_get_identify(dd->port, NULL) < 0) { - rv = -EFAULT; - goto out3; - } - mtip_dump_identify(dd->port); - - if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == - MTIP_FTL_REBUILD_MAGIC) { - set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); - return MTIP_FTL_REBUILD_MAGIC; - } - - /* check write protect, over temp and rebuild statuses */ - rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, - dd->port->log_buf, - dd->port->log_buf_dma, 1); - if (rv) { - dev_warn(&dd->pdev->dev, - "Error in READ LOG EXT (10h) command\n"); - /* non-critical error, don't fail the load */ - } else { - buf = (unsigned char *)dd->port->log_buf; - if (buf[259] & 0x1) { - dev_info(&dd->pdev->dev, - "Write protect bit is set.\n"); - set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); - } - if (buf[288] == 0xF7) { - dev_info(&dd->pdev->dev, - "Exceeded Tmax, drive in thermal shutdown.\n"); - set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); - } 
- if (buf[288] == 0xBF) { - dev_info(&dd->pdev->dev, - "Drive is in security locked state.\n"); - set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); - } - } - - /* get write protect progess */ - memset(&attr242, 0, sizeof(struct smart_attr)); - if (mtip_get_smart_attr(dd->port, 242, &attr242)) - dev_warn(&dd->pdev->dev, - "Unable to check write protect progress\n"); - else - dev_info(&dd->pdev->dev, - "Write protect progress: %u%% (%u blocks)\n", - attr242.cur, le32_to_cpu(attr242.data)); return rv; out3: - del_timer_sync(&dd->port->cmd_timer); - /* Disable interrupts on the HBA. */ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, dd->mmio + HOST_CTL); @@ -3685,6 +3349,22 @@ out1: return rv; } +static void mtip_standby_drive(struct driver_data *dd) +{ + if (dd->sr) + return; + + /* + * Send standby immediate (E0h) to the drive so that it + * saves its state. + */ + if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) && + !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) + if (mtip_standby_immediate(dd->port)) + dev_warn(&dd->pdev->dev, + "STANDBY IMMEDIATE failed\n"); +} + /* * Called to deinitialize an interface. * @@ -3700,12 +3380,6 @@ static int mtip_hw_exit(struct driver_data *dd) * saves its state. */ if (!dd->sr) { - if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) && - !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) - if (mtip_standby_immediate(dd->port)) - dev_warn(&dd->pdev->dev, - "STANDBY IMMEDIATE failed\n"); - /* de-initialize the port. */ mtip_deinit_port(dd->port); @@ -3714,8 +3388,6 @@ static int mtip_hw_exit(struct driver_data *dd) dd->mmio + HOST_CTL); } - del_timer_sync(&dd->port->cmd_timer); - /* Release the IRQ. */ irq_set_affinity_hint(dd->pdev->irq, NULL); devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); @@ -4032,100 +3704,138 @@ static const struct block_device_operations mtip_block_ops = { * * @queue Pointer to the request queue. Unused other than to obtain * the driver data structure. - * @bio Pointer to the BIO. 
+ * @rq Pointer to the request. * */ -static void mtip_make_request(struct request_queue *queue, struct bio *bio) +static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct driver_data *dd = queue->queuedata; - struct scatterlist *sg; - struct bio_vec bvec; - struct bvec_iter iter; - int nents = 0; - int tag = 0, unaligned = 0; + struct driver_data *dd = hctx->queue->queuedata; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + unsigned int nents; if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { - bio_endio(bio, -ENXIO); - return; + return -ENXIO; } if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) { - bio_endio(bio, -ENODATA); - return; + return -ENODATA; } if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag) && - bio_data_dir(bio))) { - bio_endio(bio, -ENODATA); - return; - } - if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) { - bio_endio(bio, -ENODATA); - return; - } - if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) { - bio_endio(bio, -ENXIO); - return; + rq_data_dir(rq))) { + return -ENODATA; } + if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) + return -ENODATA; + if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) + return -ENXIO; } - if (unlikely(bio->bi_rw & REQ_DISCARD)) { - bio_endio(bio, mtip_send_trim(dd, bio->bi_iter.bi_sector, - bio_sectors(bio))); - return; - } + if (rq->cmd_flags & REQ_DISCARD) { + int err; - if (unlikely(!bio_has_data(bio))) { - blk_queue_flush(queue, 0); - bio_endio(bio, 0); - return; + err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); + blk_mq_end_io(rq, err); + return 0; } - if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 && - dd->unal_qdepth) { - if (bio->bi_iter.bi_sector % 8 != 0) - /* Unaligned on 4k boundaries */ - unaligned = 1; - else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */ - unaligned = 1; + /* Create the scatter list for 
this request. */ + nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg); + + /* Issue the read/write. */ + mtip_hw_submit_io(dd, rq, cmd, nents, hctx); + return 0; +} + +static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct driver_data *dd = hctx->queue->queuedata; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + + if (!dd->unal_qdepth || rq_data_dir(rq) == READ) + return false; + + /* + * If unaligned depth must be limited on this controller, mark it + * as unaligned if the IO isn't on a 4k boundary (start of length). + */ + if (blk_rq_sectors(rq) <= 64) { + if ((blk_rq_pos(rq) & 7) || (blk_rq_sectors(rq) & 7)) + cmd->unaligned = 1; } - sg = mtip_hw_get_scatterlist(dd, &tag, unaligned); - if (likely(sg != NULL)) { - blk_queue_bounce(queue, &bio); + if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal)) + return true; - if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) { - dev_warn(&dd->pdev->dev, - "Maximum number of SGL entries exceeded\n"); - bio_io_error(bio); - mtip_hw_release_scatterlist(dd, tag, unaligned); - return; - } + return false; +} - /* Create the scatter list for this bio. */ - bio_for_each_segment(bvec, bio, iter) { - sg_set_page(&sg[nents], - bvec.bv_page, - bvec.bv_len, - bvec.bv_offset); - nents++; - } +static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + int ret; - /* Issue the read/write. 
*/ - mtip_hw_submit_io(dd, - bio->bi_iter.bi_sector, - bio_sectors(bio), - nents, - tag, - bio_endio, - bio, - bio_data_dir(bio), - unaligned); - } else - bio_io_error(bio); + if (mtip_check_unal_depth(hctx, rq)) + return BLK_MQ_RQ_QUEUE_BUSY; + + ret = mtip_submit_request(hctx, rq); + if (!ret) + return BLK_MQ_RQ_QUEUE_OK; + + rq->errors = ret; + return BLK_MQ_RQ_QUEUE_ERROR; +} + +static void mtip_free_cmd(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx) +{ + struct driver_data *dd = data; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + + if (!cmd->command) + return; + + dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, + cmd->command, cmd->command_dma); +} + +static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx, + unsigned int request_idx, unsigned int numa_node) +{ + struct driver_data *dd = data; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; + + cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, + &cmd->command_dma, GFP_KERNEL); + if (!cmd->command) + return -ENOMEM; + + memset(cmd->command, 0, CMD_DMA_ALLOC_SZ); + + /* Point the command headers at the command tables. */ + cmd->command_header = dd->port->command_list + + (sizeof(struct mtip_cmd_hdr) * request_idx); + cmd->command_header_dma = dd->port->command_list_dma + + (sizeof(struct mtip_cmd_hdr) * request_idx); + + if (host_cap_64) + cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16); + + cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); + + sg_init_table(cmd->sg, MTIP_MAX_SG); + return 0; } +static struct blk_mq_ops mtip_mq_ops = { + .queue_rq = mtip_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = mtip_init_cmd, + .exit_request = mtip_free_cmd, +}; + /* * Block layer initialization function. 
* @@ -4148,11 +3858,7 @@ static int mtip_block_initialize(struct driver_data *dd) if (dd->disk) goto skip_create_disk; /* hw init done, before rebuild */ - /* Initialize the protocol layer. */ - wait_for_rebuild = mtip_hw_init(dd); - if (wait_for_rebuild < 0) { - dev_err(&dd->pdev->dev, - "Protocol layer initialization failed\n"); + if (mtip_hw_init(dd)) { rv = -EINVAL; goto protocol_init_error; } @@ -4194,29 +3900,53 @@ static int mtip_block_initialize(struct driver_data *dd) mtip_hw_debugfs_init(dd); - /* - * if rebuild pending, start the service thread, and delay the block - * queue creation and add_disk() - */ - if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) - goto start_service_thread; - skip_create_disk: - /* Allocate the request queue. */ - dd->queue = blk_alloc_queue_node(GFP_KERNEL, dd->numa_node); - if (dd->queue == NULL) { + memset(&dd->tags, 0, sizeof(dd->tags)); + dd->tags.ops = &mtip_mq_ops; + dd->tags.nr_hw_queues = 1; + dd->tags.queue_depth = MTIP_MAX_COMMAND_SLOTS; + dd->tags.reserved_tags = 1; + dd->tags.cmd_size = sizeof(struct mtip_cmd); + dd->tags.numa_node = dd->numa_node; + dd->tags.flags = BLK_MQ_F_SHOULD_MERGE; + dd->tags.driver_data = dd; + + rv = blk_mq_alloc_tag_set(&dd->tags); + if (rv) { dev_err(&dd->pdev->dev, "Unable to allocate request queue\n"); rv = -ENOMEM; goto block_queue_alloc_init_error; } - /* Attach our request function to the request queue. */ - blk_queue_make_request(dd->queue, mtip_make_request); + /* Allocate the request queue. */ + dd->queue = blk_mq_init_queue(&dd->tags); + if (IS_ERR(dd->queue)) { + dev_err(&dd->pdev->dev, + "Unable to allocate request queue\n"); + rv = -ENOMEM; + goto block_queue_alloc_init_error; + } dd->disk->queue = dd->queue; dd->queue->queuedata = dd; + /* Initialize the protocol layer. 
*/ + wait_for_rebuild = mtip_hw_get_identify(dd); + if (wait_for_rebuild < 0) { + dev_err(&dd->pdev->dev, + "Protocol layer initialization failed\n"); + rv = -EINVAL; + goto init_hw_cmds_error; + } + + /* + * if rebuild pending, start the service thread, and delay the block + * queue creation and add_disk() + */ + if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) + goto start_service_thread; + /* Set device limits. */ set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); blk_queue_max_segments(dd->queue, MTIP_MAX_SG); @@ -4295,8 +4025,9 @@ kthread_run_error: del_gendisk(dd->disk); read_capacity_error: +init_hw_cmds_error: blk_cleanup_queue(dd->queue); - + blk_mq_free_tag_set(&dd->tags); block_queue_alloc_init_error: mtip_hw_debugfs_exit(dd); disk_index_error: @@ -4345,6 +4076,9 @@ static int mtip_block_remove(struct driver_data *dd) kobject_put(kobj); } } + + mtip_standby_drive(dd); + /* * Delete our gendisk structure. This also removes the device * from /dev @@ -4357,6 +4091,7 @@ static int mtip_block_remove(struct driver_data *dd) if (dd->disk->queue) { del_gendisk(dd->disk); blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); dd->queue = NULL; } else put_disk(dd->disk); @@ -4391,6 +4126,8 @@ static int mtip_block_remove(struct driver_data *dd) */ static int mtip_block_shutdown(struct driver_data *dd) { + mtip_hw_shutdown(dd); + /* Delete our gendisk structure, and cleanup the blk queue. 
*/ if (dd->disk) { dev_info(&dd->pdev->dev, @@ -4399,6 +4136,7 @@ static int mtip_block_shutdown(struct driver_data *dd) if (dd->disk->queue) { del_gendisk(dd->disk); blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); } else put_disk(dd->disk); dd->disk = NULL; @@ -4408,8 +4146,6 @@ static int mtip_block_shutdown(struct driver_data *dd) spin_lock(&rssd_index_lock); ida_remove(&rssd_index_ida, dd->index); spin_unlock(&rssd_index_lock); - - mtip_hw_shutdown(dd); return 0; } @@ -4479,6 +4215,57 @@ static DEFINE_HANDLER(5); static DEFINE_HANDLER(6); static DEFINE_HANDLER(7); +static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev) +{ + int pos; + unsigned short pcie_dev_ctrl; + + pos = pci_find_capability(pdev, PCI_CAP_ID_EXP); + if (pos) { + pci_read_config_word(pdev, + pos + PCI_EXP_DEVCTL, + &pcie_dev_ctrl); + if (pcie_dev_ctrl & (1 << 11) || + pcie_dev_ctrl & (1 << 4)) { + dev_info(&dd->pdev->dev, + "Disabling ERO/No-Snoop on bridge device %04x:%04x\n", + pdev->vendor, pdev->device); + pcie_dev_ctrl &= ~(PCI_EXP_DEVCTL_NOSNOOP_EN | + PCI_EXP_DEVCTL_RELAX_EN); + pci_write_config_word(pdev, + pos + PCI_EXP_DEVCTL, + pcie_dev_ctrl); + } + } +} + +static void mtip_fix_ero_nosnoop(struct driver_data *dd, struct pci_dev *pdev) +{ + /* + * This workaround is specific to AMD/ATI chipset with a PCI upstream + * device with device id 0x5aXX + */ + if (pdev->bus && pdev->bus->self) { + if (pdev->bus->self->vendor == PCI_VENDOR_ID_ATI && + ((pdev->bus->self->device & 0xff00) == 0x5a00)) { + mtip_disable_link_opts(dd, pdev->bus->self); + } else { + /* Check further up the topology */ + struct pci_dev *parent_dev = pdev->bus->self; + if (parent_dev->bus && + parent_dev->bus->parent && + parent_dev->bus->parent->self && + parent_dev->bus->parent->self->vendor == + PCI_VENDOR_ID_ATI && + (parent_dev->bus->parent->self->device & + 0xff00) == 0x5a00) { + mtip_disable_link_opts(dd, + parent_dev->bus->parent->self); + } + } + } +} + /* * Called 
for each supported PCI device detected. * @@ -4630,6 +4417,8 @@ static int mtip_pci_probe(struct pci_dev *pdev, goto msi_initialize_err; } + mtip_fix_ero_nosnoop(dd, pdev); + /* Initialize the block layer. */ rv = mtip_block_initialize(dd); if (rv < 0) { @@ -4710,8 +4499,6 @@ static void mtip_pci_remove(struct pci_dev *pdev) dev_warn(&dd->pdev->dev, "Completion workers still active!\n"); } - /* Cleanup the outstanding commands */ - mtip_command_cleanup(dd); /* Clean up the block layer. */ mtip_block_remove(dd); @@ -4737,8 +4524,6 @@ static void mtip_pci_remove(struct pci_dev *pdev) pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); pci_set_drvdata(pdev, NULL); - pci_dev_put(pdev); - } /* @@ -4935,13 +4720,13 @@ static int __init mtip_init(void) */ static void __exit mtip_exit(void) { - debugfs_remove_recursive(dfs_parent); - /* Release the allocated major block device number. */ unregister_blkdev(mtip_major, MTIP_DRV_NAME); /* Unregister the PCI driver. */ pci_unregister_driver(&mtip_pci_driver); + + debugfs_remove_recursive(dfs_parent); } MODULE_AUTHOR("Micron Technology, Inc"); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index ffb955e7ccb9..4b9b554234bc 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -40,9 +40,11 @@ #define MTIP_MAX_RETRIES 2 /* Various timeout values in ms */ -#define MTIP_NCQ_COMMAND_TIMEOUT_MS 5000 -#define MTIP_IOCTL_COMMAND_TIMEOUT_MS 5000 -#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS 5000 +#define MTIP_NCQ_CMD_TIMEOUT_MS 15000 +#define MTIP_IOCTL_CMD_TIMEOUT_MS 5000 +#define MTIP_INT_CMD_TIMEOUT_MS 5000 +#define MTIP_QUIESCE_IO_TIMEOUT_MS (MTIP_NCQ_CMD_TIMEOUT_MS * \ + (MTIP_MAX_RETRIES + 1)) /* check for timeouts every 500ms */ #define MTIP_TIMEOUT_CHECK_PERIOD 500 @@ -331,12 +333,8 @@ struct mtip_cmd { */ void (*comp_func)(struct mtip_port *port, int tag, - void *data, + struct mtip_cmd *cmd, int status); - /* Additional callback function that may be called by 
comp_func() */ - void (*async_callback)(void *data, int status); - - void *async_data; /* Addl. data passed to async_callback() */ int scatter_ents; /* Number of scatter list entries used */ @@ -347,10 +345,6 @@ struct mtip_cmd { int retries; /* The number of retries left for this command. */ int direction; /* Data transfer direction */ - - unsigned long comp_time; /* command completion time, in jiffies */ - - atomic_t active; /* declares if this command sent to the drive. */ }; /* Structure used to describe a port. */ @@ -436,12 +430,6 @@ struct mtip_port { * or error handling is active */ unsigned long cmds_to_issue[SLOTBITS_IN_LONGS]; - /* - * Array of command slots. Structure includes pointers to the - * command header and command table, and completion function and data - * pointers. - */ - struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS]; /* Used by mtip_service_thread to wait for an event */ wait_queue_head_t svc_wait; /* @@ -452,13 +440,7 @@ struct mtip_port { /* * Timer used to complete commands that have been active for too long. */ - struct timer_list cmd_timer; unsigned long ic_pause_timer; - /* - * Semaphore used to block threads if there are no - * command slots available. - */ - struct semaphore cmd_slot; /* Semaphore to control queue depth of unaligned IOs */ struct semaphore cmd_slot_unal; @@ -485,6 +467,8 @@ struct driver_data { struct request_queue *queue; /* Our request queue. */ + struct blk_mq_tag_set tags; /* blk_mq tags */ + struct mtip_port *port; /* Pointer to the port data structure. 
*/ unsigned product_type; /* magic value declaring the product type */ diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 091b9ea14feb..77087a29b127 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -32,6 +32,7 @@ struct nullb { unsigned int index; struct request_queue *q; struct gendisk *disk; + struct blk_mq_tag_set tag_set; struct hrtimer timer; unsigned int queue_depth; spinlock_t lock; @@ -202,8 +203,8 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) entry = llist_reverse_order(entry); do { cmd = container_of(entry, struct nullb_cmd, ll_list); - end_cmd(cmd); entry = entry->next; + end_cmd(cmd); } while (entry); } @@ -226,7 +227,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd) static void null_softirq_done_fn(struct request *rq) { - end_cmd(rq->special); + end_cmd(blk_mq_rq_to_pdu(rq)); } static inline void null_handle_cmd(struct nullb_cmd *cmd) @@ -311,7 +312,7 @@ static void null_request_fn(struct request_queue *q) static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct nullb_cmd *cmd = rq->special; + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); cmd->rq = rq; cmd->nq = hctx->driver_data; @@ -320,46 +321,6 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) return BLK_MQ_RQ_QUEUE_OK; } -static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index) -{ - int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes); - int tip = (reg->nr_hw_queues % nr_online_nodes); - int node = 0, i, n; - - /* - * Split submit queues evenly wrt to the number of nodes. If uneven, - * fill the first buckets with one extra, until the rest is filled with - * no extra. 
- */ - for (i = 0, n = 1; i < hctx_index; i++, n++) { - if (n % b_size == 0) { - n = 0; - node++; - - tip--; - if (!tip) - b_size = reg->nr_hw_queues / nr_online_nodes; - } - } - - /* - * A node might not be online, therefore map the relative node id to the - * real node id. - */ - for_each_online_node(n) { - if (!node) - break; - node--; - } - - return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n); -} - -static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index) -{ - kfree(hctx); -} - static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) { BUG_ON(!nullb); @@ -389,19 +350,14 @@ static struct blk_mq_ops null_mq_ops = { .complete = null_softirq_done_fn, }; -static struct blk_mq_reg null_mq_reg = { - .ops = &null_mq_ops, - .queue_depth = 64, - .cmd_size = sizeof(struct nullb_cmd), - .flags = BLK_MQ_F_SHOULD_MERGE, -}; - static void null_del_dev(struct nullb *nullb) { list_del_init(&nullb->list); del_gendisk(nullb->disk); blk_cleanup_queue(nullb->q); + if (queue_mode == NULL_Q_MQ) + blk_mq_free_tag_set(&nullb->tag_set); put_disk(nullb->disk); kfree(nullb); } @@ -506,7 +462,7 @@ static int null_add_dev(void) nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); if (!nullb) - return -ENOMEM; + goto out; spin_lock_init(&nullb->lock); @@ -514,49 +470,44 @@ static int null_add_dev(void) submit_queues = nr_online_nodes; if (setup_queues(nullb)) - goto err; + goto out_free_nullb; if (queue_mode == NULL_Q_MQ) { - null_mq_reg.numa_node = home_node; - null_mq_reg.queue_depth = hw_queue_depth; - null_mq_reg.nr_hw_queues = submit_queues; - - if (use_per_node_hctx) { - null_mq_reg.ops->alloc_hctx = null_alloc_hctx; - null_mq_reg.ops->free_hctx = null_free_hctx; - } else { - null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue; - null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue; - } - - nullb->q = blk_mq_init_queue(&null_mq_reg, nullb); + nullb->tag_set.ops = &null_mq_ops; + nullb->tag_set.nr_hw_queues = 
submit_queues; + nullb->tag_set.queue_depth = hw_queue_depth; + nullb->tag_set.numa_node = home_node; + nullb->tag_set.cmd_size = sizeof(struct nullb_cmd); + nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + nullb->tag_set.driver_data = nullb; + + if (blk_mq_alloc_tag_set(&nullb->tag_set)) + goto out_cleanup_queues; + + nullb->q = blk_mq_init_queue(&nullb->tag_set); + if (!nullb->q) + goto out_cleanup_tags; } else if (queue_mode == NULL_Q_BIO) { nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); + if (!nullb->q) + goto out_cleanup_queues; blk_queue_make_request(nullb->q, null_queue_bio); init_driver_queues(nullb); } else { nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); + if (!nullb->q) + goto out_cleanup_queues; blk_queue_prep_rq(nullb->q, null_rq_prep_fn); - if (nullb->q) - blk_queue_softirq_done(nullb->q, null_softirq_done_fn); + blk_queue_softirq_done(nullb->q, null_softirq_done_fn); init_driver_queues(nullb); } - if (!nullb->q) - goto queue_fail; - nullb->q->queuedata = nullb; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); disk = nullb->disk = alloc_disk_node(1, home_node); - if (!disk) { -queue_fail: - blk_cleanup_queue(nullb->q); - cleanup_queues(nullb); -err: - kfree(nullb); - return -ENOMEM; - } + if (!disk) + goto out_cleanup_blk_queue; mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); @@ -579,6 +530,18 @@ err: sprintf(disk->disk_name, "nullb%d", nullb->index); add_disk(disk); return 0; + +out_cleanup_blk_queue: + blk_cleanup_queue(nullb->q); +out_cleanup_tags: + if (queue_mode == NULL_Q_MQ) + blk_mq_free_tag_set(&nullb->tag_set); +out_cleanup_queues: + cleanup_queues(nullb); +out_free_nullb: + kfree(nullb); +out: + return -ENOMEM; } static int __init null_init(void) diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index e76bdc074dbe..719cb1bc1640 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -747,7 +747,7 @@ static void do_pcd_request(struct request_queue * 
q) pcd_current = cd; pcd_sector = blk_rq_pos(pcd_req); pcd_count = blk_rq_cur_sectors(pcd_req); - pcd_buf = pcd_req->buffer; + pcd_buf = bio_data(pcd_req->bio); pcd_busy = 1; ps_set_intr(do_pcd_read, NULL, 0, nice); return; diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 19ad8f0c83ef..fea7e76a00de 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -454,7 +454,7 @@ static enum action do_pd_io_start(void) if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) return Fail; pd_run = blk_rq_sectors(pd_req); - pd_buf = pd_req->buffer; + pd_buf = bio_data(pd_req->bio); pd_retries = 0; if (pd_cmd == READ) return do_pd_read_start(); @@ -485,7 +485,7 @@ static int pd_next_buf(void) spin_lock_irqsave(&pd_lock, saved_flags); __blk_end_request_cur(pd_req, 0); pd_count = blk_rq_cur_sectors(pd_req); - pd_buf = pd_req->buffer; + pd_buf = bio_data(pd_req->bio); spin_unlock_irqrestore(&pd_lock, saved_flags); return 0; } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index f5c86d523ba0..9a15fd3c9349 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -795,7 +795,7 @@ repeat: } pf_cmd = rq_data_dir(pf_req); - pf_buf = pf_req->buffer; + pf_buf = bio_data(pf_req->bio); pf_retries = 0; pf_busy = 1; @@ -827,7 +827,7 @@ static int pf_next_buf(void) if (!pf_req) return 1; pf_count = blk_rq_cur_sectors(pf_req); - pf_buf = pf_req->buffer; + pf_buf = bio_data(pf_req->bio); } return 0; } diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a69dd93d1bd5..608532d3f8c9 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -563,7 +563,6 @@ skd_prep_discard_cdb(struct skd_scsi_request *scsi_req, req = skreq->req; blk_add_request_payload(req, page, len); - req->buffer = buf; } static void skd_request_fn_not_online(struct request_queue *q); @@ -744,6 +743,7 @@ static void skd_request_fn(struct request_queue *q) break; } skreq->discard_page = 1; + req->completion_data = page; 
skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { @@ -858,8 +858,7 @@ static void skd_end_request(struct skd_device *skdev, (skreq->discard_page == 1)) { pr_debug("%s:%s:%d, free the page!", skdev->name, __func__, __LINE__); - free_page((unsigned long)req->buffer); - req->buffer = NULL; + __free_page(req->completion_data); } if (unlikely(error)) { @@ -3945,15 +3944,14 @@ static int skd_acquire_msix(struct skd_device *skdev) for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) entries[i].entry = i; - rc = pci_enable_msix_range(pdev, entries, - SKD_MIN_MSIX_COUNT, SKD_MAX_MSIX_COUNT); - if (rc < 0) { + rc = pci_enable_msix_exact(pdev, entries, SKD_MAX_MSIX_COUNT); + if (rc) { pr_err("(%s): failed to enable MSI-X %d\n", skd_name(skdev), rc); goto msix_out; } - skdev->msix_count = rc; + skdev->msix_count = SKD_MAX_MSIX_COUNT; skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) * skdev->msix_count, GFP_KERNEL); if (!skdev->msix_entries) { diff --git a/drivers/block/swim.c b/drivers/block/swim.c index b02d53a399f3..6b44bbe528b7 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -549,7 +549,7 @@ static void redo_fd_request(struct request_queue *q) case READ: err = floppy_read_sectors(fs, blk_rq_pos(req), blk_rq_cur_sectors(req), - req->buffer); + bio_data(req->bio)); break; } done: diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c74f7b56e7c4..523ee8fd4c15 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -342,7 +342,7 @@ static void start_request(struct floppy_state *fs) swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n", req->rq_disk->disk_name, req->cmd, (long)blk_rq_pos(req), blk_rq_sectors(req), - req->buffer); + bio_data(req->bio)); swim3_dbg(" errors=%d current_nr_sectors=%u\n", req->errors, blk_rq_cur_sectors(req)); #endif @@ -479,11 +479,11 @@ static inline void setup_transfer(struct floppy_state *fs) /* Set up 3 dma commands: write preamble, data, 
postamble */ init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble)); ++cp; - init_dma(cp, OUTPUT_MORE, req->buffer, 512); + init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512); ++cp; init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble)); } else { - init_dma(cp, INPUT_LAST, req->buffer, n * 512); + init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512); } ++cp; out_le16(&cp->command, DBDMA_STOP); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6d8a87f252de..c8f286e8d80f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -30,6 +30,9 @@ struct virtio_blk /* The disk structure for the kernel. */ struct gendisk *disk; + /* Block layer tags. */ + struct blk_mq_tag_set tag_set; + /* Process context for config space updates */ struct work_struct config_work; @@ -112,7 +115,7 @@ static int __virtblk_add_req(struct virtqueue *vq, static inline void virtblk_request_done(struct request *req) { - struct virtblk_req *vbr = req->special; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); int error = virtblk_result(vbr); if (req->cmd_type == REQ_TYPE_BLOCK_PC) { @@ -144,17 +147,17 @@ static void virtblk_done(struct virtqueue *vq) if (unlikely(virtqueue_is_broken(vq))) break; } while (!virtqueue_enable_cb(vq)); - spin_unlock_irqrestore(&vblk->vq_lock, flags); /* In case queue is stopped waiting for more buffers. 
*/ if (req_done) - blk_mq_start_stopped_hw_queues(vblk->disk->queue); + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + spin_unlock_irqrestore(&vblk->vq_lock, flags); } static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) { struct virtio_blk *vblk = hctx->queue->queuedata; - struct virtblk_req *vbr = req->special; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; unsigned int num; const bool last = (req->cmd_flags & REQ_END) != 0; @@ -202,8 +205,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num); if (err) { virtqueue_kick(vblk->vq); - spin_unlock_irqrestore(&vblk->vq_lock, flags); blk_mq_stop_hw_queue(hctx); + spin_unlock_irqrestore(&vblk->vq_lock, flags); /* Out of mem doesn't actually happen, since we fall back * to direct descriptors */ if (err == -ENOMEM || err == -ENOSPC) @@ -480,33 +483,27 @@ static const struct device_attribute dev_attr_cache_type_rw = __ATTR(cache_type, S_IRUGO|S_IWUSR, virtblk_cache_type_show, virtblk_cache_type_store); -static struct blk_mq_ops virtio_mq_ops = { - .queue_rq = virtio_queue_rq, - .map_queue = blk_mq_map_queue, - .alloc_hctx = blk_mq_alloc_single_hw_queue, - .free_hctx = blk_mq_free_single_hw_queue, - .complete = virtblk_request_done, -}; - -static struct blk_mq_reg virtio_mq_reg = { - .ops = &virtio_mq_ops, - .nr_hw_queues = 1, - .queue_depth = 0, /* Set in virtblk_probe */ - .numa_node = NUMA_NO_NODE, - .flags = BLK_MQ_F_SHOULD_MERGE, -}; -module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444); - -static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx, - struct request *rq, unsigned int nr) +static int virtblk_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) { struct virtio_blk *vblk = data; - struct virtblk_req *vbr = rq->special; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); 
sg_init_table(vbr->sg, vblk->sg_elems); return 0; } +static struct blk_mq_ops virtio_mq_ops = { + .queue_rq = virtio_queue_rq, + .map_queue = blk_mq_map_queue, + .complete = virtblk_request_done, + .init_request = virtblk_init_request, +}; + +static unsigned int virtblk_queue_depth; +module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); + static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -561,24 +558,34 @@ static int virtblk_probe(struct virtio_device *vdev) } /* Default queue sizing is to fill the ring. */ - if (!virtio_mq_reg.queue_depth) { - virtio_mq_reg.queue_depth = vblk->vq->num_free; + if (!virtblk_queue_depth) { + virtblk_queue_depth = vblk->vq->num_free; /* ... but without indirect descs, we use 2 descs per req */ if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) - virtio_mq_reg.queue_depth /= 2; + virtblk_queue_depth /= 2; } - virtio_mq_reg.cmd_size = + + memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); + vblk->tag_set.ops = &virtio_mq_ops; + vblk->tag_set.nr_hw_queues = 1; + vblk->tag_set.queue_depth = virtblk_queue_depth; + vblk->tag_set.numa_node = NUMA_NO_NODE; + vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + vblk->tag_set.cmd_size = sizeof(struct virtblk_req) + sizeof(struct scatterlist) * sg_elems; + vblk->tag_set.driver_data = vblk; - q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk); + err = blk_mq_alloc_tag_set(&vblk->tag_set); + if (err) + goto out_put_disk; + + q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set); if (!q) { err = -ENOMEM; - goto out_put_disk; + goto out_free_tags; } - blk_mq_init_commands(q, virtblk_init_vbr, vblk); - q->queuedata = vblk; virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); @@ -679,6 +686,8 @@ static int virtblk_probe(struct virtio_device *vdev) out_del_disk: del_gendisk(vblk->disk); blk_cleanup_queue(vblk->disk->queue); +out_free_tags: + blk_mq_free_tag_set(&vblk->tag_set); out_put_disk: put_disk(vblk->disk); 
out_free_vq: @@ -705,6 +714,8 @@ static void virtblk_remove(struct virtio_device *vdev) del_gendisk(vblk->disk); blk_cleanup_queue(vblk->disk->queue); + blk_mq_free_tag_set(&vblk->tag_set); + /* Stop all the virtqueues. */ vdev->config->reset(vdev); @@ -749,7 +760,7 @@ static int virtblk_restore(struct virtio_device *vdev) vblk->config_enable = true; ret = init_vq(vdev->priv); if (!ret) - blk_mq_start_stopped_hw_queues(vblk->disk->queue); + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); return ret; } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 25c11ad34184..5deb235bd18f 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -612,10 +612,10 @@ static void do_blkif_request(struct request_queue *rq) } pr_debug("do_blk_req %p: cmd %p, sec %lx, " - "(%u/%u) buffer:%p [%s]\n", + "(%u/%u) [%s]\n", req, req->cmd, (unsigned long)blk_rq_pos(req), blk_rq_cur_sectors(req), blk_rq_sectors(req), - req->buffer, rq_data_dir(req) ? "write" : "read"); + rq_data_dir(req) ? 
"write" : "read"); if (blkif_queue_request(req)) { blk_requeue_request(rq, req); diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 1393b8871a28..ab3ea62e5dfc 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -661,7 +661,7 @@ static void ace_fsm_dostate(struct ace_device *ace) rq_data_dir(req)); ace->req = req; - ace->data_ptr = req->buffer; + ace->data_ptr = bio_data(req->bio); ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR; ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF); @@ -733,7 +733,7 @@ static void ace_fsm_dostate(struct ace_device *ace) * blk_rq_sectors(ace->req), * blk_rq_cur_sectors(ace->req)); */ - ace->data_ptr = ace->req->buffer; + ace->data_ptr = bio_data(ace->req->bio); ace->data_count = blk_rq_cur_sectors(ace->req) * 16; ace_fsm_yieldirq(ace); break; diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 27de5046708a..968f9e52effa 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -87,13 +87,15 @@ static void do_z2_request(struct request_queue *q) while (len) { unsigned long addr = start & Z2RAM_CHUNKMASK; unsigned long size = Z2RAM_CHUNKSIZE - addr; + void *buffer = bio_data(req->bio); + if (len < size) size = len; addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ]; if (rq_data_dir(req) == READ) - memcpy(req->buffer, (char *)addr, size); + memcpy(buffer, (char *)addr, size); else - memcpy((char *)addr, req->buffer, size); + memcpy((char *)addr, buffer, size); start += size; len -= size; } diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 8a3aff724d98..49ac5662585b 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -312,36 +312,24 @@ static const char *mrw_format_status[] = { static const char *mrw_address_space[] = { "DMA", "GAA" }; -#if (ERRLOGMASK!=CD_NOTHING) -#define cdinfo(type, fmt, args...) \ +#if (ERRLOGMASK != CD_NOTHING) +#define cd_dbg(type, fmt, ...) 
\ do { \ if ((ERRLOGMASK & type) || debug == 1) \ - pr_info(fmt, ##args); \ + pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #else -#define cdinfo(type, fmt, args...) \ +#define cd_dbg(type, fmt, ...) \ do { \ if (0 && (ERRLOGMASK & type) || debug == 1) \ - pr_info(fmt, ##args); \ + pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #endif -/* These are used to simplify getting data in from and back to user land */ -#define IOCTL_IN(arg, type, in) \ - if (copy_from_user(&(in), (type __user *) (arg), sizeof (in))) \ - return -EFAULT; - -#define IOCTL_OUT(arg, type, out) \ - if (copy_to_user((type __user *) (arg), &(out), sizeof (out))) \ - return -EFAULT; - /* The (cdo->capability & ~cdi->mask & CDC_XXX) construct was used in a lot of places. This macro makes the code more clear. */ #define CDROM_CAN(type) (cdi->ops->capability & ~cdi->mask & (type)) -/* used in the audio ioctls */ -#define CHECKAUDIO if ((ret=check_for_audio_disc(cdi, cdo))) return ret - /* * Another popular OS uses 7 seconds as the hard timeout for default * commands, so it is a good choice for us as well. @@ -349,21 +337,6 @@ do { \ #define CDROM_DEF_TIMEOUT (7 * HZ) /* Not-exported routines. 
*/ -static int open_for_data(struct cdrom_device_info * cdi); -static int check_for_audio_disc(struct cdrom_device_info * cdi, - struct cdrom_device_ops * cdo); -static void sanitize_format(union cdrom_addr *addr, - u_char * curr, u_char requested); -static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd, - unsigned long arg); - -int cdrom_get_last_written(struct cdrom_device_info *, long *); -static int cdrom_get_next_writable(struct cdrom_device_info *, long *); -static void cdrom_count_tracks(struct cdrom_device_info *, tracktype*); - -static int cdrom_mrw_exit(struct cdrom_device_info *cdi); - -static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *di); static void cdrom_sysctl_register(void); @@ -382,113 +355,65 @@ static int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, return -EIO; } -/* This macro makes sure we don't have to check on cdrom_device_ops - * existence in the run-time routines below. Change_capability is a - * hack to have the capability flags defined const, while we can still - * change it here without gcc complaining at every line. 
- */ -#define ENSURE(call, bits) if (cdo->call == NULL) *change_capability &= ~(bits) - -int register_cdrom(struct cdrom_device_info *cdi) -{ - static char banner_printed; - struct cdrom_device_ops *cdo = cdi->ops; - int *change_capability = (int *)&cdo->capability; /* hack */ - - cdinfo(CD_OPEN, "entering register_cdrom\n"); - - if (cdo->open == NULL || cdo->release == NULL) - return -EINVAL; - if (!banner_printed) { - pr_info("Uniform CD-ROM driver " REVISION "\n"); - banner_printed = 1; - cdrom_sysctl_register(); - } - - ENSURE(drive_status, CDC_DRIVE_STATUS ); - if (cdo->check_events == NULL && cdo->media_changed == NULL) - *change_capability = ~(CDC_MEDIA_CHANGED | CDC_SELECT_DISC); - ENSURE(tray_move, CDC_CLOSE_TRAY | CDC_OPEN_TRAY); - ENSURE(lock_door, CDC_LOCK); - ENSURE(select_speed, CDC_SELECT_SPEED); - ENSURE(get_last_session, CDC_MULTI_SESSION); - ENSURE(get_mcn, CDC_MCN); - ENSURE(reset, CDC_RESET); - ENSURE(generic_packet, CDC_GENERIC_PACKET); - cdi->mc_flags = 0; - cdo->n_minors = 0; - cdi->options = CDO_USE_FFLAGS; - - if (autoclose==1 && CDROM_CAN(CDC_CLOSE_TRAY)) - cdi->options |= (int) CDO_AUTO_CLOSE; - if (autoeject==1 && CDROM_CAN(CDC_OPEN_TRAY)) - cdi->options |= (int) CDO_AUTO_EJECT; - if (lockdoor==1) - cdi->options |= (int) CDO_LOCK; - if (check_media_type==1) - cdi->options |= (int) CDO_CHECK_TYPE; - - if (CDROM_CAN(CDC_MRW_W)) - cdi->exit = cdrom_mrw_exit; - - if (cdi->disk) - cdi->cdda_method = CDDA_BPC_FULL; - else - cdi->cdda_method = CDDA_OLD; - - if (!cdo->generic_packet) - cdo->generic_packet = cdrom_dummy_generic_packet; - - cdinfo(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name); - mutex_lock(&cdrom_mutex); - list_add(&cdi->list, &cdrom_list); - mutex_unlock(&cdrom_mutex); - return 0; -} -#undef ENSURE - -void unregister_cdrom(struct cdrom_device_info *cdi) +static int cdrom_flush_cache(struct cdrom_device_info *cdi) { - cdinfo(CD_OPEN, "entering unregister_cdrom\n"); + struct packet_command cgc; - 
mutex_lock(&cdrom_mutex); - list_del(&cdi->list); - mutex_unlock(&cdrom_mutex); + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.cmd[0] = GPCMD_FLUSH_CACHE; - if (cdi->exit) - cdi->exit(cdi); + cgc.timeout = 5 * 60 * HZ; - cdi->ops->n_minors--; - cdinfo(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name); + return cdi->ops->generic_packet(cdi, &cgc); } -int cdrom_get_media_event(struct cdrom_device_info *cdi, - struct media_event_desc *med) +/* requires CD R/RW */ +static int cdrom_get_disc_info(struct cdrom_device_info *cdi, + disc_information *di) { + struct cdrom_device_ops *cdo = cdi->ops; struct packet_command cgc; - unsigned char buffer[8]; - struct event_header *eh = (struct event_header *) buffer; + int ret, buflen; - init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ); - cgc.cmd[0] = GPCMD_GET_EVENT_STATUS_NOTIFICATION; - cgc.cmd[1] = 1; /* IMMED */ - cgc.cmd[4] = 1 << 4; /* media event */ - cgc.cmd[8] = sizeof(buffer); + /* set up command and get the disc info */ + init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ); + cgc.cmd[0] = GPCMD_READ_DISC_INFO; + cgc.cmd[8] = cgc.buflen = 2; cgc.quiet = 1; - if (cdi->ops->generic_packet(cdi, &cgc)) - return 1; + ret = cdo->generic_packet(cdi, &cgc); + if (ret) + return ret; - if (be16_to_cpu(eh->data_len) < sizeof(*med)) - return 1; + /* not all drives have the same disc_info length, so requeue + * packet with the length the drive tells us it can supply + */ + buflen = be16_to_cpu(di->disc_information_length) + + sizeof(di->disc_information_length); - if (eh->nea || eh->notification_class != 0x4) - return 1; + if (buflen > sizeof(disc_information)) + buflen = sizeof(disc_information); - memcpy(med, &buffer[sizeof(*eh)], sizeof(*med)); - return 0; + cgc.cmd[8] = cgc.buflen = buflen; + ret = cdo->generic_packet(cdi, &cgc); + if (ret) + return ret; + + /* return actual fill size */ + return buflen; } +/* This macro makes sure we don't have to check on cdrom_device_ops + * existence 
in the run-time routines below. Change_capability is a + * hack to have the capability flags defined const, while we can still + * change it here without gcc complaining at every line. + */ +#define ENSURE(call, bits) \ +do { \ + if (cdo->call == NULL) \ + *change_capability &= ~(bits); \ +} while (0) + /* * the first prototypes used 0x2c as the page code for the mrw mode page, * subsequently this was changed to 0x03. probe the one used by this drive @@ -605,18 +530,6 @@ static int cdrom_mrw_bgformat_susp(struct cdrom_device_info *cdi, int immed) return cdi->ops->generic_packet(cdi, &cgc); } -static int cdrom_flush_cache(struct cdrom_device_info *cdi) -{ - struct packet_command cgc; - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.cmd[0] = GPCMD_FLUSH_CACHE; - - cgc.timeout = 5 * 60 * HZ; - - return cdi->ops->generic_packet(cdi, &cgc); -} - static int cdrom_mrw_exit(struct cdrom_device_info *cdi) { disc_information di; @@ -650,17 +563,19 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space) cgc.buffer = buffer; cgc.buflen = sizeof(buffer); - if ((ret = cdrom_mode_sense(cdi, &cgc, cdi->mrw_mode_page, 0))) + ret = cdrom_mode_sense(cdi, &cgc, cdi->mrw_mode_page, 0); + if (ret) return ret; - mph = (struct mode_page_header *) buffer; + mph = (struct mode_page_header *)buffer; offset = be16_to_cpu(mph->desc_length); size = be16_to_cpu(mph->mode_data_length) + 2; buffer[offset + 3] = space; cgc.buflen = size; - if ((ret = cdrom_mode_select(cdi, &cgc))) + ret = cdrom_mode_select(cdi, &cgc); + if (ret) return ret; pr_info("%s: mrw address space %s selected\n", @@ -668,6 +583,106 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space) return 0; } +int register_cdrom(struct cdrom_device_info *cdi) +{ + static char banner_printed; + struct cdrom_device_ops *cdo = cdi->ops; + int *change_capability = (int *)&cdo->capability; /* hack */ + + cd_dbg(CD_OPEN, "entering register_cdrom\n"); + + if (cdo->open == NULL || 
cdo->release == NULL) + return -EINVAL; + if (!banner_printed) { + pr_info("Uniform CD-ROM driver " REVISION "\n"); + banner_printed = 1; + cdrom_sysctl_register(); + } + + ENSURE(drive_status, CDC_DRIVE_STATUS); + if (cdo->check_events == NULL && cdo->media_changed == NULL) + *change_capability = ~(CDC_MEDIA_CHANGED | CDC_SELECT_DISC); + ENSURE(tray_move, CDC_CLOSE_TRAY | CDC_OPEN_TRAY); + ENSURE(lock_door, CDC_LOCK); + ENSURE(select_speed, CDC_SELECT_SPEED); + ENSURE(get_last_session, CDC_MULTI_SESSION); + ENSURE(get_mcn, CDC_MCN); + ENSURE(reset, CDC_RESET); + ENSURE(generic_packet, CDC_GENERIC_PACKET); + cdi->mc_flags = 0; + cdo->n_minors = 0; + cdi->options = CDO_USE_FFLAGS; + + if (autoclose == 1 && CDROM_CAN(CDC_CLOSE_TRAY)) + cdi->options |= (int) CDO_AUTO_CLOSE; + if (autoeject == 1 && CDROM_CAN(CDC_OPEN_TRAY)) + cdi->options |= (int) CDO_AUTO_EJECT; + if (lockdoor == 1) + cdi->options |= (int) CDO_LOCK; + if (check_media_type == 1) + cdi->options |= (int) CDO_CHECK_TYPE; + + if (CDROM_CAN(CDC_MRW_W)) + cdi->exit = cdrom_mrw_exit; + + if (cdi->disk) + cdi->cdda_method = CDDA_BPC_FULL; + else + cdi->cdda_method = CDDA_OLD; + + if (!cdo->generic_packet) + cdo->generic_packet = cdrom_dummy_generic_packet; + + cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name); + mutex_lock(&cdrom_mutex); + list_add(&cdi->list, &cdrom_list); + mutex_unlock(&cdrom_mutex); + return 0; +} +#undef ENSURE + +void unregister_cdrom(struct cdrom_device_info *cdi) +{ + cd_dbg(CD_OPEN, "entering unregister_cdrom\n"); + + mutex_lock(&cdrom_mutex); + list_del(&cdi->list); + mutex_unlock(&cdrom_mutex); + + if (cdi->exit) + cdi->exit(cdi); + + cdi->ops->n_minors--; + cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name); +} + +int cdrom_get_media_event(struct cdrom_device_info *cdi, + struct media_event_desc *med) +{ + struct packet_command cgc; + unsigned char buffer[8]; + struct event_header *eh = (struct event_header *)buffer; + + init_cdrom_command(&cgc, 
buffer, sizeof(buffer), CGC_DATA_READ); + cgc.cmd[0] = GPCMD_GET_EVENT_STATUS_NOTIFICATION; + cgc.cmd[1] = 1; /* IMMED */ + cgc.cmd[4] = 1 << 4; /* media event */ + cgc.cmd[8] = sizeof(buffer); + cgc.quiet = 1; + + if (cdi->ops->generic_packet(cdi, &cgc)) + return 1; + + if (be16_to_cpu(eh->data_len) < sizeof(*med)) + return 1; + + if (eh->nea || eh->notification_class != 0x4) + return 1; + + memcpy(med, &buffer[sizeof(*eh)], sizeof(*med)); + return 0; +} + static int cdrom_get_random_writable(struct cdrom_device_info *cdi, struct rwrt_feature_desc *rfd) { @@ -839,7 +854,7 @@ static int cdrom_ram_open_write(struct cdrom_device_info *cdi) else if (CDF_RWRT == be16_to_cpu(rfd.feature_code)) ret = !rfd.curr; - cdinfo(CD_OPEN, "can open for random write\n"); + cd_dbg(CD_OPEN, "can open for random write\n"); return ret; } @@ -928,12 +943,12 @@ static void cdrom_dvd_rw_close_write(struct cdrom_device_info *cdi) struct packet_command cgc; if (cdi->mmc3_profile != 0x1a) { - cdinfo(CD_CLOSE, "%s: No DVD+RW\n", cdi->name); + cd_dbg(CD_CLOSE, "%s: No DVD+RW\n", cdi->name); return; } if (!cdi->media_written) { - cdinfo(CD_CLOSE, "%s: DVD+RW media clean\n", cdi->name); + cd_dbg(CD_CLOSE, "%s: DVD+RW media clean\n", cdi->name); return; } @@ -969,82 +984,74 @@ static int cdrom_close_write(struct cdrom_device_info *cdi) #endif } -/* We use the open-option O_NONBLOCK to indicate that the - * purpose of opening is only for subsequent ioctl() calls; no device - * integrity checks are performed. - * - * We hope that all cd-player programs will adopt this convention. It - * is in their own interest: device control becomes a lot easier - * this way. - */ -int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t mode) +/* badly broken, I know. Is due for a fixup anytime. 
*/ +static void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype *tracks) { - int ret; - - cdinfo(CD_OPEN, "entering cdrom_open\n"); - - /* open is event synchronization point, check events first */ - check_disk_change(bdev); - - /* if this was a O_NONBLOCK open and we should honor the flags, - * do a quick open without drive/disc integrity checks. */ - cdi->use_count++; - if ((mode & FMODE_NDELAY) && (cdi->options & CDO_USE_FFLAGS)) { - ret = cdi->ops->open(cdi, 1); - } else { - ret = open_for_data(cdi); - if (ret) - goto err; - cdrom_mmc3_profile(cdi); - if (mode & FMODE_WRITE) { - ret = -EROFS; - if (cdrom_open_write(cdi)) - goto err_release; - if (!CDROM_CAN(CDC_RAM)) - goto err_release; - ret = 0; - cdi->media_written = 0; - } + struct cdrom_tochdr header; + struct cdrom_tocentry entry; + int ret, i; + tracks->data = 0; + tracks->audio = 0; + tracks->cdi = 0; + tracks->xa = 0; + tracks->error = 0; + cd_dbg(CD_COUNT_TRACKS, "entering cdrom_count_tracks\n"); + /* Grab the TOC header so we can see how many tracks there are */ + ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCHDR, &header); + if (ret) { + if (ret == -ENOMEDIUM) + tracks->error = CDS_NO_DISC; + else + tracks->error = CDS_NO_INFO; + return; } - - if (ret) - goto err; - - cdinfo(CD_OPEN, "Use count for \"/dev/%s\" now %d\n", - cdi->name, cdi->use_count); - return 0; -err_release: - if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) { - cdi->ops->lock_door(cdi, 0); - cdinfo(CD_OPEN, "door unlocked.\n"); + /* check what type of tracks are on this disc */ + entry.cdte_format = CDROM_MSF; + for (i = header.cdth_trk0; i <= header.cdth_trk1; i++) { + entry.cdte_track = i; + if (cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &entry)) { + tracks->error = CDS_NO_INFO; + return; + } + if (entry.cdte_ctrl & CDROM_DATA_TRACK) { + if (entry.cdte_format == 0x10) + tracks->cdi++; + else if (entry.cdte_format == 0x20) + tracks->xa++; + else + tracks->data++; + } else { + tracks->audio++; + } + 
cd_dbg(CD_COUNT_TRACKS, "track %d: format=%d, ctrl=%d\n", + i, entry.cdte_format, entry.cdte_ctrl); } - cdi->ops->release(cdi); -err: - cdi->use_count--; - return ret; + cd_dbg(CD_COUNT_TRACKS, "disc has %d tracks: %d=audio %d=data %d=Cd-I %d=XA\n", + header.cdth_trk1, tracks->audio, tracks->data, + tracks->cdi, tracks->xa); } static -int open_for_data(struct cdrom_device_info * cdi) +int open_for_data(struct cdrom_device_info *cdi) { int ret; struct cdrom_device_ops *cdo = cdi->ops; tracktype tracks; - cdinfo(CD_OPEN, "entering open_for_data\n"); + cd_dbg(CD_OPEN, "entering open_for_data\n"); /* Check if the driver can report drive status. If it can, we can do clever things. If it can't, well, we at least tried! */ if (cdo->drive_status != NULL) { ret = cdo->drive_status(cdi, CDSL_CURRENT); - cdinfo(CD_OPEN, "drive_status=%d\n", ret); + cd_dbg(CD_OPEN, "drive_status=%d\n", ret); if (ret == CDS_TRAY_OPEN) { - cdinfo(CD_OPEN, "the tray is open...\n"); + cd_dbg(CD_OPEN, "the tray is open...\n"); /* can/may i close it? */ if (CDROM_CAN(CDC_CLOSE_TRAY) && cdi->options & CDO_AUTO_CLOSE) { - cdinfo(CD_OPEN, "trying to close the tray.\n"); + cd_dbg(CD_OPEN, "trying to close the tray\n"); ret=cdo->tray_move(cdi,0); if (ret) { - cdinfo(CD_OPEN, "bummer. tried to close the tray but failed.\n"); + cd_dbg(CD_OPEN, "bummer. tried to close the tray but failed.\n"); /* Ignore the error from the low level driver. We don't care why it couldn't close the tray. We only care @@ -1054,19 +1061,19 @@ int open_for_data(struct cdrom_device_info * cdi) goto clean_up_and_return; } } else { - cdinfo(CD_OPEN, "bummer. this drive can't close the tray.\n"); + cd_dbg(CD_OPEN, "bummer. this drive can't close the tray.\n"); ret=-ENOMEDIUM; goto clean_up_and_return; } /* Ok, the door should be closed now.. Check again */ ret = cdo->drive_status(cdi, CDSL_CURRENT); if ((ret == CDS_NO_DISC) || (ret==CDS_TRAY_OPEN)) { - cdinfo(CD_OPEN, "bummer. 
the tray is still not closed.\n"); - cdinfo(CD_OPEN, "tray might not contain a medium.\n"); + cd_dbg(CD_OPEN, "bummer. the tray is still not closed.\n"); + cd_dbg(CD_OPEN, "tray might not contain a medium\n"); ret=-ENOMEDIUM; goto clean_up_and_return; } - cdinfo(CD_OPEN, "the tray is now closed.\n"); + cd_dbg(CD_OPEN, "the tray is now closed\n"); } /* the door should be closed now, check for the disc */ ret = cdo->drive_status(cdi, CDSL_CURRENT); @@ -1077,7 +1084,7 @@ int open_for_data(struct cdrom_device_info * cdi) } cdrom_count_tracks(cdi, &tracks); if (tracks.error == CDS_NO_DISC) { - cdinfo(CD_OPEN, "bummer. no disc.\n"); + cd_dbg(CD_OPEN, "bummer. no disc.\n"); ret=-ENOMEDIUM; goto clean_up_and_return; } @@ -1087,34 +1094,34 @@ int open_for_data(struct cdrom_device_info * cdi) if (cdi->options & CDO_CHECK_TYPE) { /* give people a warning shot, now that CDO_CHECK_TYPE is the default case! */ - cdinfo(CD_OPEN, "bummer. wrong media type.\n"); - cdinfo(CD_WARNING, "pid %d must open device O_NONBLOCK!\n", - (unsigned int)task_pid_nr(current)); + cd_dbg(CD_OPEN, "bummer. wrong media type.\n"); + cd_dbg(CD_WARNING, "pid %d must open device O_NONBLOCK!\n", + (unsigned int)task_pid_nr(current)); ret=-EMEDIUMTYPE; goto clean_up_and_return; } else { - cdinfo(CD_OPEN, "wrong media type, but CDO_CHECK_TYPE not set.\n"); + cd_dbg(CD_OPEN, "wrong media type, but CDO_CHECK_TYPE not set\n"); } } - cdinfo(CD_OPEN, "all seems well, opening the device.\n"); + cd_dbg(CD_OPEN, "all seems well, opening the device\n"); /* all seems well, we can open the device */ ret = cdo->open(cdi, 0); /* open for data */ - cdinfo(CD_OPEN, "opening the device gave me %d.\n", ret); + cd_dbg(CD_OPEN, "opening the device gave me %d\n", ret); /* After all this careful checking, we shouldn't have problems opening the device, but we don't want the device locked if this somehow fails... 
*/ if (ret) { - cdinfo(CD_OPEN, "open device failed.\n"); + cd_dbg(CD_OPEN, "open device failed\n"); goto clean_up_and_return; } if (CDROM_CAN(CDC_LOCK) && (cdi->options & CDO_LOCK)) { cdo->lock_door(cdi, 1); - cdinfo(CD_OPEN, "door locked.\n"); + cd_dbg(CD_OPEN, "door locked\n"); } - cdinfo(CD_OPEN, "device opened successfully.\n"); + cd_dbg(CD_OPEN, "device opened successfully\n"); return ret; /* Something failed. Try to unlock the drive, because some drivers @@ -1123,14 +1130,70 @@ int open_for_data(struct cdrom_device_info * cdi) This ensures that the drive gets unlocked after a mount fails. This is a goto to avoid bloating the driver with redundant code. */ clean_up_and_return: - cdinfo(CD_OPEN, "open failed.\n"); + cd_dbg(CD_OPEN, "open failed\n"); if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) { cdo->lock_door(cdi, 0); - cdinfo(CD_OPEN, "door unlocked.\n"); + cd_dbg(CD_OPEN, "door unlocked\n"); } return ret; } +/* We use the open-option O_NONBLOCK to indicate that the + * purpose of opening is only for subsequent ioctl() calls; no device + * integrity checks are performed. + * + * We hope that all cd-player programs will adopt this convention. It + * is in their own interest: device control becomes a lot easier + * this way. + */ +int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, + fmode_t mode) +{ + int ret; + + cd_dbg(CD_OPEN, "entering cdrom_open\n"); + + /* open is event synchronization point, check events first */ + check_disk_change(bdev); + + /* if this was a O_NONBLOCK open and we should honor the flags, + * do a quick open without drive/disc integrity checks. 
*/ + cdi->use_count++; + if ((mode & FMODE_NDELAY) && (cdi->options & CDO_USE_FFLAGS)) { + ret = cdi->ops->open(cdi, 1); + } else { + ret = open_for_data(cdi); + if (ret) + goto err; + cdrom_mmc3_profile(cdi); + if (mode & FMODE_WRITE) { + ret = -EROFS; + if (cdrom_open_write(cdi)) + goto err_release; + if (!CDROM_CAN(CDC_RAM)) + goto err_release; + ret = 0; + cdi->media_written = 0; + } + } + + if (ret) + goto err; + + cd_dbg(CD_OPEN, "Use count for \"/dev/%s\" now %d\n", + cdi->name, cdi->use_count); + return 0; +err_release: + if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) { + cdi->ops->lock_door(cdi, 0); + cd_dbg(CD_OPEN, "door unlocked\n"); + } + cdi->ops->release(cdi); +err: + cdi->use_count--; + return ret; +} + /* This code is similar to that in open_for_data. The routine is called whenever an audio play operation is requested. */ @@ -1139,21 +1202,21 @@ static int check_for_audio_disc(struct cdrom_device_info * cdi, { int ret; tracktype tracks; - cdinfo(CD_OPEN, "entering check_for_audio_disc\n"); + cd_dbg(CD_OPEN, "entering check_for_audio_disc\n"); if (!(cdi->options & CDO_CHECK_TYPE)) return 0; if (cdo->drive_status != NULL) { ret = cdo->drive_status(cdi, CDSL_CURRENT); - cdinfo(CD_OPEN, "drive_status=%d\n", ret); + cd_dbg(CD_OPEN, "drive_status=%d\n", ret); if (ret == CDS_TRAY_OPEN) { - cdinfo(CD_OPEN, "the tray is open...\n"); + cd_dbg(CD_OPEN, "the tray is open...\n"); /* can/may i close it? */ if (CDROM_CAN(CDC_CLOSE_TRAY) && cdi->options & CDO_AUTO_CLOSE) { - cdinfo(CD_OPEN, "trying to close the tray.\n"); + cd_dbg(CD_OPEN, "trying to close the tray\n"); ret=cdo->tray_move(cdi,0); if (ret) { - cdinfo(CD_OPEN, "bummer. tried to close tray but failed.\n"); + cd_dbg(CD_OPEN, "bummer. tried to close tray but failed.\n"); /* Ignore the error from the low level driver. We don't care why it couldn't close the tray. 
We only care @@ -1162,20 +1225,20 @@ static int check_for_audio_disc(struct cdrom_device_info * cdi, return -ENOMEDIUM; } } else { - cdinfo(CD_OPEN, "bummer. this driver can't close the tray.\n"); + cd_dbg(CD_OPEN, "bummer. this driver can't close the tray.\n"); return -ENOMEDIUM; } /* Ok, the door should be closed now.. Check again */ ret = cdo->drive_status(cdi, CDSL_CURRENT); if ((ret == CDS_NO_DISC) || (ret==CDS_TRAY_OPEN)) { - cdinfo(CD_OPEN, "bummer. the tray is still not closed.\n"); + cd_dbg(CD_OPEN, "bummer. the tray is still not closed.\n"); return -ENOMEDIUM; } if (ret!=CDS_DISC_OK) { - cdinfo(CD_OPEN, "bummer. disc isn't ready.\n"); + cd_dbg(CD_OPEN, "bummer. disc isn't ready.\n"); return -EIO; } - cdinfo(CD_OPEN, "the tray is now closed.\n"); + cd_dbg(CD_OPEN, "the tray is now closed\n"); } } cdrom_count_tracks(cdi, &tracks); @@ -1193,17 +1256,18 @@ void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode) struct cdrom_device_ops *cdo = cdi->ops; int opened_for_data; - cdinfo(CD_CLOSE, "entering cdrom_release\n"); + cd_dbg(CD_CLOSE, "entering cdrom_release\n"); if (cdi->use_count > 0) cdi->use_count--; if (cdi->use_count == 0) { - cdinfo(CD_CLOSE, "Use count for \"/dev/%s\" now zero\n", cdi->name); + cd_dbg(CD_CLOSE, "Use count for \"/dev/%s\" now zero\n", + cdi->name); cdrom_dvd_rw_close_write(cdi); if ((cdo->capability & CDC_LOCK) && !cdi->keeplocked) { - cdinfo(CD_CLOSE, "Unlocking door!\n"); + cd_dbg(CD_CLOSE, "Unlocking door!\n"); cdo->lock_door(cdi, 0); } } @@ -1262,7 +1326,7 @@ static int cdrom_slot_status(struct cdrom_device_info *cdi, int slot) struct cdrom_changer_info *info; int ret; - cdinfo(CD_CHANGER, "entering cdrom_slot_status()\n"); + cd_dbg(CD_CHANGER, "entering cdrom_slot_status()\n"); if (cdi->sanyo_slot) return CDS_NO_INFO; @@ -1292,7 +1356,7 @@ int cdrom_number_of_slots(struct cdrom_device_info *cdi) int nslots = 1; struct cdrom_changer_info *info; - cdinfo(CD_CHANGER, "entering cdrom_number_of_slots()\n"); + 
cd_dbg(CD_CHANGER, "entering cdrom_number_of_slots()\n"); /* cdrom_read_mech_status requires a valid value for capacity: */ cdi->capacity = 0; @@ -1313,7 +1377,7 @@ static int cdrom_load_unload(struct cdrom_device_info *cdi, int slot) { struct packet_command cgc; - cdinfo(CD_CHANGER, "entering cdrom_load_unload()\n"); + cd_dbg(CD_CHANGER, "entering cdrom_load_unload()\n"); if (cdi->sanyo_slot && slot < 0) return 0; @@ -1342,7 +1406,7 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot) int curslot; int ret; - cdinfo(CD_CHANGER, "entering cdrom_select_disc()\n"); + cd_dbg(CD_CHANGER, "entering cdrom_select_disc()\n"); if (!CDROM_CAN(CDC_SELECT_DISC)) return -EDRIVE_CANT_DO_THIS; @@ -1476,51 +1540,6 @@ int cdrom_media_changed(struct cdrom_device_info *cdi) return media_changed(cdi, 0); } -/* badly broken, I know. Is due for a fixup anytime. */ -static void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype* tracks) -{ - struct cdrom_tochdr header; - struct cdrom_tocentry entry; - int ret, i; - tracks->data=0; - tracks->audio=0; - tracks->cdi=0; - tracks->xa=0; - tracks->error=0; - cdinfo(CD_COUNT_TRACKS, "entering cdrom_count_tracks\n"); - /* Grab the TOC header so we can see how many tracks there are */ - if ((ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCHDR, &header))) { - if (ret == -ENOMEDIUM) - tracks->error = CDS_NO_DISC; - else - tracks->error = CDS_NO_INFO; - return; - } - /* check what type of tracks are on this disc */ - entry.cdte_format = CDROM_MSF; - for (i = header.cdth_trk0; i <= header.cdth_trk1; i++) { - entry.cdte_track = i; - if (cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &entry)) { - tracks->error=CDS_NO_INFO; - return; - } - if (entry.cdte_ctrl & CDROM_DATA_TRACK) { - if (entry.cdte_format == 0x10) - tracks->cdi++; - else if (entry.cdte_format == 0x20) - tracks->xa++; - else - tracks->data++; - } else - tracks->audio++; - cdinfo(CD_COUNT_TRACKS, "track %d: format=%d, ctrl=%d\n", - i, entry.cdte_format, 
entry.cdte_ctrl); - } - cdinfo(CD_COUNT_TRACKS, "disc has %d tracks: %d=audio %d=data %d=Cd-I %d=XA\n", - header.cdth_trk1, tracks->audio, tracks->data, - tracks->cdi, tracks->xa); -} - /* Requests to the low-level drivers will /always/ be done in the following format convention: @@ -1632,7 +1651,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) switch (ai->type) { /* LU data send */ case DVD_LU_SEND_AGID: - cdinfo(CD_DVD, "entering DVD_LU_SEND_AGID\n"); + cd_dbg(CD_DVD, "entering DVD_LU_SEND_AGID\n"); cgc.quiet = 1; setup_report_key(&cgc, ai->lsa.agid, 0); @@ -1644,7 +1663,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) break; case DVD_LU_SEND_KEY1: - cdinfo(CD_DVD, "entering DVD_LU_SEND_KEY1\n"); + cd_dbg(CD_DVD, "entering DVD_LU_SEND_KEY1\n"); setup_report_key(&cgc, ai->lsk.agid, 2); if ((ret = cdo->generic_packet(cdi, &cgc))) @@ -1655,7 +1674,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) break; case DVD_LU_SEND_CHALLENGE: - cdinfo(CD_DVD, "entering DVD_LU_SEND_CHALLENGE\n"); + cd_dbg(CD_DVD, "entering DVD_LU_SEND_CHALLENGE\n"); setup_report_key(&cgc, ai->lsc.agid, 1); if ((ret = cdo->generic_packet(cdi, &cgc))) @@ -1667,7 +1686,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) /* Post-auth key */ case DVD_LU_SEND_TITLE_KEY: - cdinfo(CD_DVD, "entering DVD_LU_SEND_TITLE_KEY\n"); + cd_dbg(CD_DVD, "entering DVD_LU_SEND_TITLE_KEY\n"); cgc.quiet = 1; setup_report_key(&cgc, ai->lstk.agid, 4); cgc.cmd[5] = ai->lstk.lba; @@ -1686,7 +1705,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) break; case DVD_LU_SEND_ASF: - cdinfo(CD_DVD, "entering DVD_LU_SEND_ASF\n"); + cd_dbg(CD_DVD, "entering DVD_LU_SEND_ASF\n"); setup_report_key(&cgc, ai->lsasf.agid, 5); if ((ret = cdo->generic_packet(cdi, &cgc))) @@ -1697,7 +1716,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) /* LU data receive (LU changes state) */ case 
DVD_HOST_SEND_CHALLENGE: - cdinfo(CD_DVD, "entering DVD_HOST_SEND_CHALLENGE\n"); + cd_dbg(CD_DVD, "entering DVD_HOST_SEND_CHALLENGE\n"); setup_send_key(&cgc, ai->hsc.agid, 1); buf[1] = 0xe; copy_chal(&buf[4], ai->hsc.chal); @@ -1709,7 +1728,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) break; case DVD_HOST_SEND_KEY2: - cdinfo(CD_DVD, "entering DVD_HOST_SEND_KEY2\n"); + cd_dbg(CD_DVD, "entering DVD_HOST_SEND_KEY2\n"); setup_send_key(&cgc, ai->hsk.agid, 3); buf[1] = 0xa; copy_key(&buf[4], ai->hsk.key); @@ -1724,7 +1743,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) /* Misc */ case DVD_INVALIDATE_AGID: cgc.quiet = 1; - cdinfo(CD_DVD, "entering DVD_INVALIDATE_AGID\n"); + cd_dbg(CD_DVD, "entering DVD_INVALIDATE_AGID\n"); setup_report_key(&cgc, ai->lsa.agid, 0x3f); if ((ret = cdo->generic_packet(cdi, &cgc))) return ret; @@ -1732,7 +1751,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) /* Get region settings */ case DVD_LU_SEND_RPC_STATE: - cdinfo(CD_DVD, "entering DVD_LU_SEND_RPC_STATE\n"); + cd_dbg(CD_DVD, "entering DVD_LU_SEND_RPC_STATE\n"); setup_report_key(&cgc, 0, 8); memset(&rpc_state, 0, sizeof(rpc_state_t)); cgc.buffer = (char *) &rpc_state; @@ -1749,7 +1768,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) /* Set region settings */ case DVD_HOST_SEND_RPC_STATE: - cdinfo(CD_DVD, "entering DVD_HOST_SEND_RPC_STATE\n"); + cd_dbg(CD_DVD, "entering DVD_HOST_SEND_RPC_STATE\n"); setup_send_key(&cgc, 0, 6); buf[1] = 6; buf[4] = ai->hrpcs.pdrc; @@ -1759,7 +1778,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai) break; default: - cdinfo(CD_WARNING, "Invalid DVD key ioctl (%d)\n", ai->type); + cd_dbg(CD_WARNING, "Invalid DVD key ioctl (%d)\n", ai->type); return -ENOTTY; } @@ -1891,7 +1910,8 @@ static int dvd_read_bca(struct cdrom_device_info *cdi, dvd_struct *s, s->bca.len = buf[0] << 8 | buf[1]; if (s->bca.len < 12 || s->bca.len > 
188) { - cdinfo(CD_WARNING, "Received invalid BCA length (%d)\n", s->bca.len); + cd_dbg(CD_WARNING, "Received invalid BCA length (%d)\n", + s->bca.len); ret = -EIO; goto out; } @@ -1927,14 +1947,13 @@ static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s, s->manufact.len = buf[0] << 8 | buf[1]; if (s->manufact.len < 0) { - cdinfo(CD_WARNING, "Received invalid manufacture info length" - " (%d)\n", s->manufact.len); + cd_dbg(CD_WARNING, "Received invalid manufacture info length (%d)\n", + s->manufact.len); ret = -EIO; } else { if (s->manufact.len > 2048) { - cdinfo(CD_WARNING, "Received invalid manufacture info " - "length (%d): truncating to 2048\n", - s->manufact.len); + cd_dbg(CD_WARNING, "Received invalid manufacture info length (%d): truncating to 2048\n", + s->manufact.len); s->manufact.len = 2048; } memcpy(s->manufact.value, &buf[4], s->manufact.len); @@ -1965,8 +1984,8 @@ static int dvd_read_struct(struct cdrom_device_info *cdi, dvd_struct *s, return dvd_read_manufact(cdi, s, cgc); default: - cdinfo(CD_WARNING, ": Invalid DVD structure read requested (%d)\n", - s->type); + cd_dbg(CD_WARNING, ": Invalid DVD structure read requested (%d)\n", + s->type); return -EINVAL; } } @@ -2255,7 +2274,7 @@ static int cdrom_ioctl_multisession(struct cdrom_device_info *cdi, u8 requested_format; int ret; - cdinfo(CD_DO_IOCTL, "entering CDROMMULTISESSION\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROMMULTISESSION\n"); if (!(cdi->ops->capability & CDC_MULTI_SESSION)) return -ENOSYS; @@ -2277,13 +2296,13 @@ static int cdrom_ioctl_multisession(struct cdrom_device_info *cdi, if (copy_to_user(argp, &ms_info, sizeof(ms_info))) return -EFAULT; - cdinfo(CD_DO_IOCTL, "CDROMMULTISESSION successful\n"); + cd_dbg(CD_DO_IOCTL, "CDROMMULTISESSION successful\n"); return 0; } static int cdrom_ioctl_eject(struct cdrom_device_info *cdi) { - cdinfo(CD_DO_IOCTL, "entering CDROMEJECT\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROMEJECT\n"); if (!CDROM_CAN(CDC_OPEN_TRAY)) return -ENOSYS; 
@@ -2300,7 +2319,7 @@ static int cdrom_ioctl_eject(struct cdrom_device_info *cdi) static int cdrom_ioctl_closetray(struct cdrom_device_info *cdi) { - cdinfo(CD_DO_IOCTL, "entering CDROMCLOSETRAY\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROMCLOSETRAY\n"); if (!CDROM_CAN(CDC_CLOSE_TRAY)) return -ENOSYS; @@ -2310,7 +2329,7 @@ static int cdrom_ioctl_closetray(struct cdrom_device_info *cdi) static int cdrom_ioctl_eject_sw(struct cdrom_device_info *cdi, unsigned long arg) { - cdinfo(CD_DO_IOCTL, "entering CDROMEJECT_SW\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROMEJECT_SW\n"); if (!CDROM_CAN(CDC_OPEN_TRAY)) return -ENOSYS; @@ -2329,7 +2348,7 @@ static int cdrom_ioctl_media_changed(struct cdrom_device_info *cdi, struct cdrom_changer_info *info; int ret; - cdinfo(CD_DO_IOCTL, "entering CDROM_MEDIA_CHANGED\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_MEDIA_CHANGED\n"); if (!CDROM_CAN(CDC_MEDIA_CHANGED)) return -ENOSYS; @@ -2355,7 +2374,7 @@ static int cdrom_ioctl_media_changed(struct cdrom_device_info *cdi, static int cdrom_ioctl_set_options(struct cdrom_device_info *cdi, unsigned long arg) { - cdinfo(CD_DO_IOCTL, "entering CDROM_SET_OPTIONS\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_SET_OPTIONS\n"); /* * Options need to be in sync with capability. 
@@ -2383,7 +2402,7 @@ static int cdrom_ioctl_set_options(struct cdrom_device_info *cdi, static int cdrom_ioctl_clear_options(struct cdrom_device_info *cdi, unsigned long arg) { - cdinfo(CD_DO_IOCTL, "entering CDROM_CLEAR_OPTIONS\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_CLEAR_OPTIONS\n"); cdi->options &= ~(int) arg; return cdi->options; @@ -2392,7 +2411,7 @@ static int cdrom_ioctl_clear_options(struct cdrom_device_info *cdi, static int cdrom_ioctl_select_speed(struct cdrom_device_info *cdi, unsigned long arg) { - cdinfo(CD_DO_IOCTL, "entering CDROM_SELECT_SPEED\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_SELECT_SPEED\n"); if (!CDROM_CAN(CDC_SELECT_SPEED)) return -ENOSYS; @@ -2402,7 +2421,7 @@ static int cdrom_ioctl_select_speed(struct cdrom_device_info *cdi, static int cdrom_ioctl_select_disc(struct cdrom_device_info *cdi, unsigned long arg) { - cdinfo(CD_DO_IOCTL, "entering CDROM_SELECT_DISC\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_SELECT_DISC\n"); if (!CDROM_CAN(CDC_SELECT_DISC)) return -ENOSYS; @@ -2420,14 +2439,14 @@ static int cdrom_ioctl_select_disc(struct cdrom_device_info *cdi, if (cdi->ops->select_disc) return cdi->ops->select_disc(cdi, arg); - cdinfo(CD_CHANGER, "Using generic cdrom_select_disc()\n"); + cd_dbg(CD_CHANGER, "Using generic cdrom_select_disc()\n"); return cdrom_select_disc(cdi, arg); } static int cdrom_ioctl_reset(struct cdrom_device_info *cdi, struct block_device *bdev) { - cdinfo(CD_DO_IOCTL, "entering CDROM_RESET\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_RESET\n"); if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -2440,7 +2459,7 @@ static int cdrom_ioctl_reset(struct cdrom_device_info *cdi, static int cdrom_ioctl_lock_door(struct cdrom_device_info *cdi, unsigned long arg) { - cdinfo(CD_DO_IOCTL, "%socking door.\n", arg ? "L" : "Unl"); + cd_dbg(CD_DO_IOCTL, "%socking door\n", arg ? 
"L" : "Unl"); if (!CDROM_CAN(CDC_LOCK)) return -EDRIVE_CANT_DO_THIS; @@ -2459,7 +2478,7 @@ static int cdrom_ioctl_lock_door(struct cdrom_device_info *cdi, static int cdrom_ioctl_debug(struct cdrom_device_info *cdi, unsigned long arg) { - cdinfo(CD_DO_IOCTL, "%sabling debug.\n", arg ? "En" : "Dis"); + cd_dbg(CD_DO_IOCTL, "%sabling debug\n", arg ? "En" : "Dis"); if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -2469,7 +2488,7 @@ static int cdrom_ioctl_debug(struct cdrom_device_info *cdi, static int cdrom_ioctl_get_capability(struct cdrom_device_info *cdi) { - cdinfo(CD_DO_IOCTL, "entering CDROM_GET_CAPABILITY\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_GET_CAPABILITY\n"); return (cdi->ops->capability & ~cdi->mask); } @@ -2485,7 +2504,7 @@ static int cdrom_ioctl_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn mcn; int ret; - cdinfo(CD_DO_IOCTL, "entering CDROM_GET_MCN\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_GET_MCN\n"); if (!(cdi->ops->capability & CDC_MCN)) return -ENOSYS; @@ -2495,14 +2514,14 @@ static int cdrom_ioctl_get_mcn(struct cdrom_device_info *cdi, if (copy_to_user(argp, &mcn, sizeof(mcn))) return -EFAULT; - cdinfo(CD_DO_IOCTL, "CDROM_GET_MCN successful\n"); + cd_dbg(CD_DO_IOCTL, "CDROM_GET_MCN successful\n"); return 0; } static int cdrom_ioctl_drive_status(struct cdrom_device_info *cdi, unsigned long arg) { - cdinfo(CD_DO_IOCTL, "entering CDROM_DRIVE_STATUS\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_DRIVE_STATUS\n"); if (!(cdi->ops->capability & CDC_DRIVE_STATUS)) return -ENOSYS; @@ -2535,7 +2554,7 @@ static int cdrom_ioctl_disc_status(struct cdrom_device_info *cdi) { tracktype tracks; - cdinfo(CD_DO_IOCTL, "entering CDROM_DISC_STATUS\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_DISC_STATUS\n"); cdrom_count_tracks(cdi, &tracks); if (tracks.error) @@ -2557,13 +2576,13 @@ static int cdrom_ioctl_disc_status(struct cdrom_device_info *cdi) return CDS_DATA_1; /* Policy mode off */ - cdinfo(CD_WARNING,"This disc doesn't have any tracks I recognize!\n"); + 
cd_dbg(CD_WARNING, "This disc doesn't have any tracks I recognize!\n"); return CDS_NO_INFO; } static int cdrom_ioctl_changer_nslots(struct cdrom_device_info *cdi) { - cdinfo(CD_DO_IOCTL, "entering CDROM_CHANGER_NSLOTS\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_CHANGER_NSLOTS\n"); return cdi->capacity; } @@ -2574,7 +2593,7 @@ static int cdrom_ioctl_get_subchnl(struct cdrom_device_info *cdi, u8 requested, back; int ret; - /* cdinfo(CD_DO_IOCTL,"entering CDROMSUBCHNL\n");*/ + /* cd_dbg(CD_DO_IOCTL,"entering CDROMSUBCHNL\n");*/ if (copy_from_user(&q, argp, sizeof(q))) return -EFAULT; @@ -2594,7 +2613,7 @@ static int cdrom_ioctl_get_subchnl(struct cdrom_device_info *cdi, if (copy_to_user(argp, &q, sizeof(q))) return -EFAULT; - /* cdinfo(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */ + /* cd_dbg(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */ return 0; } @@ -2604,7 +2623,7 @@ static int cdrom_ioctl_read_tochdr(struct cdrom_device_info *cdi, struct cdrom_tochdr header; int ret; - /* cdinfo(CD_DO_IOCTL, "entering CDROMREADTOCHDR\n"); */ + /* cd_dbg(CD_DO_IOCTL, "entering CDROMREADTOCHDR\n"); */ if (copy_from_user(&header, argp, sizeof(header))) return -EFAULT; @@ -2615,7 +2634,7 @@ static int cdrom_ioctl_read_tochdr(struct cdrom_device_info *cdi, if (copy_to_user(argp, &header, sizeof(header))) return -EFAULT; - /* cdinfo(CD_DO_IOCTL, "CDROMREADTOCHDR successful\n"); */ + /* cd_dbg(CD_DO_IOCTL, "CDROMREADTOCHDR successful\n"); */ return 0; } @@ -2626,7 +2645,7 @@ static int cdrom_ioctl_read_tocentry(struct cdrom_device_info *cdi, u8 requested_format; int ret; - /* cdinfo(CD_DO_IOCTL, "entering CDROMREADTOCENTRY\n"); */ + /* cd_dbg(CD_DO_IOCTL, "entering CDROMREADTOCENTRY\n"); */ if (copy_from_user(&entry, argp, sizeof(entry))) return -EFAULT; @@ -2643,7 +2662,7 @@ static int cdrom_ioctl_read_tocentry(struct cdrom_device_info *cdi, if (copy_to_user(argp, &entry, sizeof(entry))) return -EFAULT; - /* cdinfo(CD_DO_IOCTL, "CDROMREADTOCENTRY successful\n"); */ + /* 
cd_dbg(CD_DO_IOCTL, "CDROMREADTOCENTRY successful\n"); */ return 0; } @@ -2652,7 +2671,7 @@ static int cdrom_ioctl_play_msf(struct cdrom_device_info *cdi, { struct cdrom_msf msf; - cdinfo(CD_DO_IOCTL, "entering CDROMPLAYMSF\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n"); if (!CDROM_CAN(CDC_PLAY_AUDIO)) return -ENOSYS; @@ -2667,7 +2686,7 @@ static int cdrom_ioctl_play_trkind(struct cdrom_device_info *cdi, struct cdrom_ti ti; int ret; - cdinfo(CD_DO_IOCTL, "entering CDROMPLAYTRKIND\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYTRKIND\n"); if (!CDROM_CAN(CDC_PLAY_AUDIO)) return -ENOSYS; @@ -2684,7 +2703,7 @@ static int cdrom_ioctl_volctrl(struct cdrom_device_info *cdi, { struct cdrom_volctrl volume; - cdinfo(CD_DO_IOCTL, "entering CDROMVOLCTRL\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROMVOLCTRL\n"); if (!CDROM_CAN(CDC_PLAY_AUDIO)) return -ENOSYS; @@ -2699,7 +2718,7 @@ static int cdrom_ioctl_volread(struct cdrom_device_info *cdi, struct cdrom_volctrl volume; int ret; - cdinfo(CD_DO_IOCTL, "entering CDROMVOLREAD\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROMVOLREAD\n"); if (!CDROM_CAN(CDC_PLAY_AUDIO)) return -ENOSYS; @@ -2718,7 +2737,7 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi, { int ret; - cdinfo(CD_DO_IOCTL, "doing audio ioctl (start/stop/pause/resume)\n"); + cd_dbg(CD_DO_IOCTL, "doing audio ioctl (start/stop/pause/resume)\n"); if (!CDROM_CAN(CDC_PLAY_AUDIO)) return -ENOSYS; @@ -2729,103 +2748,6 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi, } /* - * Just about every imaginable ioctl is supported in the Uniform layer - * these days. - * ATAPI / SCSI specific code now mainly resides in mmc_ioctl(). - */ -int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, - fmode_t mode, unsigned int cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int ret; - - /* - * Try the generic SCSI command ioctl's first. 
- */ - ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, argp); - if (ret != -ENOTTY) - return ret; - - switch (cmd) { - case CDROMMULTISESSION: - return cdrom_ioctl_multisession(cdi, argp); - case CDROMEJECT: - return cdrom_ioctl_eject(cdi); - case CDROMCLOSETRAY: - return cdrom_ioctl_closetray(cdi); - case CDROMEJECT_SW: - return cdrom_ioctl_eject_sw(cdi, arg); - case CDROM_MEDIA_CHANGED: - return cdrom_ioctl_media_changed(cdi, arg); - case CDROM_SET_OPTIONS: - return cdrom_ioctl_set_options(cdi, arg); - case CDROM_CLEAR_OPTIONS: - return cdrom_ioctl_clear_options(cdi, arg); - case CDROM_SELECT_SPEED: - return cdrom_ioctl_select_speed(cdi, arg); - case CDROM_SELECT_DISC: - return cdrom_ioctl_select_disc(cdi, arg); - case CDROMRESET: - return cdrom_ioctl_reset(cdi, bdev); - case CDROM_LOCKDOOR: - return cdrom_ioctl_lock_door(cdi, arg); - case CDROM_DEBUG: - return cdrom_ioctl_debug(cdi, arg); - case CDROM_GET_CAPABILITY: - return cdrom_ioctl_get_capability(cdi); - case CDROM_GET_MCN: - return cdrom_ioctl_get_mcn(cdi, argp); - case CDROM_DRIVE_STATUS: - return cdrom_ioctl_drive_status(cdi, arg); - case CDROM_DISC_STATUS: - return cdrom_ioctl_disc_status(cdi); - case CDROM_CHANGER_NSLOTS: - return cdrom_ioctl_changer_nslots(cdi); - } - - /* - * Use the ioctls that are implemented through the generic_packet() - * interface. this may look at bit funny, but if -ENOTTY is - * returned that particular ioctl is not implemented and we - * let it go through the device specific ones. - */ - if (CDROM_CAN(CDC_GENERIC_PACKET)) { - ret = mmc_ioctl(cdi, cmd, arg); - if (ret != -ENOTTY) - return ret; - } - - /* - * Note: most of the cdinfo() calls are commented out here, - * because they fill up the sys log when CD players poll - * the drive. 
- */ - switch (cmd) { - case CDROMSUBCHNL: - return cdrom_ioctl_get_subchnl(cdi, argp); - case CDROMREADTOCHDR: - return cdrom_ioctl_read_tochdr(cdi, argp); - case CDROMREADTOCENTRY: - return cdrom_ioctl_read_tocentry(cdi, argp); - case CDROMPLAYMSF: - return cdrom_ioctl_play_msf(cdi, argp); - case CDROMPLAYTRKIND: - return cdrom_ioctl_play_trkind(cdi, argp); - case CDROMVOLCTRL: - return cdrom_ioctl_volctrl(cdi, argp); - case CDROMVOLREAD: - return cdrom_ioctl_volread(cdi, argp); - case CDROMSTART: - case CDROMSTOP: - case CDROMPAUSE: - case CDROMRESUME: - return cdrom_ioctl_audioctl(cdi, cmd); - } - - return -ENOSYS; -} - -/* * Required when we need to use READ_10 to issue other than 2048 block * reads */ @@ -2854,10 +2776,158 @@ static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size) return cdo->generic_packet(cdi, &cgc); } +static int cdrom_get_track_info(struct cdrom_device_info *cdi, + __u16 track, __u8 type, track_information *ti) +{ + struct cdrom_device_ops *cdo = cdi->ops; + struct packet_command cgc; + int ret, buflen; + + init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ); + cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO; + cgc.cmd[1] = type & 3; + cgc.cmd[4] = (track & 0xff00) >> 8; + cgc.cmd[5] = track & 0xff; + cgc.cmd[8] = 8; + cgc.quiet = 1; + + ret = cdo->generic_packet(cdi, &cgc); + if (ret) + return ret; + + buflen = be16_to_cpu(ti->track_information_length) + + sizeof(ti->track_information_length); + + if (buflen > sizeof(track_information)) + buflen = sizeof(track_information); + + cgc.cmd[8] = cgc.buflen = buflen; + ret = cdo->generic_packet(cdi, &cgc); + if (ret) + return ret; + + /* return actual fill size */ + return buflen; +} + +/* return the last written block on the CD-R media. this is for the udf + file system. 
*/ +int cdrom_get_last_written(struct cdrom_device_info *cdi, long *last_written) +{ + struct cdrom_tocentry toc; + disc_information di; + track_information ti; + __u32 last_track; + int ret = -1, ti_size; + + if (!CDROM_CAN(CDC_GENERIC_PACKET)) + goto use_toc; + + ret = cdrom_get_disc_info(cdi, &di); + if (ret < (int)(offsetof(typeof(di), last_track_lsb) + + sizeof(di.last_track_lsb))) + goto use_toc; + + /* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */ + last_track = (di.last_track_msb << 8) | di.last_track_lsb; + ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti); + if (ti_size < (int)offsetof(typeof(ti), track_start)) + goto use_toc; + + /* if this track is blank, try the previous. */ + if (ti.blank) { + if (last_track == 1) + goto use_toc; + last_track--; + ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti); + } + + if (ti_size < (int)(offsetof(typeof(ti), track_size) + + sizeof(ti.track_size))) + goto use_toc; + + /* if last recorded field is valid, return it. */ + if (ti.lra_v && ti_size >= (int)(offsetof(typeof(ti), last_rec_address) + + sizeof(ti.last_rec_address))) { + *last_written = be32_to_cpu(ti.last_rec_address); + } else { + /* make it up instead */ + *last_written = be32_to_cpu(ti.track_start) + + be32_to_cpu(ti.track_size); + if (ti.free_blocks) + *last_written -= (be32_to_cpu(ti.free_blocks) + 7); + } + return 0; + + /* this is where we end up if the drive either can't do a + GPCMD_READ_DISC_INFO or GPCMD_READ_TRACK_RZONE_INFO or if + it doesn't give enough information or fails. then we return + the toc contents. */ +use_toc: + toc.cdte_format = CDROM_MSF; + toc.cdte_track = CDROM_LEADOUT; + if ((ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &toc))) + return ret; + sanitize_format(&toc.cdte_addr, &toc.cdte_format, CDROM_LBA); + *last_written = toc.cdte_addr.lba; + return 0; +} + +/* return the next writable block. also for udf file system. 
*/ +static int cdrom_get_next_writable(struct cdrom_device_info *cdi, + long *next_writable) +{ + disc_information di; + track_information ti; + __u16 last_track; + int ret, ti_size; + + if (!CDROM_CAN(CDC_GENERIC_PACKET)) + goto use_last_written; + + ret = cdrom_get_disc_info(cdi, &di); + if (ret < 0 || ret < offsetof(typeof(di), last_track_lsb) + + sizeof(di.last_track_lsb)) + goto use_last_written; + + /* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */ + last_track = (di.last_track_msb << 8) | di.last_track_lsb; + ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti); + if (ti_size < 0 || ti_size < offsetof(typeof(ti), track_start)) + goto use_last_written; + + /* if this track is blank, try the previous. */ + if (ti.blank) { + if (last_track == 1) + goto use_last_written; + last_track--; + ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti); + if (ti_size < 0) + goto use_last_written; + } + + /* if next recordable address field is valid, use it. */ + if (ti.nwa_v && ti_size >= offsetof(typeof(ti), next_writable) + + sizeof(ti.next_writable)) { + *next_writable = be32_to_cpu(ti.next_writable); + return 0; + } + +use_last_written: + ret = cdrom_get_last_written(cdi, next_writable); + if (ret) { + *next_writable = 0; + return ret; + } else { + *next_writable += 7; + return 0; + } +} + static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, - void __user *arg, - struct packet_command *cgc, - int cmd) + void __user *arg, + struct packet_command *cgc, + int cmd) { struct request_sense sense; struct cdrom_msf msf; @@ -2876,7 +2946,8 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, blocksize = CD_FRAMESIZE_RAW0; break; } - IOCTL_IN(arg, struct cdrom_msf, msf); + if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf))) + return -EFAULT; lba = msf_to_lba(msf.cdmsf_min0, msf.cdmsf_sec0, msf.cdmsf_frame0); /* FIXME: we need upper bound checking, too!! 
*/ if (lba < 0) @@ -2891,8 +2962,8 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, cgc->data_direction = CGC_DATA_READ; ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize); if (ret && sense.sense_key == 0x05 && - sense.asc == 0x20 && - sense.ascq == 0x00) { + sense.asc == 0x20 && + sense.ascq == 0x00) { /* * SCSI-II devices are not required to support * READ_CD, so let's try switching block size @@ -2913,12 +2984,14 @@ out: } static noinline int mmc_ioctl_cdrom_read_audio(struct cdrom_device_info *cdi, - void __user *arg) + void __user *arg) { struct cdrom_read_audio ra; int lba; - IOCTL_IN(arg, struct cdrom_read_audio, ra); + if (copy_from_user(&ra, (struct cdrom_read_audio __user *)arg, + sizeof(ra))) + return -EFAULT; if (ra.addr_format == CDROM_MSF) lba = msf_to_lba(ra.addr.msf.minute, @@ -2937,12 +3010,13 @@ static noinline int mmc_ioctl_cdrom_read_audio(struct cdrom_device_info *cdi, } static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi, - void __user *arg) + void __user *arg) { int ret; struct cdrom_subchnl q; u_char requested, back; - IOCTL_IN(arg, struct cdrom_subchnl, q); + if (copy_from_user(&q, (struct cdrom_subchnl __user *)arg, sizeof(q))) + return -EFAULT; requested = q.cdsc_format; if (!((requested == CDROM_MSF) || (requested == CDROM_LBA))) @@ -2954,19 +3028,21 @@ static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi, back = q.cdsc_format; /* local copy */ sanitize_format(&q.cdsc_absaddr, &back, requested); sanitize_format(&q.cdsc_reladdr, &q.cdsc_format, requested); - IOCTL_OUT(arg, struct cdrom_subchnl, q); - /* cdinfo(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */ + if (copy_to_user((struct cdrom_subchnl __user *)arg, &q, sizeof(q))) + return -EFAULT; + /* cd_dbg(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */ return 0; } static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi, - void __user *arg, - struct packet_command *cgc) + void __user 
*arg, + struct packet_command *cgc) { struct cdrom_device_ops *cdo = cdi->ops; struct cdrom_msf msf; - cdinfo(CD_DO_IOCTL, "entering CDROMPLAYMSF\n"); - IOCTL_IN(arg, struct cdrom_msf, msf); + cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n"); + if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf))) + return -EFAULT; cgc->cmd[0] = GPCMD_PLAY_AUDIO_MSF; cgc->cmd[3] = msf.cdmsf_min0; cgc->cmd[4] = msf.cdmsf_sec0; @@ -2979,13 +3055,14 @@ static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi, } static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi, - void __user *arg, - struct packet_command *cgc) + void __user *arg, + struct packet_command *cgc) { struct cdrom_device_ops *cdo = cdi->ops; struct cdrom_blk blk; - cdinfo(CD_DO_IOCTL, "entering CDROMPLAYBLK\n"); - IOCTL_IN(arg, struct cdrom_blk, blk); + cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYBLK\n"); + if (copy_from_user(&blk, (struct cdrom_blk __user *)arg, sizeof(blk))) + return -EFAULT; cgc->cmd[0] = GPCMD_PLAY_AUDIO_10; cgc->cmd[2] = (blk.from >> 24) & 0xff; cgc->cmd[3] = (blk.from >> 16) & 0xff; @@ -2998,9 +3075,9 @@ static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi, } static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi, - void __user *arg, - struct packet_command *cgc, - unsigned int cmd) + void __user *arg, + struct packet_command *cgc, + unsigned int cmd) { struct cdrom_volctrl volctrl; unsigned char buffer[32]; @@ -3008,9 +3085,11 @@ static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi, unsigned short offset; int ret; - cdinfo(CD_DO_IOCTL, "entering CDROMVOLUME\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROMVOLUME\n"); - IOCTL_IN(arg, struct cdrom_volctrl, volctrl); + if (copy_from_user(&volctrl, (struct cdrom_volctrl __user *)arg, + sizeof(volctrl))) + return -EFAULT; cgc->buffer = buffer; cgc->buflen = 24; @@ -3030,14 +3109,14 @@ static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info 
*cdi, if (offset + 16 > cgc->buflen) { cgc->buflen = offset + 16; ret = cdrom_mode_sense(cdi, cgc, - GPMODE_AUDIO_CTL_PAGE, 0); + GPMODE_AUDIO_CTL_PAGE, 0); if (ret) return ret; } /* sanity check */ if ((buffer[offset] & 0x3f) != GPMODE_AUDIO_CTL_PAGE || - buffer[offset + 1] < 14) + buffer[offset + 1] < 14) return -EINVAL; /* now we have the current volume settings. if it was only @@ -3047,7 +3126,9 @@ static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi, volctrl.channel1 = buffer[offset+11]; volctrl.channel2 = buffer[offset+13]; volctrl.channel3 = buffer[offset+15]; - IOCTL_OUT(arg, struct cdrom_volctrl, volctrl); + if (copy_to_user((struct cdrom_volctrl __user *)arg, &volctrl, + sizeof(volctrl))) + return -EFAULT; return 0; } @@ -3069,11 +3150,11 @@ static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi, } static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi, - struct packet_command *cgc, - int cmd) + struct packet_command *cgc, + int cmd) { struct cdrom_device_ops *cdo = cdi->ops; - cdinfo(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n"); cgc->cmd[0] = GPCMD_START_STOP_UNIT; cgc->cmd[1] = 1; cgc->cmd[4] = (cmd == CDROMSTART) ? 1 : 0; @@ -3082,11 +3163,11 @@ static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi, } static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi, - struct packet_command *cgc, - int cmd) + struct packet_command *cgc, + int cmd) { struct cdrom_device_ops *cdo = cdi->ops; - cdinfo(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n"); cgc->cmd[0] = GPCMD_PAUSE_RESUME; cgc->cmd[8] = (cmd == CDROMRESUME) ? 
1 : 0; cgc->data_direction = CGC_DATA_NONE; @@ -3094,8 +3175,8 @@ static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi, } static noinline int mmc_ioctl_dvd_read_struct(struct cdrom_device_info *cdi, - void __user *arg, - struct packet_command *cgc) + void __user *arg, + struct packet_command *cgc) { int ret; dvd_struct *s; @@ -3108,7 +3189,7 @@ static noinline int mmc_ioctl_dvd_read_struct(struct cdrom_device_info *cdi, if (!s) return -ENOMEM; - cdinfo(CD_DO_IOCTL, "entering DVD_READ_STRUCT\n"); + cd_dbg(CD_DO_IOCTL, "entering DVD_READ_STRUCT\n"); if (copy_from_user(s, arg, size)) { kfree(s); return -EFAULT; @@ -3126,44 +3207,48 @@ out: } static noinline int mmc_ioctl_dvd_auth(struct cdrom_device_info *cdi, - void __user *arg) + void __user *arg) { int ret; dvd_authinfo ai; if (!CDROM_CAN(CDC_DVD)) return -ENOSYS; - cdinfo(CD_DO_IOCTL, "entering DVD_AUTH\n"); - IOCTL_IN(arg, dvd_authinfo, ai); + cd_dbg(CD_DO_IOCTL, "entering DVD_AUTH\n"); + if (copy_from_user(&ai, (dvd_authinfo __user *)arg, sizeof(ai))) + return -EFAULT; ret = dvd_do_auth(cdi, &ai); if (ret) return ret; - IOCTL_OUT(arg, dvd_authinfo, ai); + if (copy_to_user((dvd_authinfo __user *)arg, &ai, sizeof(ai))) + return -EFAULT; return 0; } static noinline int mmc_ioctl_cdrom_next_writable(struct cdrom_device_info *cdi, - void __user *arg) + void __user *arg) { int ret; long next = 0; - cdinfo(CD_DO_IOCTL, "entering CDROM_NEXT_WRITABLE\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_NEXT_WRITABLE\n"); ret = cdrom_get_next_writable(cdi, &next); if (ret) return ret; - IOCTL_OUT(arg, long, next); + if (copy_to_user((long __user *)arg, &next, sizeof(next))) + return -EFAULT; return 0; } static noinline int mmc_ioctl_cdrom_last_written(struct cdrom_device_info *cdi, - void __user *arg) + void __user *arg) { int ret; long last = 0; - cdinfo(CD_DO_IOCTL, "entering CDROM_LAST_WRITTEN\n"); + cd_dbg(CD_DO_IOCTL, "entering CDROM_LAST_WRITTEN\n"); ret = cdrom_get_last_written(cdi, &last); if (ret) 
return ret; - IOCTL_OUT(arg, long, last); + if (copy_to_user((long __user *)arg, &last, sizeof(last))) + return -EFAULT; return 0; } @@ -3212,181 +3297,101 @@ static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd, return -ENOTTY; } -static int cdrom_get_track_info(struct cdrom_device_info *cdi, __u16 track, __u8 type, - track_information *ti) -{ - struct cdrom_device_ops *cdo = cdi->ops; - struct packet_command cgc; - int ret, buflen; - - init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO; - cgc.cmd[1] = type & 3; - cgc.cmd[4] = (track & 0xff00) >> 8; - cgc.cmd[5] = track & 0xff; - cgc.cmd[8] = 8; - cgc.quiet = 1; - - if ((ret = cdo->generic_packet(cdi, &cgc))) - return ret; - - buflen = be16_to_cpu(ti->track_information_length) + - sizeof(ti->track_information_length); - - if (buflen > sizeof(track_information)) - buflen = sizeof(track_information); - - cgc.cmd[8] = cgc.buflen = buflen; - if ((ret = cdo->generic_packet(cdi, &cgc))) - return ret; - - /* return actual fill size */ - return buflen; -} - -/* requires CD R/RW */ -static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *di) +/* + * Just about every imaginable ioctl is supported in the Uniform layer + * these days. + * ATAPI / SCSI specific code now mainly resides in mmc_ioctl(). 
+ */ +int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, + fmode_t mode, unsigned int cmd, unsigned long arg) { - struct cdrom_device_ops *cdo = cdi->ops; - struct packet_command cgc; - int ret, buflen; - - /* set up command and get the disc info */ - init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_DISC_INFO; - cgc.cmd[8] = cgc.buflen = 2; - cgc.quiet = 1; - - if ((ret = cdo->generic_packet(cdi, &cgc))) - return ret; + void __user *argp = (void __user *)arg; + int ret; - /* not all drives have the same disc_info length, so requeue - * packet with the length the drive tells us it can supply + /* + * Try the generic SCSI command ioctl's first. */ - buflen = be16_to_cpu(di->disc_information_length) + - sizeof(di->disc_information_length); - - if (buflen > sizeof(disc_information)) - buflen = sizeof(disc_information); - - cgc.cmd[8] = cgc.buflen = buflen; - if ((ret = cdo->generic_packet(cdi, &cgc))) + ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, argp); + if (ret != -ENOTTY) return ret; - /* return actual fill size */ - return buflen; -} - -/* return the last written block on the CD-R media. this is for the udf - file system. */ -int cdrom_get_last_written(struct cdrom_device_info *cdi, long *last_written) -{ - struct cdrom_tocentry toc; - disc_information di; - track_information ti; - __u32 last_track; - int ret = -1, ti_size; - - if (!CDROM_CAN(CDC_GENERIC_PACKET)) - goto use_toc; - - ret = cdrom_get_disc_info(cdi, &di); - if (ret < (int)(offsetof(typeof(di), last_track_lsb) - + sizeof(di.last_track_lsb))) - goto use_toc; - - /* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */ - last_track = (di.last_track_msb << 8) | di.last_track_lsb; - ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti); - if (ti_size < (int)offsetof(typeof(ti), track_start)) - goto use_toc; - - /* if this track is blank, try the previous. 
*/ - if (ti.blank) { - if (last_track==1) - goto use_toc; - last_track--; - ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti); - } - - if (ti_size < (int)(offsetof(typeof(ti), track_size) - + sizeof(ti.track_size))) - goto use_toc; - - /* if last recorded field is valid, return it. */ - if (ti.lra_v && ti_size >= (int)(offsetof(typeof(ti), last_rec_address) - + sizeof(ti.last_rec_address))) { - *last_written = be32_to_cpu(ti.last_rec_address); - } else { - /* make it up instead */ - *last_written = be32_to_cpu(ti.track_start) + - be32_to_cpu(ti.track_size); - if (ti.free_blocks) - *last_written -= (be32_to_cpu(ti.free_blocks) + 7); + switch (cmd) { + case CDROMMULTISESSION: + return cdrom_ioctl_multisession(cdi, argp); + case CDROMEJECT: + return cdrom_ioctl_eject(cdi); + case CDROMCLOSETRAY: + return cdrom_ioctl_closetray(cdi); + case CDROMEJECT_SW: + return cdrom_ioctl_eject_sw(cdi, arg); + case CDROM_MEDIA_CHANGED: + return cdrom_ioctl_media_changed(cdi, arg); + case CDROM_SET_OPTIONS: + return cdrom_ioctl_set_options(cdi, arg); + case CDROM_CLEAR_OPTIONS: + return cdrom_ioctl_clear_options(cdi, arg); + case CDROM_SELECT_SPEED: + return cdrom_ioctl_select_speed(cdi, arg); + case CDROM_SELECT_DISC: + return cdrom_ioctl_select_disc(cdi, arg); + case CDROMRESET: + return cdrom_ioctl_reset(cdi, bdev); + case CDROM_LOCKDOOR: + return cdrom_ioctl_lock_door(cdi, arg); + case CDROM_DEBUG: + return cdrom_ioctl_debug(cdi, arg); + case CDROM_GET_CAPABILITY: + return cdrom_ioctl_get_capability(cdi); + case CDROM_GET_MCN: + return cdrom_ioctl_get_mcn(cdi, argp); + case CDROM_DRIVE_STATUS: + return cdrom_ioctl_drive_status(cdi, arg); + case CDROM_DISC_STATUS: + return cdrom_ioctl_disc_status(cdi); + case CDROM_CHANGER_NSLOTS: + return cdrom_ioctl_changer_nslots(cdi); } - return 0; - /* this is where we end up if the drive either can't do a - GPCMD_READ_DISC_INFO or GPCMD_READ_TRACK_RZONE_INFO or if - it doesn't give enough information or fails. 
then we return - the toc contents. */ -use_toc: - toc.cdte_format = CDROM_MSF; - toc.cdte_track = CDROM_LEADOUT; - if ((ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &toc))) - return ret; - sanitize_format(&toc.cdte_addr, &toc.cdte_format, CDROM_LBA); - *last_written = toc.cdte_addr.lba; - return 0; -} - -/* return the next writable block. also for udf file system. */ -static int cdrom_get_next_writable(struct cdrom_device_info *cdi, long *next_writable) -{ - disc_information di; - track_information ti; - __u16 last_track; - int ret, ti_size; - - if (!CDROM_CAN(CDC_GENERIC_PACKET)) - goto use_last_written; - - ret = cdrom_get_disc_info(cdi, &di); - if (ret < 0 || ret < offsetof(typeof(di), last_track_lsb) - + sizeof(di.last_track_lsb)) - goto use_last_written; - - /* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */ - last_track = (di.last_track_msb << 8) | di.last_track_lsb; - ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti); - if (ti_size < 0 || ti_size < offsetof(typeof(ti), track_start)) - goto use_last_written; - - /* if this track is blank, try the previous. */ - if (ti.blank) { - if (last_track == 1) - goto use_last_written; - last_track--; - ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti); - if (ti_size < 0) - goto use_last_written; + /* + * Use the ioctls that are implemented through the generic_packet() + * interface. this may look at bit funny, but if -ENOTTY is + * returned that particular ioctl is not implemented and we + * let it go through the device specific ones. + */ + if (CDROM_CAN(CDC_GENERIC_PACKET)) { + ret = mmc_ioctl(cdi, cmd, arg); + if (ret != -ENOTTY) + return ret; } - /* if next recordable address field is valid, use it. 
*/ - if (ti.nwa_v && ti_size >= offsetof(typeof(ti), next_writable) - + sizeof(ti.next_writable)) { - *next_writable = be32_to_cpu(ti.next_writable); - return 0; + /* + * Note: most of the cd_dbg() calls are commented out here, + * because they fill up the sys log when CD players poll + * the drive. + */ + switch (cmd) { + case CDROMSUBCHNL: + return cdrom_ioctl_get_subchnl(cdi, argp); + case CDROMREADTOCHDR: + return cdrom_ioctl_read_tochdr(cdi, argp); + case CDROMREADTOCENTRY: + return cdrom_ioctl_read_tocentry(cdi, argp); + case CDROMPLAYMSF: + return cdrom_ioctl_play_msf(cdi, argp); + case CDROMPLAYTRKIND: + return cdrom_ioctl_play_trkind(cdi, argp); + case CDROMVOLCTRL: + return cdrom_ioctl_volctrl(cdi, argp); + case CDROMVOLREAD: + return cdrom_ioctl_volread(cdi, argp); + case CDROMSTART: + case CDROMSTOP: + case CDROMPAUSE: + case CDROMRESUME: + return cdrom_ioctl_audioctl(cdi, cmd); } -use_last_written: - if ((ret = cdrom_get_last_written(cdi, next_writable))) { - *next_writable = 0; - return ret; - } else { - *next_writable += 7; - return 0; - } + return -ENOSYS; } EXPORT_SYMBOL(cdrom_get_last_written); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 51e75ad96422..584bc3126403 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -602,7 +602,7 @@ static void gdrom_readdisk_dma(struct work_struct *work) spin_unlock(&gdrom_lock); block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET; block_cnt = blk_rq_sectors(req)/GD_TO_BLK; - __raw_writel(virt_to_phys(req->buffer), GDROM_DMA_STARTADDR_REG); + __raw_writel(virt_to_phys(bio_data(req->bio)), GDROM_DMA_STARTADDR_REG); __raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG); __raw_writel(1, GDROM_DMA_DIRECTION_REG); __raw_writel(1, GDROM_DMA_ENABLE_REG); diff --git a/drivers/char/random.c b/drivers/char/random.c index 6b75713d953a..0a19d866a153 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -902,6 +902,7 @@ void add_disk_randomness(struct gendisk *disk) 
add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool)); } +EXPORT_SYMBOL_GPL(add_disk_randomness); #endif /********************************************************************* diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 16f69be820c7..ee880382e3bc 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -188,10 +188,9 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq, ledtrig_ide_activity(); - pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n", + pr_debug("%s: %sing: block=%llu, sectors=%u\n", drive->name, rq_data_dir(rq) == READ ? "read" : "writ", - (unsigned long long)block, blk_rq_sectors(rq), - (unsigned long)rq->buffer); + (unsigned long long)block, blk_rq_sectors(rq)); if (hwif->rw_disk) hwif->rw_disk(drive, rq); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 455e64916498..6a71bc7c9133 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1544,7 +1544,6 @@ static int setup_clone(struct request *clone, struct request *rq, clone->cmd = rq->cmd; clone->cmd_len = rq->cmd_len; clone->sense = rq->sense; - clone->buffer = rq->buffer; clone->end_io = end_clone_request; clone->end_io_data = tio; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 0b2ccb68c0d0..4dbfaee9aa95 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -82,8 +82,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, block = blk_rq_pos(req) << 9 >> tr->blkshift; nsect = blk_rq_cur_bytes(req) >> tr->blkshift; - - buf = req->buffer; + buf = bio_data(req->bio); if (req->cmd_type != REQ_TYPE_FS) return -EIO; diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 7ff473c871a9..ee774ba3728d 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -253,7 +253,7 @@ static int do_ubiblock_request(struct ubiblock *dev, struct request *req) * flash access anyway. 
*/ mutex_lock(&dev->dev_mutex); - ret = ubiblock_read(dev, req->buffer, sec, len); + ret = ubiblock_read(dev, bio_data(req->bio), sec, len); mutex_unlock(&dev->dev_mutex); return ret; diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index 4ccb5d869389..a40ee1e37486 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -207,7 +207,7 @@ static void jsfd_do_request(struct request_queue *q) goto end; } - jsfd_read(req->buffer, jdp->dbase + offset, len); + jsfd_read(bio_data(req->bio), jdp->dbase + offset, len); err = 0; end: if (!__blk_end_request_cur(req, err)) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 65a123d9c676..3cc82d3dec78 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -139,7 +139,7 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy) */ spin_lock_irqsave(q->queue_lock, flags); blk_requeue_request(q, cmd->request); - kblockd_schedule_work(q, &device->requeue_work); + kblockd_schedule_work(&device->requeue_work); spin_unlock_irqrestore(q->queue_lock, flags); } @@ -1018,8 +1018,6 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, return BLKPREP_DEFER; } - req->buffer = NULL; - /* * Next, walk the list, and fill in the addresses and sizes of * each segment. 
@@ -1156,7 +1154,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req) BUG_ON(blk_rq_bytes(req)); memset(&cmd->sdb, 0, sizeof(cmd->sdb)); - req->buffer = NULL; } cmd->cmd_len = req->cmd_len; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index efcbcd182863..96af195224f2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -737,16 +737,14 @@ static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq) goto out; } + rq->completion_data = page; blk_add_request_payload(rq, page, len); ret = scsi_setup_blk_pc_cmnd(sdp, rq); - rq->buffer = page_address(page); rq->__data_len = nr_bytes; out: - if (ret != BLKPREP_OK) { + if (ret != BLKPREP_OK) __free_page(page); - rq->buffer = NULL; - } return ret; } @@ -842,10 +840,9 @@ static void sd_unprep_fn(struct request_queue *q, struct request *rq) { struct scsi_cmnd *SCpnt = rq->special; - if (rq->cmd_flags & REQ_DISCARD) { - free_page((unsigned long)rq->buffer); - rq->buffer = NULL; - } + if (rq->cmd_flags & REQ_DISCARD) + __free_page(rq->completion_data); + if (SCpnt->cmnd != rq->cmd) { mempool_free(SCpnt->cmnd, sd_cdb_pool); SCpnt->cmnd = NULL; diff --git a/fs/Makefile b/fs/Makefile index f9cb9876e466..4030cbfbc9af 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o +obj-y += buffer.o block_dev.o direct-io.o mpage.o else obj-y += no-block.o endif obj-$(CONFIG_PROC_FS) += proc_namespace.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_ANON_INODES) += anon_inodes.o diff --git a/include/linux/bio.h b/include/linux/bio.h index bba550826921..5a645769f020 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -333,7 +333,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors, extern struct bio_set 
*bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); -extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); +extern mempool_t *biovec_create_pool(int pool_entries); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0120451545d8..91dfb75ce39f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -8,7 +8,13 @@ struct blk_mq_tags; struct blk_mq_cpu_notifier { struct list_head list; void *data; - void (*notify)(void *data, unsigned long action, unsigned int cpu); + int (*notify)(void *data, unsigned long action, unsigned int cpu); +}; + +struct blk_mq_ctxmap { + unsigned int map_size; + unsigned int bits_per_word; + struct blk_align_bitmap *map; }; struct blk_mq_hw_ctx { @@ -18,7 +24,11 @@ struct blk_mq_hw_ctx { } ____cacheline_aligned_in_smp; unsigned long state; /* BLK_MQ_S_* flags */ - struct delayed_work delayed_work; + struct delayed_work run_work; + struct delayed_work delay_work; + cpumask_var_t cpumask; + int next_cpu; + int next_cpu_batch; unsigned long flags; /* BLK_MQ_F_* flags */ @@ -27,13 +37,13 @@ struct blk_mq_hw_ctx { void *driver_data; + struct blk_mq_ctxmap ctx_map; + unsigned int nr_ctx; struct blk_mq_ctx **ctxs; - unsigned int nr_ctx_map; - unsigned long *ctx_map; - struct request **rqs; - struct list_head page_list; + unsigned int wait_index; + struct blk_mq_tags *tags; unsigned long queued; @@ -41,31 +51,40 @@ struct blk_mq_hw_ctx { #define BLK_MQ_MAX_DISPATCH_ORDER 10 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; - unsigned int queue_depth; unsigned int numa_node; unsigned int cmd_size; /* per-request extra data */ + atomic_t nr_active; + struct blk_mq_cpu_notifier cpu_notifier; struct kobject kobj; }; -struct blk_mq_reg { +struct blk_mq_tag_set { struct blk_mq_ops *ops; unsigned int nr_hw_queues; - unsigned int queue_depth; + unsigned int queue_depth; 
/* max hw supported */ unsigned int reserved_tags; unsigned int cmd_size; /* per-request extra data */ int numa_node; unsigned int timeout; unsigned int flags; /* BLK_MQ_F_* */ + void *driver_data; + + struct blk_mq_tags **tags; + + struct mutex tag_list_lock; + struct list_head tag_list; }; typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); -typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int); -typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); +typedef int (init_request_fn)(void *, struct request *, unsigned int, + unsigned int, unsigned int); +typedef void (exit_request_fn)(void *, struct request *, unsigned int, + unsigned int); struct blk_mq_ops { /* @@ -86,18 +105,20 @@ struct blk_mq_ops { softirq_done_fn *complete; /* - * Override for hctx allocations (should probably go) - */ - alloc_hctx_fn *alloc_hctx; - free_hctx_fn *free_hctx; - - /* * Called when the block layer side of a hardware queue has been * set up, allowing the driver to allocate/init matching structures. * Ditto for exit/teardown. */ init_hctx_fn *init_hctx; exit_hctx_fn *exit_hctx; + + /* + * Called for every command allocated by the block layer to allow + * the driver to set up driver specific data. + * Ditto for exit/teardown. 
+ */ + init_request_fn *init_request; + exit_request_fn *exit_request; }; enum { @@ -107,18 +128,22 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_SHOULD_SORT = 1 << 1, - BLK_MQ_F_SHOULD_IPI = 1 << 2, + BLK_MQ_F_TAG_SHARED = 1 << 2, BLK_MQ_S_STOPPED = 0, + BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_MAX_DEPTH = 2048, + + BLK_MQ_CPU_WORK_BATCH = 8, }; -struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *); +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); int blk_mq_register_disk(struct gendisk *); void blk_mq_unregister_disk(struct gendisk *); -int blk_mq_init_commands(struct request_queue *, int (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); -void blk_mq_free_commands(struct request_queue *, void (*free)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); + +int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); +void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); @@ -126,28 +151,28 @@ void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); -struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); -struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + gfp_t gfp, bool reserved); +struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); -struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int); -void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); +struct blk_mq_hw_ctx 
*blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); -bool blk_mq_end_io_partial(struct request *rq, int error, - unsigned int nr_bytes); -static inline void blk_mq_end_io(struct request *rq, int error) -{ - bool done = !blk_mq_end_io_partial(rq, error, blk_rq_bytes(rq)); - BUG_ON(!done); -} +void blk_mq_end_io(struct request *rq, int error); +void __blk_mq_end_io(struct request *rq, int error); +void blk_mq_requeue_request(struct request *rq); +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); +void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); -void blk_mq_start_stopped_hw_queues(struct request_queue *q); +void blk_mq_start_hw_queues(struct request_queue *q); +void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); +void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); +void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data); /* * Driver command data is immediately after the request. 
So subtract request @@ -162,12 +187,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) return (void *) rq + sizeof(*rq); } -static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, - unsigned int tag) -{ - return hctx->rqs[tag]; -} - #define queue_for_each_hw_ctx(q, hctx, i) \ for ((i) = 0; (i) < (q)->nr_hw_queues && \ ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index aa0eaa2d0bd8..d8e4cea23a25 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -190,6 +190,7 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_END, /* last of chain of requests */ __REQ_HASHED, /* on IO scheduler merge hash */ + __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NR_BITS, /* stops here */ }; @@ -243,5 +244,6 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_END (1ULL << __REQ_END) #define REQ_HASHED (1ULL << __REQ_HASHED) +#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0d84981ee03f..e90e1692e052 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -90,15 +90,15 @@ enum rq_cmd_type_bits { #define BLK_MAX_CDB 16 /* - * try to put the fields that are referenced together in the same cacheline. - * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() - * as well! + * Try to put the fields that are referenced together in the same cacheline. + * + * If you modify this structure, make sure to update blk_rq_init() and + * especially blk_mq_rq_ctx_init() to take care of the added fields. 
*/ struct request { struct list_head queuelist; union { struct call_single_data csd; - struct work_struct mq_flush_work; unsigned long fifo_time; }; @@ -178,7 +178,6 @@ struct request { unsigned short ioprio; void *special; /* opaque pointer available for LLD use */ - char *buffer; /* kaddr of the current segment if available */ int tag; int errors; @@ -463,6 +462,10 @@ struct request_queue { struct request *flush_rq; spinlock_t mq_flush_lock; + struct list_head requeue_list; + spinlock_t requeue_lock; + struct work_struct requeue_work; + struct mutex sysfs_lock; int bypass_depth; @@ -481,6 +484,9 @@ struct request_queue { wait_queue_head_t mq_freeze_wq; struct percpu_counter mq_usage_counter; struct list_head all_q_node; + + struct blk_mq_tag_set *tag_set; + struct list_head tag_set_list; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ @@ -613,6 +619,15 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define rq_data_dir(rq) (((rq)->cmd_flags & 1) != 0) +/* + * Driver can handle struct request, if it either has an old style + * request_fn defined, or is blk-mq based. 
+ */ +static inline bool queue_is_rq_based(struct request_queue *q) +{ + return q->request_fn || q->mq_ops; +} + static inline unsigned int blk_queue_cluster(struct request_queue *q) { return q->limits.cluster; @@ -937,6 +952,7 @@ extern struct request *blk_fetch_request(struct request_queue *q); */ extern bool blk_update_request(struct request *rq, int error, unsigned int nr_bytes); +extern void blk_finish_request(struct request *rq, int error); extern bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, int error); @@ -1102,7 +1118,8 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) /* * tag stuff */ -#define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) +#define blk_rq_tagged(rq) \ + ((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED)) extern int blk_queue_start_tag(struct request_queue *, struct request *); extern struct request *blk_queue_find_tag(struct request_queue *, int); extern void blk_queue_end_tag(struct request_queue *, struct request *); @@ -1370,8 +1387,9 @@ static inline void put_dev_sector(Sector p) } struct work_struct; -int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); -int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); +int kblockd_schedule_work(struct work_struct *work); +int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); +int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP /* diff --git a/mm/Makefile b/mm/Makefile index b484452dac57..0173940407f6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -30,7 +30,6 @@ endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o -obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o |