86 files changed, 2566 insertions, 2008 deletions
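Several hunks below replace the old per-ctx blk_rq_stat windows with a per-queue statistics callback API (blk_stat_alloc_callback(), blk_stat_add_callback(), blk_stat_activate_msecs(), blk_stat_remove_callback(), blk_stat_free_callback() in block/blk-stat.[ch]). As context, here is a minimal, hypothetical consumer sketched from the signatures and kernel-doc added in this series; it is modeled on the blk-mq poll-stat hookup further down (q->poll_cb with two READ/WRITE buckets and a 100 ms window). The names example_* and the pr_debug output are illustrative assumptions, not part of the patch; a real in-tree user could pass the exported blk_stat_rq_ddir() directly instead of the open-coded bucket function shown here.

#include <linux/blkdev.h>
#include "blk-stat.h"	/* block-layer internal header extended by this series */

/* Sort each completed request into a READ or WRITE bucket
 * (same policy as the exported blk_stat_rq_ddir()). */
static unsigned int example_bucket_fn(const struct request *rq)
{
	return rq_data_dir(rq);
}

/* Runs from cb->timer when the sampling window expires; by then the
 * per-cpu buffers have been flushed into cb->stat[]. cb->data carries
 * whatever was passed to blk_stat_alloc_callback() (here: the queue). */
static void example_timer_fn(struct blk_stat_callback *cb)
{
	if (cb->stat[READ].nr_samples)
		pr_debug("read: samples=%d mean=%lld ns\n",
			 cb->stat[READ].nr_samples, cb->stat[READ].mean);
	if (cb->stat[WRITE].nr_samples)
		pr_debug("write: samples=%d mean=%lld ns\n",
			 cb->stat[WRITE].nr_samples, cb->stat[WRITE].mean);

	/* Illustrative: re-arm for another 100 ms window. blk-mq instead
	 * re-arms lazily from blk_mq_poll_stats_start() on the next I/O. */
	blk_stat_activate_msecs(cb, 100);
}

static struct blk_stat_callback *example_cb;

static int example_attach(struct request_queue *q)
{
	/* Two buckets (READ/WRITE), exactly like q->poll_cb below. */
	example_cb = blk_stat_alloc_callback(example_timer_fn,
					     example_bucket_fn, 2, q);
	if (!example_cb)
		return -ENOMEM;

	blk_stat_add_callback(q, example_cb);	/* sets QUEUE_FLAG_STATS */
	blk_stat_activate_msecs(example_cb, 100);
	return 0;
}

static void example_detach(struct request_queue *q)
{
	blk_stat_remove_callback(q, example_cb);	/* del_timer_sync() inside */
	blk_stat_free_callback(example_cb);		/* freed via call_rcu() */
}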
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index c0a3bb5a6e4e..b7f6bdc96d73 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -192,5 +192,11 @@ scaling back writes. Writing a value of '0' to this file disables the feature. Writing a value of '-1' to this file resets the value to the default setting. +throttle_sample_time (RW) +------------------------- +This is the time window that blk-throttle samples data, in millisecond. +blk-throttle makes decision based on the samplings. Lower time means cgroups +have more smooth throughput, but higher CPU overhead. This exists only when +CONFIG_BLK_DEV_THROTTLING_LOW is enabled. Jens Axboe <[email protected]>, February 2009 diff --git a/block/Kconfig b/block/Kconfig index e9f780f815f5..89cd28f8d051 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -115,6 +115,18 @@ config BLK_DEV_THROTTLING See Documentation/cgroups/blkio-controller.txt for more information. +config BLK_DEV_THROTTLING_LOW + bool "Block throttling .low limit interface support (EXPERIMENTAL)" + depends on BLK_DEV_THROTTLING + default n + ---help--- + Add .low limit interface for block throttling. The low limit is a best + effort limit to prioritize cgroups. Depending on the setting, the limit + can be used to protect cgroups in terms of bandwidth/iops and better + utilize disk resource. + + Note, this is an experimental interface and could be changed someday. + config BLK_CMDLINE_PARSER bool "Block device command line partition parser" default n diff --git a/block/bio.c b/block/bio.c index e75878f8b14a..f4d207180266 100644 --- a/block/bio.c +++ b/block/bio.c @@ -30,6 +30,7 @@ #include <linux/cgroup.h> #include <trace/events/block.h> +#include "blk.h" /* * Test patch to inline a certain number of bi_io_vec's inside the bio @@ -427,7 +428,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * RETURNS: * Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) +struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, + struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; unsigned front_pad; @@ -1824,6 +1826,11 @@ static inline bool bio_remaining_done(struct bio *bio) * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred * way to end I/O on a bio. No one should call bi_end_io() directly on a * bio unless they own it and thus know that it has an end_io function. + * + * bio_endio() can be called several times on a bio that has been chained + * using bio_chain(). The ->bi_end_io() function will only be called the + * last time. At this point the BLK_TA_COMPLETE tracing event will be + * generated if BIO_TRACE_COMPLETION is set. 
**/ void bio_endio(struct bio *bio) { @@ -1844,6 +1851,13 @@ again: goto again; } + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), + bio, bio->bi_error); + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + } + + blk_throtl_bio_endio(bio); if (bio->bi_end_io) bio->bi_end_io(bio); } @@ -1882,6 +1896,9 @@ struct bio *bio_split(struct bio *bio, int sectors, bio_advance(bio, split->bi_iter.bi_size); + if (bio_flagged(bio, BIO_TRACE_COMPLETION)) + bio_set_flag(bio, BIO_TRACE_COMPLETION); + return split; } EXPORT_SYMBOL(bio_split); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bbe7ee00bd3d..7c2947128f58 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, } EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); +/* Performs queue bypass and policy enabled checks then looks up blkg. */ +static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, + const struct blkcg_policy *pol, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); + + if (!blkcg_policy_enabled(q, pol)) + return ERR_PTR(-EOPNOTSUPP); + + /* + * This could be the first entry point of blkcg implementation and + * we shouldn't allow anything to go through for a bypassing queue. + */ + if (unlikely(blk_queue_bypass(q))) + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); + + return __blkg_lookup(blkcg, q, true /* update_hint */); +} + /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup @@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; + struct request_queue *q; struct blkcg_gq *blkg; struct module *owner; unsigned int major, minor; @@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (!disk) return -ENODEV; if (part) { - owner = disk->fops->owner; - put_disk(disk); - module_put(owner); - return -ENODEV; + ret = -ENODEV; + goto fail; } - rcu_read_lock(); - spin_lock_irq(disk->queue->queue_lock); + q = disk->queue; - if (blkcg_policy_enabled(disk->queue, pol)) - blkg = blkg_lookup_create(blkcg, disk->queue); - else - blkg = ERR_PTR(-EOPNOTSUPP); + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_check(blkcg, pol, q); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); + goto fail_unlock; + } + + if (blkg) + goto success; + + /* + * Create blkgs walking down from blkcg_root to @blkcg, so that all + * non-root blkgs have access to their parents. + */ + while (true) { + struct blkcg *pos = blkcg; + struct blkcg *parent; + struct blkcg_gq *new_blkg; + + parent = blkcg_parent(blkcg); + while (parent && !__blkg_lookup(parent, q, false)) { + pos = parent; + parent = blkcg_parent(parent); + } + + /* Drop locks to do new blkg allocation with GFP_KERNEL. */ + spin_unlock_irq(q->queue_lock); rcu_read_unlock(); - spin_unlock_irq(disk->queue->queue_lock); - owner = disk->fops->owner; - put_disk(disk); - module_put(owner); - /* - * If queue was bypassing, we should retry. Do so after a - * short msleep(). It isn't strictly necessary but queue - * can be bypassing for some time and it's always nice to - * avoid busy looping. 
- */ - if (ret == -EBUSY) { - msleep(10); - ret = restart_syscall(); + + new_blkg = blkg_alloc(pos, q, GFP_KERNEL); + if (unlikely(!new_blkg)) { + ret = -ENOMEM; + goto fail; } - return ret; - } + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + + blkg = blkg_lookup_check(pos, pol, q); + if (IS_ERR(blkg)) { + ret = PTR_ERR(blkg); + goto fail_unlock; + } + + if (blkg) { + blkg_free(new_blkg); + } else { + blkg = blkg_create(pos, q, new_blkg); + if (unlikely(IS_ERR(blkg))) { + ret = PTR_ERR(blkg); + goto fail_unlock; + } + } + + if (pos == blkcg) + goto success; + } +success: ctx->disk = disk; ctx->blkg = blkg; ctx->body = body; return 0; + +fail_unlock: + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); +fail: + owner = disk->fops->owner; + put_disk(disk); + module_put(owner); + /* + * If queue was bypassing, we should retry. Do so after a + * short msleep(). It isn't strictly necessary but queue + * can be bypassing for some time and it's always nice to + * avoid busy looping. + */ + if (ret == -EBUSY) { + msleep(10); + ret = restart_syscall(); + } + return ret; } EXPORT_SYMBOL_GPL(blkg_conf_prep); diff --git a/block/blk-core.c b/block/blk-core.c index d772c221cc17..8654aa0cef6d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -500,6 +500,13 @@ void blk_set_queue_dying(struct request_queue *q) queue_flag_set(QUEUE_FLAG_DYING, q); spin_unlock_irq(q->queue_lock); + /* + * When queue DYING flag is set, we need to block new req + * entering queue, so we call blk_freeze_queue_start() to + * prevent I/O from crossing blk_queue_enter(). + */ + blk_freeze_queue_start(q); + if (q->mq_ops) blk_mq_wake_waiters(q); else { @@ -669,6 +676,15 @@ int blk_queue_enter(struct request_queue *q, bool nowait) if (nowait) return -EBUSY; + /* + * read pair of barrier in blk_freeze_queue_start(), + * we need to order reading __PERCPU_REF_DEAD flag of + * .q_usage_counter and reading .mq_freeze_depth or + * queue dying flag, otherwise the following wait may + * never return if the two reads are reordered. 
+ */ + smp_rmb(); + ret = wait_event_interruptible(q->mq_freeze_wq, !atomic_read(&q->mq_freeze_depth) || blk_queue_dying(q)); @@ -720,6 +736,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q->backing_dev_info) goto fail_split; + q->stats = blk_alloc_queue_stats(); + if (!q->stats) + goto fail_stats; + q->backing_dev_info->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; @@ -776,6 +796,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) fail_ref: percpu_ref_exit(&q->q_usage_counter); fail_bdi: + blk_free_queue_stats(q->stats); +fail_stats: bdi_put(q->backing_dev_info); fail_split: bioset_free(q->bio_split); @@ -889,7 +911,6 @@ out_exit_flush_rq: q->exit_rq_fn(q, q->fq->flush_rq); out_free_flush_queue: blk_free_flush_queue(q->fq); - wbt_exit(q); return -ENOMEM; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1128,7 +1149,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op, blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); - blk_rq_set_prio(rq, ioc); rq->cmd_flags = op; rq->rq_flags = rq_flags; @@ -1615,6 +1635,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->errors = 0; req->__sector = bio->bi_iter.bi_sector; + blk_rq_set_prio(req, rq_ioc(bio)); if (ioprio_valid(bio_prio(bio))) req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); @@ -1936,7 +1957,13 @@ generic_make_request_checks(struct bio *bio) if (!blkcg_bio_issue_check(q, bio)) return false; - trace_block_bio_queue(q, bio); + if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_queue(q, bio); + /* Now that enqueuing has been traced, we need to trace + * completion as well. + */ + bio_set_flag(bio, BIO_TRACE_COMPLETION); + } return true; not_supported: @@ -2478,7 +2505,7 @@ void blk_start_request(struct request *req) blk_dequeue_request(req); if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { - blk_stat_set_issue_time(&req->issue_stat); + blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req)); req->rq_flags |= RQF_STATS; wbt_issue(req->q->rq_wb, &req->issue_stat); } @@ -2601,6 +2628,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (bio_bytes == bio->bi_iter.bi_size) req->bio = bio->bi_next; + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); req_bio_endio(req, bio, bio_bytes, error); total_bytes += bio_bytes; @@ -2699,7 +2728,7 @@ void blk_finish_request(struct request *req, int error) struct request_queue *q = req->q; if (req->rq_flags & RQF_STATS) - blk_stat_add(&q->rq_stats[rq_data_dir(req)], req); + blk_stat_add(req); if (req->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, req); diff --git a/block/blk-flush.c b/block/blk-flush.c index 0d5a9c1da1fc..4e951d3bf548 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -497,8 +497,7 @@ void blk_insert_flush(struct request *rq) * Description: * Issue a flush for the block device in question. Caller can supply * room for storing the error offset in case of a flush error, if they - * wish to. If WAIT flag is not passed then caller may check only what - * request was pushed in some internal queue for later handling. + * wish to. 
*/ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 9f0ff5ba4f84..b3622cb00fc2 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter *iter) return 0; } -static struct blk_integrity_profile nop_profile = { +static const struct blk_integrity_profile nop_profile = { .name = "nop", .generate_fn = blk_integrity_nop_fn, .verify_fn = blk_integrity_nop_fn, diff --git a/block/blk-lib.c b/block/blk-lib.c index ed1e78e24db0..e5b853f2b8a2 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -109,7 +109,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard); * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_IFL_* flags to control behaviour + * @flags: BLKDEV_DISCARD_* flags to control behaviour * * Description: * Issue a discard request for the sectors in question. diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f6d917977b33..4b3f962a9c7a 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -43,6 +43,42 @@ static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file, return ret; } +static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) +{ + if (stat->nr_samples) { + seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", + stat->nr_samples, stat->mean, stat->min, stat->max); + } else { + seq_puts(m, "samples=0"); + } +} + +static int queue_poll_stat_show(struct seq_file *m, void *v) +{ + struct request_queue *q = m->private; + + seq_puts(m, "read: "); + print_stat(m, &q->poll_stat[READ]); + seq_puts(m, "\n"); + + seq_puts(m, "write: "); + print_stat(m, &q->poll_stat[WRITE]); + seq_puts(m, "\n"); + return 0; +} + +static int queue_poll_stat_open(struct inode *inode, struct file *file) +{ + return single_open(file, queue_poll_stat_show, inode->i_private); +} + +static const struct file_operations queue_poll_stat_fops = { + .open = queue_poll_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int hctx_state_show(struct seq_file *m, void *v) { struct blk_mq_hw_ctx *hctx = m->private; @@ -322,60 +358,6 @@ static const struct file_operations hctx_io_poll_fops = { .release = single_release, }; -static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) -{ - seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", - stat->nr_samples, stat->mean, stat->min, stat->max); -} - -static int hctx_stats_show(struct seq_file *m, void *v) -{ - struct blk_mq_hw_ctx *hctx = m->private; - struct blk_rq_stat stat[2]; - - blk_stat_init(&stat[BLK_STAT_READ]); - blk_stat_init(&stat[BLK_STAT_WRITE]); - - blk_hctx_stat_get(hctx, stat); - - seq_puts(m, "read: "); - print_stat(m, &stat[BLK_STAT_READ]); - seq_puts(m, "\n"); - - seq_puts(m, "write: "); - print_stat(m, &stat[BLK_STAT_WRITE]); - seq_puts(m, "\n"); - return 0; -} - -static int hctx_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, hctx_stats_show, inode->i_private); -} - -static ssize_t hctx_stats_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct seq_file *m = file->private_data; - struct blk_mq_hw_ctx *hctx = m->private; - struct blk_mq_ctx *ctx; - int i; - - hctx_for_each_ctx(hctx, ctx, i) { - blk_stat_init(&ctx->stat[BLK_STAT_READ]); - blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); - } - return count; -} - -static 
const struct file_operations hctx_stats_fops = { - .open = hctx_stats_open, - .read = seq_read, - .write = hctx_stats_write, - .llseek = seq_lseek, - .release = single_release, -}; - static int hctx_dispatched_show(struct seq_file *m, void *v) { struct blk_mq_hw_ctx *hctx = m->private; @@ -636,6 +618,11 @@ static const struct file_operations ctx_completed_fops = { .release = single_release, }; +static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { + {"poll_stat", 0400, &queue_poll_stat_fops}, + {}, +}; + static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, &hctx_state_fops}, {"flags", 0400, &hctx_flags_fops}, @@ -646,7 +633,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"sched_tags", 0400, &hctx_sched_tags_fops}, {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops}, {"io_poll", 0600, &hctx_io_poll_fops}, - {"stats", 0600, &hctx_stats_fops}, {"dispatched", 0600, &hctx_dispatched_fops}, {"queued", 0600, &hctx_queued_fops}, {"run", 0600, &hctx_run_fops}, @@ -753,6 +739,9 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q) if (!q->mq_debugfs_dir) goto err; + if (!debugfs_create_files(q->mq_debugfs_dir, q, blk_mq_debugfs_queue_attrs)) + goto err; + queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_debugfs_register_hctx(q, hctx)) goto err; diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index 966c2169762e..0c3354cf3552 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -23,7 +23,7 @@ * @pdev: PCI device associated with @set. * * This function assumes the PCI device @pdev has at least as many available - * interrupt vetors as @set has queues. It will then queuery the vector + * interrupt vectors as @set has queues. It will then query the vector * corresponding to each queue for it's affinity mask and built queue mapping * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. diff --git a/block/blk-mq.c b/block/blk-mq.c index 572966f49596..724bcec0ca4f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -39,6 +39,9 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); +static void blk_mq_poll_stats_start(struct request_queue *q); +static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); + /* * Check if any of the ctx's have pending work in this hardware queue */ @@ -65,7 +68,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } -void blk_mq_freeze_queue_start(struct request_queue *q) +void blk_freeze_queue_start(struct request_queue *q) { int freeze_depth; @@ -75,7 +78,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q) blk_mq_run_hw_queues(q, false); } } -EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); +EXPORT_SYMBOL_GPL(blk_freeze_queue_start); void blk_mq_freeze_queue_wait(struct request_queue *q) { @@ -105,7 +108,7 @@ void blk_freeze_queue(struct request_queue *q) * no blk_unfreeze_queue(), and blk_freeze_queue() is not * exported to drivers as the only user for unfreeze is blk_mq. */ - blk_mq_freeze_queue_start(q); + blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } @@ -431,15 +434,8 @@ static void blk_mq_ipi_complete_request(struct request *rq) static void blk_mq_stat_add(struct request *rq) { if (rq->rq_flags & RQF_STATS) { - /* - * We could rq->mq_ctx here, but there's less of a risk - * of races if we have the completion event add the stats - * to the local software queue. 
- */ - struct blk_mq_ctx *ctx; - - ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id()); - blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq); + blk_mq_poll_stats_start(rq->q); + blk_stat_add(rq); } } @@ -491,7 +487,7 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(q, rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - blk_stat_set_issue_time(&rq->issue_stat); + blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq)); rq->rq_flags |= RQF_STATS; wbt_issue(q->rq_wb, &rq->issue_stat); } @@ -526,6 +522,15 @@ void blk_mq_start_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_start_request); +/* + * When we reach here because queue is busy, REQ_ATOM_COMPLETE + * flag isn't set yet, so there may be race with timeout handler, + * but given rq->deadline is just set in .queue_rq() under + * this situation, the race won't be possible in reality because + * rq->timeout should be set as big enough to cover the window + * between blk_mq_start_request() called from .queue_rq() and + * clearing REQ_ATOM_STARTED here. + */ static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; @@ -666,7 +671,7 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved) * just be ignored. This can happen due to the bitflag ordering. * Timeout first checks if STARTED is set, and if it is, assumes * the request is active. But if we race with completion, then - * we both flags will get cleared. So check here again, and ignore + * both flags will get cleared. So check here again, and ignore * a timeout event with a request that isn't active. */ if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) @@ -699,6 +704,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) return; + /* + * The rq being checked may have been freed and reallocated + * out already here, we avoid this race by checking rq->deadline + * and REQ_ATOM_COMPLETE flag together: + * + * - if rq->deadline is observed as new value because of + * reusing, the rq won't be timed out because of timing. + * - if rq->deadline is observed as previous value, + * REQ_ATOM_COMPLETE flag won't be cleared in reuse path + * because we put a barrier between setting rq->deadline + * and clearing the flag in blk_mq_start_request(), so + * this rq won't be timed out too. + */ if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) blk_mq_rq_timed_out(rq, reserved); @@ -727,7 +745,7 @@ static void blk_mq_timeout_work(struct work_struct *work) * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in - * blk_mq_freeze_queue_start, and the moment the last request is + * blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero. */ @@ -964,20 +982,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) { struct blk_mq_hw_ctx *hctx; struct request *rq; - LIST_HEAD(driver_list); - struct list_head *dptr; int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK; if (list_empty(list)) return false; /* - * Start off with dptr being NULL, so we start the first request - * immediately, even if we have more pending. - */ - dptr = NULL; - - /* * Now process all the entries, sending them to the driver. 
*/ errors = queued = 0; @@ -1009,7 +1019,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) list_del_init(&rq->queuelist); bd.rq = rq; - bd.list = dptr; /* * Flag last if we have no more requests, or if we have more @@ -1045,13 +1054,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) if (ret == BLK_MQ_RQ_QUEUE_BUSY) break; - - /* - * We've done the first request. If we have more than 1 - * left in the list, set dptr to defer issue. - */ - if (!dptr && list->next != list->prev) - dptr = &driver_list; } while (!list_empty(list)); hctx->dispatched[queued_to_index(queued)]++; @@ -1104,6 +1106,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { + might_sleep(); + srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); @@ -1453,13 +1457,12 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, +static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, bool may_sleep) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, - .list = NULL, .last = 1 }; struct blk_mq_hw_ctx *hctx; @@ -1485,8 +1488,6 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, return; } - __blk_mq_requeue_request(rq); - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { *cookie = BLK_QC_T_NONE; rq->errors = -EIO; @@ -1494,22 +1495,36 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, return; } + __blk_mq_requeue_request(rq); insert: blk_mq_sched_insert_request(rq, false, true, false, may_sleep); } -/* - * Multiple hardware queue variant. This will not use per-process plugs, - * but will attempt to bypass the hctx queueing if we can go straight to - * hardware for SYNC IO. - */ +static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_qc_t *cookie) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + rcu_read_lock(); + __blk_mq_try_issue_directly(rq, cookie, false); + rcu_read_unlock(); + } else { + unsigned int srcu_idx; + + might_sleep(); + + srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); + __blk_mq_try_issue_directly(rq, cookie, true); + srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); + } +} + static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; - unsigned int request_count = 0, srcu_idx; + unsigned int request_count = 0; struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; @@ -1545,145 +1560,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) cookie = request_to_qc_t(data.hctx, rq); - if (unlikely(is_flush_fua)) { - if (q->elevator) - goto elv_insert; - blk_mq_bio_to_request(rq, bio); - blk_insert_flush(rq); - goto run_queue; - } - plug = current->plug; - /* - * If the driver supports defer issued based on 'last', then - * queue it up like normal since we can potentially save some - * CPU this way. 
- */ - if (((plug && !blk_queue_nomerges(q)) || is_sync) && - !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { - struct request *old_rq = NULL; - + if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); - - /* - * We do limited plugging. If the bio can be merged, do that. - * Otherwise the existing request in the plug list will be - * issued. So the plug list will have one request at most - */ - if (plug) { - /* - * The plug list might get flushed before this. If that - * happens, same_queue_rq is invalid and plug list is - * empty - */ - if (same_queue_rq && !list_empty(&plug->mq_list)) { - old_rq = same_queue_rq; - list_del_init(&old_rq->queuelist); - } - list_add_tail(&rq->queuelist, &plug->mq_list); - } else /* is_sync */ - old_rq = rq; - blk_mq_put_ctx(data.ctx); - if (!old_rq) - goto done; - - if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) { - rcu_read_lock(); - blk_mq_try_issue_directly(old_rq, &cookie, false); - rcu_read_unlock(); + if (q->elevator) { + blk_mq_sched_insert_request(rq, false, true, true, + true); } else { - srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu); - blk_mq_try_issue_directly(old_rq, &cookie, true); - srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx); + blk_insert_flush(rq); + blk_mq_run_hw_queue(data.hctx, true); } - goto done; - } - - if (q->elevator) { -elv_insert: - blk_mq_put_ctx(data.ctx); - blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, - !is_sync || is_flush_fua, true); - goto done; - } - if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { - /* - * For a SYNC request, send it to the hardware immediately. For - * an ASYNC request, just ensure that we run it later on. The - * latter allows for merging opportunities and more efficient - * dispatching. - */ -run_queue: - blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); - } - blk_mq_put_ctx(data.ctx); -done: - return cookie; -} - -/* - * Single hardware queue variant. This will attempt to use any per-process - * plug for merging and IO deferral. - */ -static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) -{ - const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_plug *plug; - unsigned int request_count = 0; - struct blk_mq_alloc_data data = { .flags = 0 }; - struct request *rq; - blk_qc_t cookie; - unsigned int wb_acct; - - blk_queue_bounce(q, &bio); - - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_io_error(bio); - return BLK_QC_T_NONE; - } - - blk_queue_split(q, &bio, q->bio_split); - - if (!is_flush_fua && !blk_queue_nomerges(q)) { - if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return BLK_QC_T_NONE; - } else - request_count = blk_plug_queued_count(q); - - if (blk_mq_sched_bio_merge(q, bio)) - return BLK_QC_T_NONE; - - wb_acct = wbt_wait(q->rq_wb, bio, NULL); - - trace_block_getrq(q, bio, bio->bi_opf); - - rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); - if (unlikely(!rq)) { - __wbt_done(q->rq_wb, wb_acct); - return BLK_QC_T_NONE; - } - - wbt_track(&rq->issue_stat, wb_acct); - - cookie = request_to_qc_t(data.hctx, rq); - - if (unlikely(is_flush_fua)) { - if (q->elevator) - goto elv_insert; - blk_mq_bio_to_request(rq, bio); - blk_insert_flush(rq); - goto run_queue; - } - - /* - * A task plug currently exists. Since this is completely lockless, - * utilize that to temporarily store requests until the task is - * either done or scheduled away. 
- */ - plug = current->plug; - if (plug) { + } else if (plug && q->nr_hw_queues == 1) { struct request *last = NULL; blk_mq_bio_to_request(rq, bio); @@ -1694,13 +1581,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) */ if (list_empty(&plug->mq_list)) request_count = 0; + else if (blk_queue_nomerges(q)) + request_count = blk_plug_queued_count(q); + if (!request_count) trace_block_plug(q); else last = list_entry_rq(plug->mq_list.prev); - blk_mq_put_ctx(data.ctx); - if (request_count >= BLK_MAX_REQUEST_COUNT || (last && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_flush_plug_list(plug, false); @@ -1708,30 +1596,41 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) } list_add_tail(&rq->queuelist, &plug->mq_list); - return cookie; - } - - if (q->elevator) { -elv_insert: - blk_mq_put_ctx(data.ctx); + } else if (plug && !blk_queue_nomerges(q)) { blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, - !is_sync || is_flush_fua, true); - goto done; - } - if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { + /* - * For a SYNC request, send it to the hardware immediately. For - * an ASYNC request, just ensure that we run it later on. The - * latter allows for merging opportunities and more efficient - * dispatching. + * We do limited plugging. If the bio can be merged, do that. + * Otherwise the existing request in the plug list will be + * issued. So the plug list will have one request at most + * The plug list might get flushed before this. If that happens, + * the plug list is empty, and same_queue_rq is invalid. */ -run_queue: - blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); - } + if (list_empty(&plug->mq_list)) + same_queue_rq = NULL; + if (same_queue_rq) + list_del_init(&same_queue_rq->queuelist); + list_add_tail(&rq->queuelist, &plug->mq_list); + + blk_mq_put_ctx(data.ctx); + + if (same_queue_rq) + blk_mq_try_issue_directly(data.hctx, same_queue_rq, + &cookie); + + return cookie; + } else if (q->nr_hw_queues > 1 && is_sync) { + blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_try_issue_directly(data.hctx, rq, &cookie); + return cookie; + } else if (q->elevator) { + blk_mq_bio_to_request(rq, bio); + blk_mq_sched_insert_request(rq, false, true, true, true); + } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) + blk_mq_run_hw_queue(data.hctx, true); blk_mq_put_ctx(data.ctx); -done: return cookie; } @@ -2067,8 +1966,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, spin_lock_init(&__ctx->lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; - blk_stat_init(&__ctx->stat[BLK_STAT_READ]); - blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) @@ -2364,6 +2261,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops; + q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, + blk_stat_rq_ddir, 2, q); + if (!q->poll_cb) + goto err_exit; + q->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!q->queue_ctx) goto err_exit; @@ -2398,10 +2300,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - if (q->nr_hw_queues > 1) - blk_queue_make_request(q, blk_mq_make_request); - else - blk_queue_make_request(q, blk_sq_make_request); + blk_queue_make_request(q, blk_mq_make_request); /* * Do this after 
blk_queue_make_request() overrides it... @@ -2456,8 +2355,6 @@ void blk_mq_free_queue(struct request_queue *q) list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); - wbt_exit(q); - blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); @@ -2502,7 +2399,7 @@ static void blk_mq_queue_reinit_work(void) * take place in parallel. */ list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_freeze_queue_start(q); + blk_freeze_queue_start(q); list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_freeze_queue_wait(q); @@ -2755,16 +2652,6 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); - - /* - * Manually set the make_request_fn as blk_queue_make_request - * resets a lot of the queue settings. - */ - if (q->nr_hw_queues > 1) - q->make_request_fn = blk_mq_make_request; - else - q->make_request_fn = blk_sq_make_request; - blk_mq_queue_reinit(q, cpu_online_mask); } @@ -2773,28 +2660,53 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); +/* Enable polling stats and return whether they were already enabled. */ +static bool blk_poll_stats_enable(struct request_queue *q) +{ + if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || + test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + return true; + blk_stat_add_callback(q, q->poll_cb); + return false; +} + +static void blk_mq_poll_stats_start(struct request_queue *q) +{ + /* + * We don't arm the callback if polling stats are not enabled or the + * callback is already active. + */ + if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || + blk_stat_is_active(q->poll_cb)) + return; + + blk_stat_activate_msecs(q->poll_cb, 100); +} + +static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) +{ + struct request_queue *q = cb->data; + + if (cb->stat[READ].nr_samples) + q->poll_stat[READ] = cb->stat[READ]; + if (cb->stat[WRITE].nr_samples) + q->poll_stat[WRITE] = cb->stat[WRITE]; +} + static unsigned long blk_mq_poll_nsecs(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_rq_stat stat[2]; unsigned long ret = 0; /* * If stats collection isn't on, don't sleep but turn it on for * future users */ - if (!blk_stat_enable(q)) + if (!blk_poll_stats_enable(q)) return 0; /* - * We don't have to do this once per IO, should optimize this - * to just use the current window of stats until it changes - */ - memset(&stat, 0, sizeof(stat)); - blk_hctx_stat_get(hctx, stat); - - /* * As an optimistic guess, use half of the mean service time * for this type of request. We can (and should) make this smarter. * For instance, if the completion latencies are tight, we can @@ -2802,10 +2714,10 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q, * important on devices where the completion latencies are longer * than ~10 usec. 
*/ - if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples) - ret = (stat[BLK_STAT_READ].mean + 1) / 2; - else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples) - ret = (stat[BLK_STAT_WRITE].mean + 1) / 2; + if (req_op(rq) == REQ_OP_READ && q->poll_stat[READ].nr_samples) + ret = (q->poll_stat[READ].mean + 1) / 2; + else if (req_op(rq) == REQ_OP_WRITE && q->poll_stat[WRITE].nr_samples) + ret = (q->poll_stat[WRITE].mean + 1) / 2; return ret; } diff --git a/block/blk-mq.h b/block/blk-mq.h index 660a17e1d033..7e6f2e467696 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -20,7 +20,6 @@ struct blk_mq_ctx { /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; - struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; diff --git a/block/blk-stat.c b/block/blk-stat.c index 186fcb981e9b..e77ec52f5bb5 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -4,10 +4,33 @@ * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> +#include <linux/rculist.h> #include <linux/blk-mq.h> #include "blk-stat.h" #include "blk-mq.h" +#include "blk.h" + +#define BLK_RQ_STAT_BATCH 64 + +struct blk_queue_stats { + struct list_head callbacks; + spinlock_t lock; + bool enable_accounting; +}; + +unsigned int blk_stat_rq_ddir(const struct request *rq) +{ + return rq_data_dir(rq); +} +EXPORT_SYMBOL_GPL(blk_stat_rq_ddir); + +static void blk_stat_init(struct blk_rq_stat *stat) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->batch = stat->nr_batch = 0; +} static void blk_stat_flush_batch(struct blk_rq_stat *stat) { @@ -48,209 +71,183 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) dst->nr_samples += src->nr_samples; } -static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - uint64_t latest = 0; - int i, j, nr; - - blk_stat_init(&dst[BLK_STAT_READ]); - blk_stat_init(&dst[BLK_STAT_WRITE]); - - nr = 0; - do { - uint64_t newest = 0; - - queue_for_each_hw_ctx(q, hctx, i) { - hctx_for_each_ctx(hctx, ctx, j) { - blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); - blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); - - if (!ctx->stat[BLK_STAT_READ].nr_samples && - !ctx->stat[BLK_STAT_WRITE].nr_samples) - continue; - if (ctx->stat[BLK_STAT_READ].time > newest) - newest = ctx->stat[BLK_STAT_READ].time; - if (ctx->stat[BLK_STAT_WRITE].time > newest) - newest = ctx->stat[BLK_STAT_WRITE].time; - } - } + stat->min = min(stat->min, value); + stat->max = max(stat->max, value); - /* - * No samples - */ - if (!newest) - break; - - if (newest > latest) - latest = newest; - - queue_for_each_hw_ctx(q, hctx, i) { - hctx_for_each_ctx(hctx, ctx, j) { - if (ctx->stat[BLK_STAT_READ].time == newest) { - blk_stat_sum(&dst[BLK_STAT_READ], - &ctx->stat[BLK_STAT_READ]); - nr++; - } - if (ctx->stat[BLK_STAT_WRITE].time == newest) { - blk_stat_sum(&dst[BLK_STAT_WRITE], - &ctx->stat[BLK_STAT_WRITE]); - nr++; - } - } - } - /* - * If we race on finding an entry, just loop back again. - * Should be very rare. 
- */ - } while (!nr); + if (stat->batch + value < stat->batch || + stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) + blk_stat_flush_batch(stat); - dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest; + stat->batch += value; + stat->nr_batch++; } -void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +void blk_stat_add(struct request *rq) { - if (q->mq_ops) - blk_mq_stat_get(q, dst); - else { - blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]); - blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]); - memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ], - sizeof(struct blk_rq_stat)); - memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE], - sizeof(struct blk_rq_stat)); + struct request_queue *q = rq->q; + struct blk_stat_callback *cb; + struct blk_rq_stat *stat; + int bucket; + s64 now, value; + + now = __blk_stat_time(ktime_to_ns(ktime_get())); + if (now < blk_stat_time(&rq->issue_stat)) + return; + + value = now - blk_stat_time(&rq->issue_stat); + + blk_throtl_stat_add(rq, value); + + rcu_read_lock(); + list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { + if (blk_stat_is_active(cb)) { + bucket = cb->bucket_fn(rq); + stat = &this_cpu_ptr(cb->cpu_stat)[bucket]; + __blk_stat_add(stat, value); + } } + rcu_read_unlock(); } -void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +static void blk_stat_timer_fn(unsigned long data) { - struct blk_mq_ctx *ctx; - unsigned int i, nr; - - nr = 0; - do { - uint64_t newest = 0; + struct blk_stat_callback *cb = (void *)data; + unsigned int bucket; + int cpu; - hctx_for_each_ctx(hctx, ctx, i) { - blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); - blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); + for (bucket = 0; bucket < cb->buckets; bucket++) + blk_stat_init(&cb->stat[bucket]); - if (!ctx->stat[BLK_STAT_READ].nr_samples && - !ctx->stat[BLK_STAT_WRITE].nr_samples) - continue; + for_each_online_cpu(cpu) { + struct blk_rq_stat *cpu_stat; - if (ctx->stat[BLK_STAT_READ].time > newest) - newest = ctx->stat[BLK_STAT_READ].time; - if (ctx->stat[BLK_STAT_WRITE].time > newest) - newest = ctx->stat[BLK_STAT_WRITE].time; + cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); + for (bucket = 0; bucket < cb->buckets; bucket++) { + blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); + blk_stat_init(&cpu_stat[bucket]); } + } - if (!newest) - break; - - hctx_for_each_ctx(hctx, ctx, i) { - if (ctx->stat[BLK_STAT_READ].time == newest) { - blk_stat_sum(&dst[BLK_STAT_READ], - &ctx->stat[BLK_STAT_READ]); - nr++; - } - if (ctx->stat[BLK_STAT_WRITE].time == newest) { - blk_stat_sum(&dst[BLK_STAT_WRITE], - &ctx->stat[BLK_STAT_WRITE]); - nr++; - } - } - /* - * If we race on finding an entry, just loop back again. 
- * Should be very rare, as the window is only updated - * occasionally - */ - } while (!nr); + cb->timer_fn(cb); } -static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +struct blk_stat_callback * +blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), + unsigned int (*bucket_fn)(const struct request *), + unsigned int buckets, void *data) { - stat->min = -1ULL; - stat->max = stat->nr_samples = stat->mean = 0; - stat->batch = stat->nr_batch = 0; - stat->time = time_now & BLK_STAT_NSEC_MASK; -} + struct blk_stat_callback *cb; -void blk_stat_init(struct blk_rq_stat *stat) -{ - __blk_stat_init(stat, ktime_to_ns(ktime_get())); -} + cb = kmalloc(sizeof(*cb), GFP_KERNEL); + if (!cb) + return NULL; -static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now) -{ - return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK); + cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat), + GFP_KERNEL); + if (!cb->stat) { + kfree(cb); + return NULL; + } + cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat), + __alignof__(struct blk_rq_stat)); + if (!cb->cpu_stat) { + kfree(cb->stat); + kfree(cb); + return NULL; + } + + cb->timer_fn = timer_fn; + cb->bucket_fn = bucket_fn; + cb->data = data; + cb->buckets = buckets; + setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb); + + return cb; } +EXPORT_SYMBOL_GPL(blk_stat_alloc_callback); -bool blk_stat_is_current(struct blk_rq_stat *stat) +void blk_stat_add_callback(struct request_queue *q, + struct blk_stat_callback *cb) { - return __blk_stat_is_current(stat, ktime_to_ns(ktime_get())); + unsigned int bucket; + int cpu; + + for_each_possible_cpu(cpu) { + struct blk_rq_stat *cpu_stat; + + cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); + for (bucket = 0; bucket < cb->buckets; bucket++) + blk_stat_init(&cpu_stat[bucket]); + } + + spin_lock(&q->stats->lock); + list_add_tail_rcu(&cb->list, &q->stats->callbacks); + set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + spin_unlock(&q->stats->lock); } +EXPORT_SYMBOL_GPL(blk_stat_add_callback); -void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +void blk_stat_remove_callback(struct request_queue *q, + struct blk_stat_callback *cb) { - s64 now, value; + spin_lock(&q->stats->lock); + list_del_rcu(&cb->list); + if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) + clear_bit(QUEUE_FLAG_STATS, &q->queue_flags); + spin_unlock(&q->stats->lock); - now = __blk_stat_time(ktime_to_ns(ktime_get())); - if (now < blk_stat_time(&rq->issue_stat)) - return; - - if (!__blk_stat_is_current(stat, now)) - __blk_stat_init(stat, now); + del_timer_sync(&cb->timer); +} +EXPORT_SYMBOL_GPL(blk_stat_remove_callback); - value = now - blk_stat_time(&rq->issue_stat); - if (value > stat->max) - stat->max = value; - if (value < stat->min) - stat->min = value; +static void blk_stat_free_callback_rcu(struct rcu_head *head) +{ + struct blk_stat_callback *cb; - if (stat->batch + value < stat->batch || - stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) - blk_stat_flush_batch(stat); + cb = container_of(head, struct blk_stat_callback, rcu); + free_percpu(cb->cpu_stat); + kfree(cb->stat); + kfree(cb); +} - stat->batch += value; - stat->nr_batch++; +void blk_stat_free_callback(struct blk_stat_callback *cb) +{ + if (cb) + call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } +EXPORT_SYMBOL_GPL(blk_stat_free_callback); -void blk_stat_clear(struct request_queue *q) +void blk_stat_enable_accounting(struct request_queue *q) { - if (q->mq_ops) { - struct blk_mq_hw_ctx *hctx; - 
struct blk_mq_ctx *ctx; - int i, j; - - queue_for_each_hw_ctx(q, hctx, i) { - hctx_for_each_ctx(hctx, ctx, j) { - blk_stat_init(&ctx->stat[BLK_STAT_READ]); - blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); - } - } - } else { - blk_stat_init(&q->rq_stats[BLK_STAT_READ]); - blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]); - } + spin_lock(&q->stats->lock); + q->stats->enable_accounting = true; + set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + spin_unlock(&q->stats->lock); } -void blk_stat_set_issue_time(struct blk_issue_stat *stat) +struct blk_queue_stats *blk_alloc_queue_stats(void) { - stat->time = (stat->time & BLK_STAT_MASK) | - (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK); + struct blk_queue_stats *stats; + + stats = kmalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) + return NULL; + + INIT_LIST_HEAD(&stats->callbacks); + spin_lock_init(&stats->lock); + stats->enable_accounting = false; + + return stats; } -/* - * Enable stat tracking, return whether it was enabled - */ -bool blk_stat_enable(struct request_queue *q) +void blk_free_queue_stats(struct blk_queue_stats *stats) { - if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - set_bit(QUEUE_FLAG_STATS, &q->queue_flags); - return false; - } + if (!stats) + return; + + WARN_ON(!list_empty(&stats->callbacks)); - return true; + kfree(stats); } diff --git a/block/blk-stat.h b/block/blk-stat.h index a2050a0a5314..53f08a63bf15 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -1,33 +1,84 @@ #ifndef BLK_STAT_H #define BLK_STAT_H -/* - * ~0.13s window as a power-of-2 (2^27 nsecs) - */ -#define BLK_STAT_NSEC 134217728ULL -#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1) +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/ktime.h> +#include <linux/rcupdate.h> +#include <linux/timer.h> /* - * Upper 3 bits can be used elsewhere + * from upper: + * 3 bits: reserved for other usage + * 12 bits: size + * 49 bits: time */ #define BLK_STAT_RES_BITS 3 -#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS) -#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1) -#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK +#define BLK_STAT_SIZE_BITS 12 +#define BLK_STAT_RES_SHIFT (64 - BLK_STAT_RES_BITS) +#define BLK_STAT_SIZE_SHIFT (BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS) +#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SIZE_SHIFT) - 1) +#define BLK_STAT_SIZE_MASK \ + (((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT) +#define BLK_STAT_RES_MASK (~((1ULL << BLK_STAT_RES_SHIFT) - 1)) + +/** + * struct blk_stat_callback - Block statistics callback. + * + * A &struct blk_stat_callback is associated with a &struct request_queue. While + * @timer is active, that queue's request completion latencies are sorted into + * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the + * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked. + */ +struct blk_stat_callback { + /* + * @list: RCU list of callbacks for a &struct request_queue. + */ + struct list_head list; + + /** + * @timer: Timer for the next callback invocation. + */ + struct timer_list timer; + + /** + * @cpu_stat: Per-cpu statistics buckets. + */ + struct blk_rq_stat __percpu *cpu_stat; -enum { - BLK_STAT_READ = 0, - BLK_STAT_WRITE, + /** + * @bucket_fn: Given a request, returns which statistics bucket it + * should be accounted under. + */ + unsigned int (*bucket_fn)(const struct request *); + + /** + * @buckets: Number of statistics buckets. + */ + unsigned int buckets; + + /** + * @stat: Array of statistics buckets. 
+ */ + struct blk_rq_stat *stat; + + /** + * @fn: Callback function. + */ + void (*timer_fn)(struct blk_stat_callback *); + + /** + * @data: Private pointer for the user. + */ + void *data; + + struct rcu_head rcu; }; -void blk_stat_add(struct blk_rq_stat *, struct request *); -void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); -void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); -void blk_stat_clear(struct request_queue *); -void blk_stat_init(struct blk_rq_stat *); -bool blk_stat_is_current(struct blk_rq_stat *); -void blk_stat_set_issue_time(struct blk_issue_stat *); -bool blk_stat_enable(struct request_queue *); +struct blk_queue_stats *blk_alloc_queue_stats(void); +void blk_free_queue_stats(struct blk_queue_stats *); + +void blk_stat_add(struct request *); static inline u64 __blk_stat_time(u64 time) { @@ -36,7 +87,128 @@ static inline u64 __blk_stat_time(u64 time) static inline u64 blk_stat_time(struct blk_issue_stat *stat) { - return __blk_stat_time(stat->time); + return __blk_stat_time(stat->stat); +} + +static inline sector_t blk_capped_size(sector_t size) +{ + return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1); +} + +static inline sector_t blk_stat_size(struct blk_issue_stat *stat) +{ + return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT; +} + +static inline void blk_stat_set_issue(struct blk_issue_stat *stat, + sector_t size) +{ + stat->stat = (stat->stat & BLK_STAT_RES_MASK) | + (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) | + (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT); +} + +/* record time/size info in request but not add a callback */ +void blk_stat_enable_accounting(struct request_queue *q); + +/* + * blk_stat_rq_ddir() - Bucket callback function for the request data direction. + * @rq: Request. + * + * This is the same as rq_data_dir() but as a function so it can be used as + * @bucket_fn for blk_stat_alloc_callback(). + * + * Return: Data direction of the request, either READ or WRITE. + */ +unsigned int blk_stat_rq_ddir(const struct request *rq); + +/** + * blk_stat_alloc_callback() - Allocate a block statistics callback. + * @timer_fn: Timer callback function. + * @bucket_fn: Bucket callback function. + * @buckets: Number of statistics buckets. + * @data: Value for the @data field of the &struct blk_stat_callback. + * + * See &struct blk_stat_callback for details on the callback functions. + * + * Return: &struct blk_stat_callback on success or NULL on ENOMEM. + */ +struct blk_stat_callback * +blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), + unsigned int (*bucket_fn)(const struct request *), + unsigned int buckets, void *data); + +/** + * blk_stat_add_callback() - Add a block statistics callback to be run on a + * request queue. + * @q: The request queue. + * @cb: The callback. + * + * Note that a single &struct blk_stat_callback can only be added to a single + * &struct request_queue. + */ +void blk_stat_add_callback(struct request_queue *q, + struct blk_stat_callback *cb); + +/** + * blk_stat_remove_callback() - Remove a block statistics callback from a + * request queue. + * @q: The request queue. + * @cb: The callback. + * + * When this returns, the callback is not running on any CPUs and will not be + * called again unless readded. + */ +void blk_stat_remove_callback(struct request_queue *q, + struct blk_stat_callback *cb); + +/** + * blk_stat_free_callback() - Free a block statistics callback. + * @cb: The callback. + * + * @cb may be NULL, in which case this does nothing. 
If it is not NULL, @cb must + * not be associated with a request queue. I.e., if it was previously added with + * blk_stat_add_callback(), it must also have been removed since then with + * blk_stat_remove_callback(). + */ +void blk_stat_free_callback(struct blk_stat_callback *cb); + +/** + * blk_stat_is_active() - Check if a block statistics callback is currently + * gathering statistics. + * @cb: The callback. + */ +static inline bool blk_stat_is_active(struct blk_stat_callback *cb) +{ + return timer_pending(&cb->timer); +} + +/** + * blk_stat_activate_nsecs() - Gather block statistics during a time window in + * nanoseconds. + * @cb: The callback. + * @nsecs: Number of nanoseconds to gather statistics for. + * + * The timer callback will be called when the window expires. + */ +static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb, + u64 nsecs) +{ + mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs)); +} + +/** + * blk_stat_activate_msecs() - Gather block statistics during a time window in + * milliseconds. + * @cb: The callback. + * @msecs: Number of milliseconds to gather statistics for. + * + * The timer callback will be called when the window expires. + */ +static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb, + unsigned int msecs) +{ + mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs)); } #endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 37f0b3ad635e..c47db43a40cc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -503,26 +503,6 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_dax(q), page); } -static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) -{ - return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", - pre, (long long) stat->nr_samples, - (long long) stat->mean, (long long) stat->min, - (long long) stat->max); -} - -static ssize_t queue_stats_show(struct request_queue *q, char *page) -{ - struct blk_rq_stat stat[2]; - ssize_t ret; - - blk_queue_stat_get(q, stat); - - ret = print_stat(page, &stat[BLK_STAT_READ], "read :"); - ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:"); - return ret; -} - static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -691,17 +671,20 @@ static struct queue_sysfs_entry queue_dax_entry = { .show = queue_dax_show, }; -static struct queue_sysfs_entry queue_stats_entry = { - .attr = {.name = "stats", .mode = S_IRUGO }, - .show = queue_stats_show, -}; - static struct queue_sysfs_entry queue_wb_lat_entry = { .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, .show = queue_wb_lat_show, .store = queue_wb_lat_store, }; +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +static struct queue_sysfs_entry throtl_sample_time_entry = { + .attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR }, + .show = blk_throtl_sample_time_show, + .store = blk_throtl_sample_time_store, +}; +#endif + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -733,9 +716,11 @@ static struct attribute *default_attrs[] = { &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_dax_entry.attr, - &queue_stats_entry.attr, &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + &throtl_sample_time_entry.attr, +#endif NULL, }; @@ -810,7 +795,9 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = 
container_of(kobj, struct request_queue, kobj); - wbt_exit(q); + if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + blk_stat_remove_callback(q, q->poll_cb); + blk_stat_free_callback(q->poll_cb); bdi_put(q->backing_dev_info); blkcg_exit_queue(q); @@ -819,6 +806,8 @@ static void blk_release_queue(struct kobject *kobj) elevator_exit(q, q->elevator); } + blk_free_queue_stats(q->stats); + blk_exit_rl(&q->root_rl); if (q->queue_tags) @@ -881,6 +870,11 @@ int blk_register_queue(struct gendisk *disk) if (WARN_ON(!q)) return -ENXIO; + WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), + "%s is registering an already registered queue\n", + kobject_name(&dev->kobj)); + queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q); + /* * SCSI probing may synchronously create and destroy a lot of * request_queues for non-existent devices. Shutting down a fully @@ -916,6 +910,8 @@ int blk_register_queue(struct gendisk *disk) blk_wb_init(q); + blk_throtl_register_queue(q); + if (q->request_fn || (q->mq_ops && q->elevator)) { ret = elv_register_queue(q); if (ret) { @@ -939,6 +935,11 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; + queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); + + wbt_exit(q); + + if (q->mq_ops) blk_mq_unregister_dev(disk_to_dev(disk), q); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8fab716e4059..c82bf9b1fe72 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -18,8 +18,17 @@ static int throtl_grp_quantum = 8; /* Total max dispatch from all groups in one round */ static int throtl_quantum = 32; -/* Throttling is performed over 100ms slice and after that slice is renewed */ -static unsigned long throtl_slice = HZ/10; /* 100 ms */ +/* Throttling is performed over a slice and after that slice is renewed */ +#define DFL_THROTL_SLICE_HD (HZ / 10) +#define DFL_THROTL_SLICE_SSD (HZ / 50) +#define MAX_THROTL_SLICE (HZ) +#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */ +#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */ +#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ +/* default latency target is 0, eg, guarantee IO latency by default */ +#define DFL_LATENCY_TARGET (0) + +#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) static struct blkcg_policy blkcg_policy_throtl; @@ -83,6 +92,12 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) +enum { + LIMIT_LOW, + LIMIT_MAX, + LIMIT_CNT, +}; + struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -119,20 +134,54 @@ struct throtl_grp { /* are there any throtl rules between this group and td? 
*/ bool has_rules[2]; - /* bytes per second rate limits */ - uint64_t bps[2]; + /* internally used bytes per second rate limits */ + uint64_t bps[2][LIMIT_CNT]; + /* user configured bps limits */ + uint64_t bps_conf[2][LIMIT_CNT]; - /* IOPS limits */ - unsigned int iops[2]; + /* internally used IOPS limits */ + unsigned int iops[2][LIMIT_CNT]; + /* user configured IOPS limits */ + unsigned int iops_conf[2][LIMIT_CNT]; /* Number of bytes disptached in current slice */ uint64_t bytes_disp[2]; /* Number of bio's dispatched in current slice */ unsigned int io_disp[2]; + unsigned long last_low_overflow_time[2]; + + uint64_t last_bytes_disp[2]; + unsigned int last_io_disp[2]; + + unsigned long last_check_time; + + unsigned long latency_target; /* us */ /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; + + unsigned long last_finish_time; /* ns / 1024 */ + unsigned long checked_last_finish_time; /* ns / 1024 */ + unsigned long avg_idletime; /* ns / 1024 */ + unsigned long idletime_threshold; /* us */ + + unsigned int bio_cnt; /* total bios */ + unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ + unsigned long bio_cnt_reset_time; +}; + +/* We measure latency for request size from <= 4k to >= 1M */ +#define LATENCY_BUCKET_SIZE 9 + +struct latency_bucket { + unsigned long total_latency; /* ns / 1024 */ + int samples; +}; + +struct avg_latency_bucket { + unsigned long latency; /* ns / 1024 */ + bool valid; }; struct throtl_data @@ -145,8 +194,26 @@ struct throtl_data /* Total Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; + unsigned int throtl_slice; + /* Work for dispatching throttled bios */ struct work_struct dispatch_work; + unsigned int limit_index; + bool limit_valid[LIMIT_CNT]; + + unsigned long dft_idletime_threshold; /* us */ + + unsigned long low_upgrade_time; + unsigned long low_downgrade_time; + + unsigned int scale; + + struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; + struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; + struct latency_bucket __percpu *latency_buckets; + unsigned long last_calculate_time; + + bool track_bio_latency; }; static void throtl_pending_timer_fn(unsigned long arg); @@ -198,6 +265,76 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) return container_of(sq, struct throtl_data, service_queue); } +/* + * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to + * make the IO dispatch more smooth. + * Scale up: linearly scale up according to lapsed time since upgrade. 
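The scale-up rule implemented by throtl_adjusted_limit() just below is easier to see with concrete numbers. The following is a small standalone userspace sketch, not kernel code: "ticks" stands in for jiffies, slice_ticks for td->throtl_slice, and all values are made up. It shows the effective limit growing by half of the .low value for every elapsed slice until it is clamped to .max, with the same arbitrary 4096 cap on the scale.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the scale-up arithmetic: every slice elapsed since the last
 * upgrade adds one to the scale, and the effective limit is
 * low + (low / 2) * scale, clamped to the configured max.
 */
static uint64_t adjusted_limit(uint64_t low, uint64_t max,
                               unsigned long now, unsigned long upgrade_time,
                               unsigned long slice_ticks)
{
        unsigned long scale = (now - upgrade_time) / slice_ticks;
        uint64_t limit;

        if (scale > 4096)               /* same arbitrary cap as the kernel side */
                scale = 4096;
        limit = low + (low >> 1) * scale;
        return limit < max ? limit : max;
}

int main(void)
{
        uint64_t low = 10 * 1024 * 1024;        /* 10 MB/s .low limit */
        uint64_t max = 100 * 1024 * 1024;       /* 100 MB/s .max limit */
        unsigned long slice = 20;               /* one throttle slice, in ticks */
        unsigned long now;

        for (now = 0; now <= 20 * slice; now += 4 * slice)
                printf("t=%3lu ticks -> limit=%llu bytes/s\n", now,
                       (unsigned long long)adjusted_limit(low, max, now, 0, slice));
        return 0;
}

On a downgrade the kernel halves td->scale and backdates low_upgrade_time accordingly, so the next ramp-up resumes from roughly the halved scale rather than starting over from zero.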
For + * every throtl_slice, the limit scales up 1/2 .low limit till the + * limit hits .max limit + * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit + */ +static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td) +{ + /* arbitrary value to avoid too big scale */ + if (td->scale < 4096 && time_after_eq(jiffies, + td->low_upgrade_time + td->scale * td->throtl_slice)) + td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice; + + return low + (low >> 1) * td->scale; +} + +static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) +{ + struct blkcg_gq *blkg = tg_to_blkg(tg); + struct throtl_data *td; + uint64_t ret; + + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) + return U64_MAX; + + td = tg->td; + ret = tg->bps[rw][td->limit_index]; + if (ret == 0 && td->limit_index == LIMIT_LOW) + return tg->bps[rw][LIMIT_MAX]; + + if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && + tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { + uint64_t adjusted; + + adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td); + ret = min(tg->bps[rw][LIMIT_MAX], adjusted); + } + return ret; +} + +static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) +{ + struct blkcg_gq *blkg = tg_to_blkg(tg); + struct throtl_data *td; + unsigned int ret; + + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) + return UINT_MAX; + td = tg->td; + ret = tg->iops[rw][td->limit_index]; + if (ret == 0 && tg->td->limit_index == LIMIT_LOW) + return tg->iops[rw][LIMIT_MAX]; + + if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && + tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { + uint64_t adjusted; + + adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td); + if (adjusted > UINT_MAX) + adjusted = UINT_MAX; + ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted); + } + return ret; +} + +#define request_bucket_index(sectors) \ + clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1) + /** * throtl_log - log debug message via blktrace * @sq: the service_queue being reported @@ -334,10 +471,17 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) } RB_CLEAR_NODE(&tg->rb_node); - tg->bps[READ] = -1; - tg->bps[WRITE] = -1; - tg->iops[READ] = -1; - tg->iops[WRITE] = -1; + tg->bps[READ][LIMIT_MAX] = U64_MAX; + tg->bps[WRITE][LIMIT_MAX] = U64_MAX; + tg->iops[READ][LIMIT_MAX] = UINT_MAX; + tg->iops[WRITE][LIMIT_MAX] = UINT_MAX; + tg->bps_conf[READ][LIMIT_MAX] = U64_MAX; + tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; + tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; + tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; + /* LIMIT_LOW will have default value 0 */ + + tg->latency_target = DFL_LATENCY_TARGET; return &tg->pd; } @@ -366,6 +510,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd) if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; + + tg->idletime_threshold = td->dft_idletime_threshold; } /* @@ -376,20 +522,59 @@ static void throtl_pd_init(struct blkg_policy_data *pd) static void tg_update_has_rules(struct throtl_grp *tg) { struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); + struct throtl_data *td = tg->td; int rw; for (rw = READ; rw <= WRITE; rw++) tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || - (tg->bps[rw] != -1 || tg->iops[rw] != -1); + (td->limit_valid[td->limit_index] && + (tg_bps_limit(tg, rw) != U64_MAX || + tg_iops_limit(tg, rw) != UINT_MAX)); } static void 
throtl_pd_online(struct blkg_policy_data *pd) { + struct throtl_grp *tg = pd_to_tg(pd); /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ - tg_update_has_rules(pd_to_tg(pd)); + tg_update_has_rules(tg); +} + +static void blk_throtl_update_limit_valid(struct throtl_data *td) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + bool low_valid = false; + + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + + if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || + tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) + low_valid = true; + } + rcu_read_unlock(); + + td->limit_valid[LIMIT_LOW] = low_valid; +} + +static void throtl_upgrade_state(struct throtl_data *td); +static void throtl_pd_offline(struct blkg_policy_data *pd) +{ + struct throtl_grp *tg = pd_to_tg(pd); + + tg->bps[READ][LIMIT_LOW] = 0; + tg->bps[WRITE][LIMIT_LOW] = 0; + tg->iops[READ][LIMIT_LOW] = 0; + tg->iops[WRITE][LIMIT_LOW] = 0; + + blk_throtl_update_limit_valid(tg->td); + + if (!tg->td->limit_valid[tg->td->limit_index]) + throtl_upgrade_state(tg->td); } static void throtl_pd_free(struct blkg_policy_data *pd) @@ -499,6 +684,17 @@ static void throtl_dequeue_tg(struct throtl_grp *tg) static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, unsigned long expires) { + unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice; + + /* + * Since we are adjusting the throttle limit dynamically, the sleep + * time calculated according to previous limit might be invalid. It's + * possible the cgroup sleep time is very long and no other cgroups + * have IO running so notify the limit changes. Make sure the cgroup + * doesn't sleep too long to avoid the missed notification. + */ + if (time_after(expires, max_expire)) + expires = max_expire; mod_timer(&sq->pending_timer, expires); throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", expires - jiffies, jiffies); @@ -556,7 +752,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, if (time_after_eq(start, tg->slice_start[rw])) tg->slice_start[rw] = start; - tg->slice_end[rw] = jiffies + throtl_slice; + tg->slice_end[rw] = jiffies + tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -568,7 +764,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; - tg->slice_end[rw] = jiffies + throtl_slice; + tg->slice_end[rw] = jiffies + tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -578,13 +774,13 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { - tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); + tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); } static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { - tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); + tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); throtl_log(&tg->service_queue, "[%c] extend slice start=%lu end=%lu jiffies=%lu", rw == READ ? 
'R' : 'W', tg->slice_start[rw], @@ -624,19 +820,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(tg, rw, jiffies + throtl_slice); + throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); time_elapsed = jiffies - tg->slice_start[rw]; - nr_slices = time_elapsed / throtl_slice; + nr_slices = time_elapsed / tg->td->throtl_slice; if (!nr_slices) return; - tmp = tg->bps[rw] * throtl_slice * nr_slices; + tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices; do_div(tmp, HZ); bytes_trim = tmp; - io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; + io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) / + HZ; if (!bytes_trim && !io_trim) return; @@ -651,7 +848,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) else tg->io_disp[rw] = 0; - tg->slice_start[rw] += nr_slices * throtl_slice; + tg->slice_start[rw] += nr_slices * tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", @@ -671,9 +868,9 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, /* Slice has just started. Consider one slice interval */ if (!jiffy_elapsed) - jiffy_elapsed_rnd = throtl_slice; + jiffy_elapsed_rnd = tg->td->throtl_slice; - jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); /* * jiffy_elapsed_rnd should not be a big value as minimum iops can be @@ -682,7 +879,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, * have been trimmed. */ - tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; + tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd; do_div(tmp, HZ); if (tmp > UINT_MAX) @@ -697,7 +894,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, } /* Calc approx time to dispatch */ - jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; + jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1; if (jiffy_wait > jiffy_elapsed) jiffy_wait = jiffy_wait - jiffy_elapsed; @@ -720,11 +917,11 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, /* Slice has just started. 
Consider one slice interval */ if (!jiffy_elapsed) - jiffy_elapsed_rnd = throtl_slice; + jiffy_elapsed_rnd = tg->td->throtl_slice; - jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - tmp = tg->bps[rw] * jiffy_elapsed_rnd; + tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd; do_div(tmp, HZ); bytes_allowed = tmp; @@ -736,7 +933,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, /* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed; - jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); + jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw)); if (!jiffy_wait) jiffy_wait = 1; @@ -771,7 +968,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ - if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { + if (tg_bps_limit(tg, rw) == U64_MAX && + tg_iops_limit(tg, rw) == UINT_MAX) { if (wait) *wait = 0; return true; @@ -787,8 +985,10 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) throtl_start_new_slice(tg, rw); else { - if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) - throtl_extend_slice(tg, rw, jiffies + throtl_slice); + if (time_before(tg->slice_end[rw], + jiffies + tg->td->throtl_slice)) + throtl_extend_slice(tg, rw, + jiffies + tg->td->throtl_slice); } if (tg_with_in_bps_limit(tg, bio, &bps_wait) && @@ -816,6 +1016,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) /* Charge the bio to the group */ tg->bytes_disp[rw] += bio->bi_iter.bi_size; tg->io_disp[rw]++; + tg->last_bytes_disp[rw] += bio->bi_iter.bi_size; + tg->last_io_disp[rw]++; /* * BIO_THROTTLED is used to prevent the same bio to be throttled @@ -999,6 +1201,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) return nr_disp; } +static bool throtl_can_upgrade(struct throtl_data *td, + struct throtl_grp *this_tg); /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @arg: the throtl_service_queue being serviced @@ -1025,6 +1229,9 @@ static void throtl_pending_timer_fn(unsigned long arg) int ret; spin_lock_irq(q->queue_lock); + if (throtl_can_upgrade(td, NULL)) + throtl_upgrade_state(td); + again: parent_sq = sq->parent_sq; dispatched = false; @@ -1112,7 +1319,7 @@ static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, struct throtl_grp *tg = pd_to_tg(pd); u64 v = *(u64 *)((void *)tg + off); - if (v == -1) + if (v == U64_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } @@ -1123,7 +1330,7 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, struct throtl_grp *tg = pd_to_tg(pd); unsigned int v = *(unsigned int *)((void *)tg + off); - if (v == -1) + if (v == UINT_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } @@ -1150,8 +1357,8 @@ static void tg_conf_updated(struct throtl_grp *tg) throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", - tg->bps[READ], tg->bps[WRITE], - tg->iops[READ], tg->iops[WRITE]); + tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), + tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); /* * Update has_rules[] flags for the updated tg's subtree. 
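The slice accounting used by tg_with_in_bps_limit() and tg_with_in_iops_limit() above boils down to comparing what has been dispatched against the budget the limit allows for the elapsed part of the slice. Below is a minimal userspace sketch of the bps side only; HZ and jiffies are replaced by plain tick counts, the rounding of the elapsed time to slice multiples is omitted, and the names and numbers are illustrative rather than kernel code.

#include <stdint.h>
#include <stdio.h>

#define TICKS_PER_SEC 1000      /* stand-in for HZ in this sketch */

/*
 * Decide whether a bio of bio_bytes may be dispatched in the current
 * slice, given the bytes already dispatched, the elapsed slice ticks and
 * the bps limit.  On failure, *wait_ticks gets an estimate of how long
 * the group has to wait before the bio fits in the budget.
 */
static int within_bps_limit(uint64_t bps_limit, uint64_t bytes_disp,
                            uint64_t bio_bytes, unsigned long elapsed_ticks,
                            unsigned long *wait_ticks)
{
        uint64_t bytes_allowed = bps_limit * elapsed_ticks / TICKS_PER_SEC;
        uint64_t extra;

        if (bytes_disp + bio_bytes <= bytes_allowed) {
                *wait_ticks = 0;
                return 1;
        }

        /* Approximate wait until enough budget accumulates for this bio. */
        extra = bytes_disp + bio_bytes - bytes_allowed;
        *wait_ticks = (unsigned long)(extra * TICKS_PER_SEC / bps_limit);
        if (!*wait_ticks)
                *wait_ticks = 1;
        return 0;
}

int main(void)
{
        unsigned long wait;
        int ok;

        /* 1 MB/s limit, 768 KB already dispatched, 512 KB bio, 900 ticks in */
        ok = within_bps_limit(1024 * 1024, 768 * 1024, 512 * 1024, 900, &wait);
        printf("dispatch=%d wait=%lu ticks\n", ok, wait);
        return 0;
}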
A tg is @@ -1197,7 +1404,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, if (sscanf(ctx.body, "%llu", &v) != 1) goto out_finish; if (!v) - v = -1; + v = U64_MAX; tg = blkg_to_tg(ctx.blkg); @@ -1228,25 +1435,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", - .private = offsetof(struct throtl_grp, bps[READ]), + .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", - .private = offsetof(struct throtl_grp, bps[WRITE]), + .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", - .private = offsetof(struct throtl_grp, iops[READ]), + .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", - .private = offsetof(struct throtl_grp, iops[WRITE]), + .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, @@ -1263,48 +1470,87 @@ static struct cftype throtl_legacy_files[] = { { } /* terminate */ }; -static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, +static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); const char *dname = blkg_dev_name(pd->blkg); char bufs[4][21] = { "max", "max", "max", "max" }; + u64 bps_dft; + unsigned int iops_dft; + char idle_time[26] = ""; + char latency_time[26] = ""; if (!dname) return 0; - if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && - tg->iops[READ] == -1 && tg->iops[WRITE] == -1) + + if (off == LIMIT_LOW) { + bps_dft = 0; + iops_dft = 0; + } else { + bps_dft = U64_MAX; + iops_dft = UINT_MAX; + } + + if (tg->bps_conf[READ][off] == bps_dft && + tg->bps_conf[WRITE][off] == bps_dft && + tg->iops_conf[READ][off] == iops_dft && + tg->iops_conf[WRITE][off] == iops_dft && + (off != LIMIT_LOW || + (tg->idletime_threshold == tg->td->dft_idletime_threshold && + tg->latency_target == DFL_LATENCY_TARGET))) return 0; - if (tg->bps[READ] != -1) - snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); - if (tg->bps[WRITE] != -1) - snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); - if (tg->iops[READ] != -1) - snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); - if (tg->iops[WRITE] != -1) - snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); - - seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", - dname, bufs[0], bufs[1], bufs[2], bufs[3]); + if (tg->bps_conf[READ][off] != bps_dft) + snprintf(bufs[0], sizeof(bufs[0]), "%llu", + tg->bps_conf[READ][off]); + if (tg->bps_conf[WRITE][off] != bps_dft) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", + tg->bps_conf[WRITE][off]); + if (tg->iops_conf[READ][off] != iops_dft) + snprintf(bufs[2], sizeof(bufs[2]), "%u", + tg->iops_conf[READ][off]); + if (tg->iops_conf[WRITE][off] != iops_dft) + snprintf(bufs[3], sizeof(bufs[3]), "%u", + tg->iops_conf[WRITE][off]); + if (off == LIMIT_LOW) { + if (tg->idletime_threshold == ULONG_MAX) + strcpy(idle_time, " idle=max"); + else + snprintf(idle_time, sizeof(idle_time), " idle=%lu", + tg->idletime_threshold); + + if (tg->latency_target == ULONG_MAX) + strcpy(latency_time, " latency=max"); + else + snprintf(latency_time, sizeof(latency_time), + " latency=%lu", tg->latency_target); + } 
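The formatting above, together with the token parsing in tg_set_limit() below, defines the key=value line format of the cgroup files: rbps/wbps/riops/wiops plus, for the .low file, idle= and latency= values in microseconds, with "max" marking the unlimited default. A rough userspace sketch of how one such line is rendered follows; the 8:16 device and all values are hypothetical.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Render a u64 limit the way the interface does: "max" means unlimited. */
static void fmt_limit(char *buf, size_t len, uint64_t v)
{
        if (v == UINT64_MAX)
                snprintf(buf, len, "max");
        else
                snprintf(buf, len, "%" PRIu64, v);
}

int main(void)
{
        char rbps[21], wbps[21], riops[21], wiops[21];

        /* Hypothetical settings for device 8:16: read bps capped, rest unlimited. */
        fmt_limit(rbps, sizeof(rbps), 2 * 1024 * 1024ULL);
        fmt_limit(wbps, sizeof(wbps), UINT64_MAX);
        fmt_limit(riops, sizeof(riops), UINT64_MAX);
        fmt_limit(wiops, sizeof(wiops), UINT64_MAX);

        /* The .low file additionally carries idle= and latency= (both in us). */
        printf("8:16 rbps=%s wbps=%s riops=%s wiops=%s idle=1000 latency=20\n",
               rbps, wbps, riops, wiops);
        return 0;
}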
+ + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time, + latency_time); return 0; } -static int tg_print_max(struct seq_file *sf, void *v) +static int tg_print_limit(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } -static ssize_t tg_set_max(struct kernfs_open_file *of, +static ssize_t tg_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; + unsigned long idle_time; + unsigned long latency_time; int ret; + int index = of_cft(of)->private; ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); if (ret) @@ -1312,15 +1558,17 @@ static ssize_t tg_set_max(struct kernfs_open_file *of, tg = blkg_to_tg(ctx.blkg); - v[0] = tg->bps[READ]; - v[1] = tg->bps[WRITE]; - v[2] = tg->iops[READ]; - v[3] = tg->iops[WRITE]; + v[0] = tg->bps_conf[READ][index]; + v[1] = tg->bps_conf[WRITE][index]; + v[2] = tg->iops_conf[READ][index]; + v[3] = tg->iops_conf[WRITE][index]; + idle_time = tg->idletime_threshold; + latency_time = tg->latency_target; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; - u64 val = -1; + u64 val = U64_MAX; int len; if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) @@ -1348,15 +1596,43 @@ static ssize_t tg_set_max(struct kernfs_open_file *of, v[2] = min_t(u64, val, UINT_MAX); else if (!strcmp(tok, "wiops")) v[3] = min_t(u64, val, UINT_MAX); + else if (off == LIMIT_LOW && !strcmp(tok, "idle")) + idle_time = val; + else if (off == LIMIT_LOW && !strcmp(tok, "latency")) + latency_time = val; else goto out_finish; } - tg->bps[READ] = v[0]; - tg->bps[WRITE] = v[1]; - tg->iops[READ] = v[2]; - tg->iops[WRITE] = v[3]; + tg->bps_conf[READ][index] = v[0]; + tg->bps_conf[WRITE][index] = v[1]; + tg->iops_conf[READ][index] = v[2]; + tg->iops_conf[WRITE][index] = v[3]; + if (index == LIMIT_MAX) { + tg->bps[READ][index] = v[0]; + tg->bps[WRITE][index] = v[1]; + tg->iops[READ][index] = v[2]; + tg->iops[WRITE][index] = v[3]; + } + tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW], + tg->bps_conf[READ][LIMIT_MAX]); + tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW], + tg->bps_conf[WRITE][LIMIT_MAX]); + tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW], + tg->iops_conf[READ][LIMIT_MAX]); + tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], + tg->iops_conf[WRITE][LIMIT_MAX]); + + if (index == LIMIT_LOW) { + blk_throtl_update_limit_valid(tg->td); + if (tg->td->limit_valid[LIMIT_LOW]) + tg->td->limit_index = LIMIT_LOW; + tg->idletime_threshold = (idle_time == ULONG_MAX) ? + ULONG_MAX : idle_time; + tg->latency_target = (latency_time == ULONG_MAX) ? 
+ ULONG_MAX : latency_time; + } tg_conf_updated(tg); ret = 0; out_finish: @@ -1365,11 +1641,21 @@ out_finish: } static struct cftype throtl_files[] = { +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_limit, + .write = tg_set_limit, + .private = LIMIT_LOW, + }, +#endif { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = tg_print_max, - .write = tg_set_max, + .seq_show = tg_print_limit, + .write = tg_set_limit, + .private = LIMIT_MAX, }, { } /* terminate */ }; @@ -1388,9 +1674,362 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, + .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, }; +static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) +{ + unsigned long rtime = jiffies, wtime = jiffies; + + if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]) + rtime = tg->last_low_overflow_time[READ]; + if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) + wtime = tg->last_low_overflow_time[WRITE]; + return min(rtime, wtime); +} + +/* tg should not be an intermediate node */ +static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) +{ + struct throtl_service_queue *parent_sq; + struct throtl_grp *parent = tg; + unsigned long ret = __tg_last_low_overflow_time(tg); + + while (true) { + parent_sq = parent->service_queue.parent_sq; + parent = sq_to_tg(parent_sq); + if (!parent) + break; + + /* + * The parent doesn't have low limit, it always reaches low + * limit. Its overflow time is useless for children + */ + if (!parent->bps[READ][LIMIT_LOW] && + !parent->iops[READ][LIMIT_LOW] && + !parent->bps[WRITE][LIMIT_LOW] && + !parent->iops[WRITE][LIMIT_LOW]) + continue; + if (time_after(__tg_last_low_overflow_time(parent), ret)) + ret = __tg_last_low_overflow_time(parent); + } + return ret; +} + +static bool throtl_tg_is_idle(struct throtl_grp *tg) +{ + /* + * cgroup is idle if: + * - single idle is too long, longer than a fixed value (in case user + * configure a too big threshold) or 4 times of slice + * - average think time is more than threshold + * - IO latency is largely below threshold + */ + unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice); + + time = min_t(unsigned long, MAX_IDLE_TIME, time); + return (ktime_get_ns() >> 10) - tg->last_finish_time > time || + tg->avg_idletime > tg->idletime_threshold || + (tg->latency_target && tg->bio_cnt && + tg->bad_bio_cnt * 5 < tg->bio_cnt); +} + +static bool throtl_tg_can_upgrade(struct throtl_grp *tg) +{ + struct throtl_service_queue *sq = &tg->service_queue; + bool read_limit, write_limit; + + /* + * if cgroup reaches low limit (if low limit is 0, the cgroup always + * reaches), it's ok to upgrade to next limit + */ + read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]; + write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]; + if (!read_limit && !write_limit) + return true; + if (read_limit && sq->nr_queued[READ] && + (!write_limit || sq->nr_queued[WRITE])) + return true; + if (write_limit && sq->nr_queued[WRITE] && + (!read_limit || sq->nr_queued[READ])) + return true; + + if (time_after_eq(jiffies, + tg_last_low_overflow_time(tg) + tg->td->throtl_slice) && + throtl_tg_is_idle(tg)) + return true; + return false; +} + +static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) +{ + while (true) { + if (throtl_tg_can_upgrade(tg)) + return true; + tg = 
sq_to_tg(tg->service_queue.parent_sq); + if (!tg || !tg_to_blkg(tg)->parent) + return false; + } + return false; +} + +static bool throtl_can_upgrade(struct throtl_data *td, + struct throtl_grp *this_tg) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + if (td->limit_index != LIMIT_LOW) + return false; + + if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice)) + return false; + + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + + if (tg == this_tg) + continue; + if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) + continue; + if (!throtl_hierarchy_can_upgrade(tg)) { + rcu_read_unlock(); + return false; + } + } + rcu_read_unlock(); + return true; +} + +static void throtl_upgrade_check(struct throtl_grp *tg) +{ + unsigned long now = jiffies; + + if (tg->td->limit_index != LIMIT_LOW) + return; + + if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) + return; + + tg->last_check_time = now; + + if (!time_after_eq(now, + __tg_last_low_overflow_time(tg) + tg->td->throtl_slice)) + return; + + if (throtl_can_upgrade(tg->td, NULL)) + throtl_upgrade_state(tg->td); +} + +static void throtl_upgrade_state(struct throtl_data *td) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + td->limit_index = LIMIT_MAX; + td->low_upgrade_time = jiffies; + td->scale = 0; + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *sq = &tg->service_queue; + + tg->disptime = jiffies - 1; + throtl_select_dispatch(sq); + throtl_schedule_next_dispatch(sq, false); + } + rcu_read_unlock(); + throtl_select_dispatch(&td->service_queue); + throtl_schedule_next_dispatch(&td->service_queue, false); + queue_work(kthrotld_workqueue, &td->dispatch_work); +} + +static void throtl_downgrade_state(struct throtl_data *td, int new) +{ + td->scale /= 2; + + if (td->scale) { + td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; + return; + } + + td->limit_index = new; + td->low_downgrade_time = jiffies; +} + +static bool throtl_tg_can_downgrade(struct throtl_grp *tg) +{ + struct throtl_data *td = tg->td; + unsigned long now = jiffies; + + /* + * If cgroup is below low limit, consider downgrade and throttle other + * cgroups + */ + if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) && + time_after_eq(now, tg_last_low_overflow_time(tg) + + td->throtl_slice) && + (!throtl_tg_is_idle(tg) || + !list_empty(&tg_to_blkg(tg)->blkcg->css.children))) + return true; + return false; +} + +static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) +{ + while (true) { + if (!throtl_tg_can_downgrade(tg)) + return false; + tg = sq_to_tg(tg->service_queue.parent_sq); + if (!tg || !tg_to_blkg(tg)->parent) + break; + } + return true; +} + +static void throtl_downgrade_check(struct throtl_grp *tg) +{ + uint64_t bps; + unsigned int iops; + unsigned long elapsed_time; + unsigned long now = jiffies; + + if (tg->td->limit_index != LIMIT_MAX || + !tg->td->limit_valid[LIMIT_LOW]) + return; + if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) + return; + if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) + return; + + elapsed_time = now - tg->last_check_time; + tg->last_check_time = now; + + if (time_before(now, tg_last_low_overflow_time(tg) + + tg->td->throtl_slice)) + return; + + if (tg->bps[READ][LIMIT_LOW]) { + bps = tg->last_bytes_disp[READ] * HZ; + 
do_div(bps, elapsed_time); + if (bps >= tg->bps[READ][LIMIT_LOW]) + tg->last_low_overflow_time[READ] = now; + } + + if (tg->bps[WRITE][LIMIT_LOW]) { + bps = tg->last_bytes_disp[WRITE] * HZ; + do_div(bps, elapsed_time); + if (bps >= tg->bps[WRITE][LIMIT_LOW]) + tg->last_low_overflow_time[WRITE] = now; + } + + if (tg->iops[READ][LIMIT_LOW]) { + iops = tg->last_io_disp[READ] * HZ / elapsed_time; + if (iops >= tg->iops[READ][LIMIT_LOW]) + tg->last_low_overflow_time[READ] = now; + } + + if (tg->iops[WRITE][LIMIT_LOW]) { + iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; + if (iops >= tg->iops[WRITE][LIMIT_LOW]) + tg->last_low_overflow_time[WRITE] = now; + } + + /* + * If cgroup is below low limit, consider downgrade and throttle other + * cgroups + */ + if (throtl_hierarchy_can_downgrade(tg)) + throtl_downgrade_state(tg->td, LIMIT_LOW); + + tg->last_bytes_disp[READ] = 0; + tg->last_bytes_disp[WRITE] = 0; + tg->last_io_disp[READ] = 0; + tg->last_io_disp[WRITE] = 0; +} + +static void blk_throtl_update_idletime(struct throtl_grp *tg) +{ + unsigned long now = ktime_get_ns() >> 10; + unsigned long last_finish_time = tg->last_finish_time; + + if (now <= last_finish_time || last_finish_time == 0 || + last_finish_time == tg->checked_last_finish_time) + return; + + tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3; + tg->checked_last_finish_time = last_finish_time; +} + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +static void throtl_update_latency_buckets(struct throtl_data *td) +{ + struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; + int i, cpu; + unsigned long last_latency = 0; + unsigned long latency; + + if (!blk_queue_nonrot(td->queue)) + return; + if (time_before(jiffies, td->last_calculate_time + HZ)) + return; + td->last_calculate_time = jiffies; + + memset(avg_latency, 0, sizeof(avg_latency)); + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + struct latency_bucket *tmp = &td->tmp_buckets[i]; + + for_each_possible_cpu(cpu) { + struct latency_bucket *bucket; + + /* this isn't race free, but ok in practice */ + bucket = per_cpu_ptr(td->latency_buckets, cpu); + tmp->total_latency += bucket[i].total_latency; + tmp->samples += bucket[i].samples; + bucket[i].total_latency = 0; + bucket[i].samples = 0; + } + + if (tmp->samples >= 32) { + int samples = tmp->samples; + + latency = tmp->total_latency; + + tmp->total_latency = 0; + tmp->samples = 0; + latency /= samples; + if (latency == 0) + continue; + avg_latency[i].latency = latency; + } + } + + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + if (!avg_latency[i].latency) { + if (td->avg_buckets[i].latency < last_latency) + td->avg_buckets[i].latency = last_latency; + continue; + } + + if (!td->avg_buckets[i].valid) + latency = avg_latency[i].latency; + else + latency = (td->avg_buckets[i].latency * 7 + + avg_latency[i].latency) >> 3; + + td->avg_buckets[i].latency = max(latency, last_latency); + td->avg_buckets[i].valid = true; + last_latency = td->avg_buckets[i].latency; + } +} +#else +static inline void throtl_update_latency_buckets(struct throtl_data *td) +{ +} +#endif + bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio) { @@ -1399,6 +2038,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; + struct throtl_data *td = tg->td; + int ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -1408,19 +2049,40 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, 
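Both the think-time tracking in blk_throtl_update_idletime() and the per-bucket latency averaging above smooth their samples with the same 7/8 weighting: the old average keeps seven eighths of its weight and the new sample contributes one eighth. A tiny standalone sketch of that moving average, with made-up sample values and illustrative units, is below.

#include <stdio.h>

/* 7/8 EWMA as used for avg_idletime and the per-bucket latency averages. */
static unsigned long ewma_7_8(unsigned long avg, unsigned long sample)
{
        return (avg * 7 + sample) >> 3;
}

int main(void)
{
        /* Hypothetical idle-time samples, roughly in microseconds. */
        unsigned long samples[] = { 200, 1500, 900, 50, 4000, 100 };
        unsigned long avg = 0;
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                avg = ewma_7_8(avg, samples[i]);
                printf("sample=%-5lu avg=%lu\n", samples[i], avg);
        }
        return 0;
}

A single large sample therefore moves the average only modestly, which keeps one long idle period or one slow bio from flipping a group between the idle and busy classifications.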
spin_lock_irq(q->queue_lock); + throtl_update_latency_buckets(td); + if (unlikely(blk_queue_bypass(q))) goto out_unlock; + ret = bio_associate_current(bio); +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + if (ret == 0 || ret == -EBUSY) + bio->bi_cg_private = tg; + blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); +#endif + blk_throtl_update_idletime(tg); + sq = &tg->service_queue; +again: while (true) { + if (tg->last_low_overflow_time[rw] == 0) + tg->last_low_overflow_time[rw] = jiffies; + throtl_downgrade_check(tg); + throtl_upgrade_check(tg); /* throtl is FIFO - if bios are already queued, should queue */ if (sq->nr_queued[rw]) break; /* if above limits, break to queue */ - if (!tg_may_dispatch(tg, bio, NULL)) + if (!tg_may_dispatch(tg, bio, NULL)) { + tg->last_low_overflow_time[rw] = jiffies; + if (throtl_can_upgrade(td, tg)) { + throtl_upgrade_state(td); + goto again; + } break; + } /* within limits, let's charge and dispatch directly */ throtl_charge_bio(tg, bio); @@ -1453,12 +2115,14 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, /* out-of-limit, queue to @tg */ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W', - tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw], - tg->io_disp[rw], tg->iops[rw], + tg->bytes_disp[rw], bio->bi_iter.bi_size, + tg_bps_limit(tg, rw), + tg->io_disp[rw], tg_iops_limit(tg, rw), sq->nr_queued[READ], sq->nr_queued[WRITE]); - bio_associate_current(bio); - tg->td->nr_queued[rw]++; + tg->last_low_overflow_time[rw] = jiffies; + + td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; @@ -1483,9 +2147,94 @@ out: */ if (!throttled) bio_clear_flag(bio, BIO_THROTTLED); + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + if (throttled || !td->track_bio_latency) + bio->bi_issue_stat.stat |= SKIP_LATENCY; +#endif return throttled; } +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +static void throtl_track_latency(struct throtl_data *td, sector_t size, + int op, unsigned long time) +{ + struct latency_bucket *latency; + int index; + + if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || + !blk_queue_nonrot(td->queue)) + return; + + index = request_bucket_index(size); + + latency = get_cpu_ptr(td->latency_buckets); + latency[index].total_latency += time; + latency[index].samples++; + put_cpu_ptr(td->latency_buckets); +} + +void blk_throtl_stat_add(struct request *rq, u64 time_ns) +{ + struct request_queue *q = rq->q; + struct throtl_data *td = q->td; + + throtl_track_latency(td, blk_stat_size(&rq->issue_stat), + req_op(rq), time_ns >> 10); +} + +void blk_throtl_bio_endio(struct bio *bio) +{ + struct throtl_grp *tg; + u64 finish_time_ns; + unsigned long finish_time; + unsigned long start_time; + unsigned long lat; + + tg = bio->bi_cg_private; + if (!tg) + return; + bio->bi_cg_private = NULL; + + finish_time_ns = ktime_get_ns(); + tg->last_finish_time = finish_time_ns >> 10; + + start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; + finish_time = __blk_stat_time(finish_time_ns) >> 10; + if (!start_time || finish_time <= start_time) + return; + + lat = finish_time - start_time; + /* this is only for bio based driver */ + if (!(bio->bi_issue_stat.stat & SKIP_LATENCY)) + throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat), + bio_op(bio), lat); + + if (tg->latency_target) { + int bucket; + unsigned int threshold; + + bucket = request_bucket_index( + blk_stat_size(&bio->bi_issue_stat)); + threshold = tg->td->avg_buckets[bucket].latency + + tg->latency_target; + if (lat > 
threshold) + tg->bad_bio_cnt++; + /* + * Not race free, could get wrong count, which means cgroups + * will be throttled + */ + tg->bio_cnt++; + } + + if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) { + tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies; + tg->bio_cnt /= 2; + tg->bad_bio_cnt /= 2; + } +} +#endif + /* * Dispatch all bios from all children tg's queued on @parent_sq. On * return, @parent_sq is guaranteed to not have any active children tg's @@ -1558,6 +2307,12 @@ int blk_throtl_init(struct request_queue *q) td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; + td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * + LATENCY_BUCKET_SIZE, __alignof__(u64)); + if (!td->latency_buckets) { + kfree(td); + return -ENOMEM; + } INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); @@ -1565,10 +2320,17 @@ int blk_throtl_init(struct request_queue *q) q->td = td; td->queue = q; + td->limit_valid[LIMIT_MAX] = true; + td->limit_index = LIMIT_MAX; + td->low_upgrade_time = jiffies; + td->low_downgrade_time = jiffies; + /* activate policy */ ret = blkcg_activate_policy(q, &blkcg_policy_throtl); - if (ret) + if (ret) { + free_percpu(td->latency_buckets); kfree(td); + } return ret; } @@ -1577,9 +2339,74 @@ void blk_throtl_exit(struct request_queue *q) BUG_ON(!q->td); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); + free_percpu(q->td->latency_buckets); kfree(q->td); } +void blk_throtl_register_queue(struct request_queue *q) +{ + struct throtl_data *td; + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + td = q->td; + BUG_ON(!td); + + if (blk_queue_nonrot(q)) { + td->throtl_slice = DFL_THROTL_SLICE_SSD; + td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD; + } else { + td->throtl_slice = DFL_THROTL_SLICE_HD; + td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD; + } +#ifndef CONFIG_BLK_DEV_THROTTLING_LOW + /* if no low limit, use previous default */ + td->throtl_slice = DFL_THROTL_SLICE_HD; +#endif + + td->track_bio_latency = !q->mq_ops && !q->request_fn; + if (!td->track_bio_latency) + blk_stat_enable_accounting(q); + + /* + * some tg are created before queue is fully initialized, eg, nonrot + * isn't initialized yet + */ + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + + tg->idletime_threshold = td->dft_idletime_threshold; + } + rcu_read_unlock(); +} + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page) +{ + if (!q->td) + return -EINVAL; + return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice)); +} + +ssize_t blk_throtl_sample_time_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned long v; + unsigned long t; + + if (!q->td) + return -EINVAL; + if (kstrtoul(page, 10, &v)) + return -EINVAL; + t = msecs_to_jiffies(v); + if (t == 0 || t > MAX_THROTL_SLICE) + return -EINVAL; + q->td->throtl_slice = t; + return count; +} +#endif + static int __init throtl_init(void) { kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 1aedb1f7ee0c..ffa80e11cf14 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -255,8 +255,8 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat) * that it's writes impacting us, and not just some sole read on * a device that is in a lower power state. 
*/ - return stat[BLK_STAT_READ].nr_samples >= 1 && - stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES; + return (stat[READ].nr_samples >= 1 && + stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES); } static u64 rwb_sync_issue_lat(struct rq_wb *rwb) @@ -277,7 +277,7 @@ enum { LAT_EXCEEDED, }; -static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { struct backing_dev_info *bdi = rwb->queue->backing_dev_info; u64 thislat; @@ -293,7 +293,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) */ thislat = rwb_sync_issue_lat(rwb); if (thislat > rwb->cur_win_nsec || - (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) { + (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) { trace_wbt_lat(bdi, thislat); return LAT_EXCEEDED; } @@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) * waited or still has writes in flights, consider us doing * just writes as well. */ - if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) || - wb_recent_wait(rwb) || wbt_inflight(rwb)) + if (stat[WRITE].nr_samples || wb_recent_wait(rwb) || + wbt_inflight(rwb)) return LAT_UNKNOWN_WRITES; return LAT_UNKNOWN; } @@ -317,8 +317,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) /* * If the 'min' latency exceeds our target, step down. */ - if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) { - trace_wbt_lat(bdi, stat[BLK_STAT_READ].min); + if (stat[READ].min > rwb->min_lat_nsec) { + trace_wbt_lat(bdi, stat[READ].min); trace_wbt_stat(bdi, stat); return LAT_EXCEEDED; } @@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) return LAT_OK; } -static int latency_exceeded(struct rq_wb *rwb) -{ - struct blk_rq_stat stat[2]; - - blk_queue_stat_get(rwb->queue, stat); - return __latency_exceeded(rwb, stat); -} - static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { struct backing_dev_info *bdi = rwb->queue->backing_dev_info; @@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb) rwb->scale_step--; rwb->unknown_cnt = 0; - blk_stat_clear(rwb->queue); rwb->scaled_max = calc_wb_limits(rwb); @@ -385,15 +376,12 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle) rwb->scaled_max = false; rwb->unknown_cnt = 0; - blk_stat_clear(rwb->queue); calc_wb_limits(rwb); rwb_trace_step(rwb, "step down"); } static void rwb_arm_timer(struct rq_wb *rwb) { - unsigned long expires; - if (rwb->scale_step > 0) { /* * We should speed this up, using some variant of a fast @@ -411,17 +399,16 @@ static void rwb_arm_timer(struct rq_wb *rwb) rwb->cur_win_nsec = rwb->win_nsec; } - expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); - mod_timer(&rwb->window_timer, expires); + blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec); } -static void wb_timer_fn(unsigned long data) +static void wb_timer_fn(struct blk_stat_callback *cb) { - struct rq_wb *rwb = (struct rq_wb *) data; + struct rq_wb *rwb = cb->data; unsigned int inflight = wbt_inflight(rwb); int status; - status = latency_exceeded(rwb); + status = latency_exceeded(rwb, cb->stat); trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, inflight); @@ -614,7 +601,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) __wbt_wait(rwb, bio->bi_opf, lock); - if (!timer_pending(&rwb->window_timer)) + if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); if (current_is_kswapd()) @@ 
-675,7 +662,7 @@ void wbt_disable_default(struct request_queue *q) struct rq_wb *rwb = q->rq_wb; if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) { - del_timer_sync(&rwb->window_timer); + blk_stat_remove_callback(q, rwb->cb); rwb->win_nsec = rwb->min_lat_nsec = 0; wbt_update_limits(rwb); } @@ -699,24 +686,23 @@ int wbt_init(struct request_queue *q) struct rq_wb *rwb; int i; - /* - * For now, we depend on the stats window being larger than - * our monitoring window. Ensure that this isn't inadvertently - * violated. - */ - BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC); BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); if (!rwb) return -ENOMEM; + rwb->cb = blk_stat_alloc_callback(wb_timer_fn, blk_stat_rq_ddir, 2, rwb); + if (!rwb->cb) { + kfree(rwb); + return -ENOMEM; + } + for (i = 0; i < WBT_NUM_RWQ; i++) { atomic_set(&rwb->rq_wait[i].inflight, 0); init_waitqueue_head(&rwb->rq_wait[i].wait); } - setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb); rwb->wc = 1; rwb->queue_depth = RWB_DEF_DEPTH; rwb->last_comp = rwb->last_issue = jiffies; @@ -726,10 +712,10 @@ int wbt_init(struct request_queue *q) wbt_update_limits(rwb); /* - * Assign rwb, and turn on stats tracking for this queue + * Assign rwb and add the stats callback. */ q->rq_wb = rwb; - blk_stat_enable(q); + blk_stat_add_callback(q, rwb->cb); rwb->min_lat_nsec = wbt_default_latency_nsec(q); @@ -744,7 +730,8 @@ void wbt_exit(struct request_queue *q) struct rq_wb *rwb = q->rq_wb; if (rwb) { - del_timer_sync(&rwb->window_timer); + blk_stat_remove_callback(q, rwb->cb); + blk_stat_free_callback(rwb->cb); q->rq_wb = NULL; kfree(rwb); } diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 65f1de519f67..ad6c78507c3a 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -32,27 +32,27 @@ enum { static inline void wbt_clear_state(struct blk_issue_stat *stat) { - stat->time &= BLK_STAT_TIME_MASK; + stat->stat &= ~BLK_STAT_RES_MASK; } static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat) { - return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT; + return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT; } static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct) { - stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT; + stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT; } static inline bool wbt_is_tracked(struct blk_issue_stat *stat) { - return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED; + return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED; } static inline bool wbt_is_read(struct blk_issue_stat *stat) { - return (stat->time >> BLK_STAT_SHIFT) & WBT_READ; + return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ; } struct rq_wait { @@ -81,7 +81,7 @@ struct rq_wb { u64 win_nsec; /* default window size */ u64 cur_win_nsec; /* current window size */ - struct timer_list window_timer; + struct blk_stat_callback *cb; s64 sync_issue; void *sync_cookie; diff --git a/block/blk.h b/block/blk.h index d1ea4bd9b9a3..07d375183f31 100644 --- a/block/blk.h +++ b/block/blk.h @@ -319,10 +319,22 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); +extern void blk_throtl_register_queue(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { 
return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } +static inline void blk_throtl_register_queue(struct request_queue *q) { } #endif /* CONFIG_BLK_DEV_THROTTLING */ +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); +extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, + const char *page, size_t count); +extern void blk_throtl_bio_endio(struct bio *bio); +extern void blk_throtl_stat_add(struct request *rq, u64 time); +#else +static inline void blk_throtl_bio_endio(struct bio *bio) { } +static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } +#endif #endif /* BLK_INTERNAL_H */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 440b95ee593c..da69b079725f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3761,16 +3761,14 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) +static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; uint64_t serial_nr; - bool nonroot_cg; rcu_read_lock(); serial_nr = bio_blkcg(bio)->css.serial_nr; - nonroot_cg = bio_blkcg(bio) != &blkcg_root; rcu_read_unlock(); /* @@ -3778,7 +3776,7 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) * spuriously on a newly created cic but there's no harm. */ if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) - return nonroot_cg; + return; /* * Drop reference to queues. New queues will be assigned in new @@ -3799,12 +3797,10 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) } cic->blkcg_serial_nr = serial_nr; - return nonroot_cg; } #else -static inline bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) +static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { - return false; } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -4449,12 +4445,11 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const int rw = rq_data_dir(rq); const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - bool disable_wbt; spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); - disable_wbt = check_blkcg_changed(cic, bio); + check_blkcg_changed(cic, bio); new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { @@ -4491,9 +4486,6 @@ new_queue: rq->elv.priv[1] = cfqq->cfqg; spin_unlock_irq(q->queue_lock); - if (disable_wbt) - wbt_disable_default(q); - return 0; } @@ -4706,6 +4698,7 @@ static void cfq_registered_queue(struct request_queue *q) */ if (blk_queue_nonrot(q)) cfqd->cfq_slice_idle = 0; + wbt_disable_default(q); } /* diff --git a/block/genhd.c b/block/genhd.c index a9c516a8b37d..510aac1486cb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1352,7 +1352,7 @@ struct kobject *get_disk(struct gendisk *disk) owner = disk->fops->owner; if (owner && !try_module_get(owner)) return NULL; - kobj = kobject_get(&disk_to_dev(disk)->kobj); + kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj); if (kobj == NULL) { module_put(owner); return NULL; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 2a2fc768b27a..82a43bb19967 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -362,7 +362,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto out_free_cdb; bio = rq->bio; - rq->retries = 0; + req->retries = 
0; start_time = jiffies; @@ -476,13 +476,13 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, goto error; /* default. possible overriden later */ - rq->retries = 5; + req->retries = 5; switch (opcode) { case SEND_DIAGNOSTIC: case FORMAT_UNIT: rq->timeout = FORMAT_UNIT_TIMEOUT; - rq->retries = 1; + req->retries = 1; break; case START_STOP: rq->timeout = START_STOP_TIMEOUT; @@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, break; case READ_DEFECT_DATA: rq->timeout = READ_DEFECT_DATA_TIMEOUT; - rq->retries = 1; + req->retries = 1; break; default: rq->timeout = BLK_DEFAULT_SG_TIMEOUT; diff --git a/block/sed-opal.c b/block/sed-opal.c index 14035f826b5e..6736c7873d4a 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1831,7 +1831,7 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev) /* 0x08 is Manufacured Inactive */ /* 0x09 is Manufactured */ if (lc_status != OPAL_MANUFACTURED_INACTIVE) { - pr_err("Couldn't determine the status of the Lifcycle state\n"); + pr_err("Couldn't determine the status of the Lifecycle state\n"); return -ENODEV; } diff --git a/block/t10-pi.c b/block/t10-pi.c index 2c97912335a9..680c6d636298 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, 3); } -struct blk_integrity_profile t10_pi_type1_crc = { +const struct blk_integrity_profile t10_pi_type1_crc = { .name = "T10-DIF-TYPE1-CRC", .generate_fn = t10_pi_type1_generate_crc, .verify_fn = t10_pi_type1_verify_crc, }; EXPORT_SYMBOL(t10_pi_type1_crc); -struct blk_integrity_profile t10_pi_type1_ip = { +const struct blk_integrity_profile t10_pi_type1_ip = { .name = "T10-DIF-TYPE1-IP", .generate_fn = t10_pi_type1_generate_ip, .verify_fn = t10_pi_type1_verify_ip, }; EXPORT_SYMBOL(t10_pi_type1_ip); -struct blk_integrity_profile t10_pi_type3_crc = { +const struct blk_integrity_profile t10_pi_type3_crc = { .name = "T10-DIF-TYPE3-CRC", .generate_fn = t10_pi_type3_generate_crc, .verify_fn = t10_pi_type3_verify_crc, }; EXPORT_SYMBOL(t10_pi_type3_crc); -struct blk_integrity_profile t10_pi_type3_ip = { +const struct blk_integrity_profile t10_pi_type3_ip = { .name = "T10-DIF-TYPE3-IP", .generate_fn = t10_pi_type3_generate_ip, .verify_fn = t10_pi_type3_verify_ip, diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index f744de7a0f9b..a1c2e816128f 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -514,18 +514,6 @@ config VIRTIO_BLK_SCSI virtio protocol and not enabled by default by any hypervisor. Your probably want to virtio-scsi instead. -config BLK_DEV_HD - bool "Very old hard disk (MFM/RLL/IDE) driver" - depends on HAVE_IDE - depends on !ARM || ARCH_RPC || BROKEN - help - This is a very old hard disk driver that lacks the enhanced - functionality of the newer ones. - - It is required for systems with ancient MFM/RLL/ESDI drives. - - If unsure, say N. 
- config BLK_DEV_RBD tristate "Rados block device (RBD)" depends on INET && BLOCK diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 1e9661e26f29..b12c772bbeb3 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o obj-$(CONFIG_BLK_DEV_SX8) += sx8.o -obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 45b4384f650c..ce102ec47ef2 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4207,9 +4207,7 @@ static int __init do_floppy_init(void) disks[drive]->fops = &floppy_fops; sprintf(disks[drive]->disk_name, "fd%d", drive); - init_timer(&motor_off_timer[drive]); - motor_off_timer[drive].data = drive; - motor_off_timer[drive].function = motor_off_callback; + setup_timer(&motor_off_timer[drive], motor_off_callback, drive); } err = register_blkdev(FLOPPY_MAJOR, "fd"); diff --git a/drivers/block/hd.c b/drivers/block/hd.c deleted file mode 100644 index 6043648da1e8..000000000000 --- a/drivers/block/hd.c +++ /dev/null @@ -1,803 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This is the low-level hd interrupt support. It traverses the - * request-list, using interrupts to jump between functions. As - * all the functions are called within interrupts, we may not - * sleep. Special care is recommended. - * - * modified by Drew Eckhardt to check nr of hd's from the CMOS. - * - * Thanks to Branko Lankester, [email protected], who found a bug - * in the early extended-partition checks and added DM partitions - * - * IRQ-unmask, drive-id, multiple-mode, support for ">16 heads", - * and general streamlining by Mark Lord. - * - * Removed 99% of above. Use Mark's ide driver for those options. - * This is now a lightweight ST-506 driver. (Paul Gortmaker) - * - * Modified 1995 Russell King for ARM processor. - * - * Bugfix: max_sectors must be <= 255 or the wheels tend to come - * off in a hurry once you queue things up - Paul G. 02/2001 - */ - -/* Uncomment the following if you want verbose error reports. 
*/ -/* #define VERBOSE_ERRORS */ - -#include <linux/blkdev.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/interrupt.h> -#include <linux/timer.h> -#include <linux/fs.h> -#include <linux/kernel.h> -#include <linux/genhd.h> -#include <linux/string.h> -#include <linux/ioport.h> -#include <linux/init.h> -#include <linux/blkpg.h> -#include <linux/ata.h> -#include <linux/hdreg.h> - -#define HD_IRQ 14 - -#define REALLY_SLOW_IO -#include <asm/io.h> -#include <linux/uaccess.h> - -#ifdef __arm__ -#undef HD_IRQ -#endif -#include <asm/irq.h> -#ifdef __arm__ -#define HD_IRQ IRQ_HARDDISK -#endif - -/* Hd controller regster ports */ - -#define HD_DATA 0x1f0 /* _CTL when writing */ -#define HD_ERROR 0x1f1 /* see err-bits */ -#define HD_NSECTOR 0x1f2 /* nr of sectors to read/write */ -#define HD_SECTOR 0x1f3 /* starting sector */ -#define HD_LCYL 0x1f4 /* starting cylinder */ -#define HD_HCYL 0x1f5 /* high byte of starting cyl */ -#define HD_CURRENT 0x1f6 /* 101dhhhh , d=drive, hhhh=head */ -#define HD_STATUS 0x1f7 /* see status-bits */ -#define HD_FEATURE HD_ERROR /* same io address, read=error, write=feature */ -#define HD_PRECOMP HD_FEATURE /* obsolete use of this port - predates IDE */ -#define HD_COMMAND HD_STATUS /* same io address, read=status, write=cmd */ - -#define HD_CMD 0x3f6 /* used for resets */ -#define HD_ALTSTATUS 0x3f6 /* same as HD_STATUS but doesn't clear irq */ - -/* Bits of HD_STATUS */ -#define ERR_STAT 0x01 -#define INDEX_STAT 0x02 -#define ECC_STAT 0x04 /* Corrected error */ -#define DRQ_STAT 0x08 -#define SEEK_STAT 0x10 -#define SERVICE_STAT SEEK_STAT -#define WRERR_STAT 0x20 -#define READY_STAT 0x40 -#define BUSY_STAT 0x80 - -/* Bits for HD_ERROR */ -#define MARK_ERR 0x01 /* Bad address mark */ -#define TRK0_ERR 0x02 /* couldn't find track 0 */ -#define ABRT_ERR 0x04 /* Command aborted */ -#define MCR_ERR 0x08 /* media change request */ -#define ID_ERR 0x10 /* ID field not found */ -#define MC_ERR 0x20 /* media changed */ -#define ECC_ERR 0x40 /* Uncorrectable ECC error */ -#define BBD_ERR 0x80 /* pre-EIDE meaning: block marked bad */ -#define ICRC_ERR 0x80 /* new meaning: CRC error during transfer */ - -static DEFINE_SPINLOCK(hd_lock); -static struct request_queue *hd_queue; -static struct request *hd_req; - -#define TIMEOUT_VALUE (6*HZ) -#define HD_DELAY 0 - -#define MAX_ERRORS 16 /* Max read/write errors/sector */ -#define RESET_FREQ 8 /* Reset controller every 8th retry */ -#define RECAL_FREQ 4 /* Recalibrate every 4th retry */ -#define MAX_HD 2 - -#define STAT_OK (READY_STAT|SEEK_STAT) -#define OK_STATUS(s) (((s)&(STAT_OK|(BUSY_STAT|WRERR_STAT|ERR_STAT)))==STAT_OK) - -static void recal_intr(void); -static void bad_rw_intr(void); - -static int reset; -static int hd_error; - -/* - * This struct defines the HD's and their types. 
- */ -struct hd_i_struct { - unsigned int head, sect, cyl, wpcom, lzone, ctl; - int unit; - int recalibrate; - int special_op; -}; - -#ifdef HD_TYPE -static struct hd_i_struct hd_info[] = { HD_TYPE }; -static int NR_HD = ARRAY_SIZE(hd_info); -#else -static struct hd_i_struct hd_info[MAX_HD]; -static int NR_HD; -#endif - -static struct gendisk *hd_gendisk[MAX_HD]; - -static struct timer_list device_timer; - -#define TIMEOUT_VALUE (6*HZ) - -#define SET_TIMER \ - do { \ - mod_timer(&device_timer, jiffies + TIMEOUT_VALUE); \ - } while (0) - -static void (*do_hd)(void) = NULL; -#define SET_HANDLER(x) \ -if ((do_hd = (x)) != NULL) \ - SET_TIMER; \ -else \ - del_timer(&device_timer); - - -#if (HD_DELAY > 0) - -#include <linux/i8253.h> - -unsigned long last_req; - -unsigned long read_timer(void) -{ - unsigned long t, flags; - int i; - - raw_spin_lock_irqsave(&i8253_lock, flags); - t = jiffies * 11932; - outb_p(0, 0x43); - i = inb_p(0x40); - i |= inb(0x40) << 8; - raw_spin_unlock_irqrestore(&i8253_lock, flags); - return(t - i); -} -#endif - -static void __init hd_setup(char *str, int *ints) -{ - int hdind = 0; - - if (ints[0] != 3) - return; - if (hd_info[0].head != 0) - hdind = 1; - hd_info[hdind].head = ints[2]; - hd_info[hdind].sect = ints[3]; - hd_info[hdind].cyl = ints[1]; - hd_info[hdind].wpcom = 0; - hd_info[hdind].lzone = ints[1]; - hd_info[hdind].ctl = (ints[2] > 8 ? 8 : 0); - NR_HD = hdind+1; -} - -static bool hd_end_request(int err, unsigned int bytes) -{ - if (__blk_end_request(hd_req, err, bytes)) - return true; - hd_req = NULL; - return false; -} - -static bool hd_end_request_cur(int err) -{ - return hd_end_request(err, blk_rq_cur_bytes(hd_req)); -} - -static void dump_status(const char *msg, unsigned int stat) -{ - char *name = "hd?"; - if (hd_req) - name = hd_req->rq_disk->disk_name; - -#ifdef VERBOSE_ERRORS - printk("%s: %s: status=0x%02x { ", name, msg, stat & 0xff); - if (stat & BUSY_STAT) printk("Busy "); - if (stat & READY_STAT) printk("DriveReady "); - if (stat & WRERR_STAT) printk("WriteFault "); - if (stat & SEEK_STAT) printk("SeekComplete "); - if (stat & DRQ_STAT) printk("DataRequest "); - if (stat & ECC_STAT) printk("CorrectedError "); - if (stat & INDEX_STAT) printk("Index "); - if (stat & ERR_STAT) printk("Error "); - printk("}\n"); - if ((stat & ERR_STAT) == 0) { - hd_error = 0; - } else { - hd_error = inb(HD_ERROR); - printk("%s: %s: error=0x%02x { ", name, msg, hd_error & 0xff); - if (hd_error & BBD_ERR) printk("BadSector "); - if (hd_error & ECC_ERR) printk("UncorrectableError "); - if (hd_error & ID_ERR) printk("SectorIdNotFound "); - if (hd_error & ABRT_ERR) printk("DriveStatusError "); - if (hd_error & TRK0_ERR) printk("TrackZeroNotFound "); - if (hd_error & MARK_ERR) printk("AddrMarkNotFound "); - printk("}"); - if (hd_error & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) { - printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL), - inb(HD_CURRENT) & 0xf, inb(HD_SECTOR)); - if (hd_req) - printk(", sector=%ld", blk_rq_pos(hd_req)); - } - printk("\n"); - } -#else - printk("%s: %s: status=0x%02x.\n", name, msg, stat & 0xff); - if ((stat & ERR_STAT) == 0) { - hd_error = 0; - } else { - hd_error = inb(HD_ERROR); - printk("%s: %s: error=0x%02x.\n", name, msg, hd_error & 0xff); - } -#endif -} - -static void check_status(void) -{ - int i = inb_p(HD_STATUS); - - if (!OK_STATUS(i)) { - dump_status("check_status", i); - bad_rw_intr(); - } -} - -static int controller_busy(void) -{ - int retries = 100000; - unsigned char status; - - do { - status = inb_p(HD_STATUS); - } while 
((status & BUSY_STAT) && --retries); - return status; -} - -static int status_ok(void) -{ - unsigned char status = inb_p(HD_STATUS); - - if (status & BUSY_STAT) - return 1; /* Ancient, but does it make sense??? */ - if (status & WRERR_STAT) - return 0; - if (!(status & READY_STAT)) - return 0; - if (!(status & SEEK_STAT)) - return 0; - return 1; -} - -static int controller_ready(unsigned int drive, unsigned int head) -{ - int retry = 100; - - do { - if (controller_busy() & BUSY_STAT) - return 0; - outb_p(0xA0 | (drive<<4) | head, HD_CURRENT); - if (status_ok()) - return 1; - } while (--retry); - return 0; -} - -static void hd_out(struct hd_i_struct *disk, - unsigned int nsect, - unsigned int sect, - unsigned int head, - unsigned int cyl, - unsigned int cmd, - void (*intr_addr)(void)) -{ - unsigned short port; - -#if (HD_DELAY > 0) - while (read_timer() - last_req < HD_DELAY) - /* nothing */; -#endif - if (reset) - return; - if (!controller_ready(disk->unit, head)) { - reset = 1; - return; - } - SET_HANDLER(intr_addr); - outb_p(disk->ctl, HD_CMD); - port = HD_DATA; - outb_p(disk->wpcom >> 2, ++port); - outb_p(nsect, ++port); - outb_p(sect, ++port); - outb_p(cyl, ++port); - outb_p(cyl >> 8, ++port); - outb_p(0xA0 | (disk->unit << 4) | head, ++port); - outb_p(cmd, ++port); -} - -static void hd_request (void); - -static int drive_busy(void) -{ - unsigned int i; - unsigned char c; - - for (i = 0; i < 500000 ; i++) { - c = inb_p(HD_STATUS); - if ((c & (BUSY_STAT | READY_STAT | SEEK_STAT)) == STAT_OK) - return 0; - } - dump_status("reset timed out", c); - return 1; -} - -static void reset_controller(void) -{ - int i; - - outb_p(4, HD_CMD); - for (i = 0; i < 1000; i++) barrier(); - outb_p(hd_info[0].ctl & 0x0f, HD_CMD); - for (i = 0; i < 1000; i++) barrier(); - if (drive_busy()) - printk("hd: controller still busy\n"); - else if ((hd_error = inb(HD_ERROR)) != 1) - printk("hd: controller reset failed: %02x\n", hd_error); -} - -static void reset_hd(void) -{ - static int i; - -repeat: - if (reset) { - reset = 0; - i = -1; - reset_controller(); - } else { - check_status(); - if (reset) - goto repeat; - } - if (++i < NR_HD) { - struct hd_i_struct *disk = &hd_info[i]; - disk->special_op = disk->recalibrate = 1; - hd_out(disk, disk->sect, disk->sect, disk->head-1, - disk->cyl, ATA_CMD_INIT_DEV_PARAMS, &reset_hd); - if (reset) - goto repeat; - } else - hd_request(); -} - -/* - * Ok, don't know what to do with the unexpected interrupts: on some machines - * doing a reset and a retry seems to result in an eternal loop. Right now I - * ignore it, and just set the timeout. - * - * On laptops (and "green" PCs), an unexpected interrupt occurs whenever the - * drive enters "idle", "standby", or "sleep" mode, so if the status looks - * "good", we just ignore the interrupt completely. - */ -static void unexpected_hd_interrupt(void) -{ - unsigned int stat = inb_p(HD_STATUS); - - if (stat & (BUSY_STAT|DRQ_STAT|ECC_STAT|ERR_STAT)) { - dump_status("unexpected interrupt", stat); - SET_TIMER; - } -} - -/* - * bad_rw_intr() now tries to be a bit smarter and does things - * according to the error returned by the controller. 
- * -Mika Liljeberg ([email protected]) - */ -static void bad_rw_intr(void) -{ - struct request *req = hd_req; - - if (req != NULL) { - struct hd_i_struct *disk = req->rq_disk->private_data; - if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) { - hd_end_request_cur(-EIO); - disk->special_op = disk->recalibrate = 1; - } else if (req->errors % RESET_FREQ == 0) - reset = 1; - else if ((hd_error & TRK0_ERR) || req->errors % RECAL_FREQ == 0) - disk->special_op = disk->recalibrate = 1; - /* Otherwise just retry */ - } -} - -static inline int wait_DRQ(void) -{ - int retries; - int stat; - - for (retries = 0; retries < 100000; retries++) { - stat = inb_p(HD_STATUS); - if (stat & DRQ_STAT) - return 0; - } - dump_status("wait_DRQ", stat); - return -1; -} - -static void read_intr(void) -{ - struct request *req; - int i, retries = 100000; - - do { - i = (unsigned) inb_p(HD_STATUS); - if (i & BUSY_STAT) - continue; - if (!OK_STATUS(i)) - break; - if (i & DRQ_STAT) - goto ok_to_read; - } while (--retries > 0); - dump_status("read_intr", i); - bad_rw_intr(); - hd_request(); - return; - -ok_to_read: - req = hd_req; - insw(HD_DATA, bio_data(req->bio), 256); -#ifdef DEBUG - printk("%s: read: sector %ld, remaining = %u, buffer=%p\n", - req->rq_disk->disk_name, blk_rq_pos(req) + 1, - blk_rq_sectors(req) - 1, bio_data(req->bio)+512); -#endif - if (hd_end_request(0, 512)) { - SET_HANDLER(&read_intr); - return; - } - - (void) inb_p(HD_STATUS); -#if (HD_DELAY > 0) - last_req = read_timer(); -#endif - hd_request(); -} - -static void write_intr(void) -{ - struct request *req = hd_req; - int i; - int retries = 100000; - - do { - i = (unsigned) inb_p(HD_STATUS); - if (i & BUSY_STAT) - continue; - if (!OK_STATUS(i)) - break; - if ((blk_rq_sectors(req) <= 1) || (i & DRQ_STAT)) - goto ok_to_write; - } while (--retries > 0); - dump_status("write_intr", i); - bad_rw_intr(); - hd_request(); - return; - -ok_to_write: - if (hd_end_request(0, 512)) { - SET_HANDLER(&write_intr); - outsw(HD_DATA, bio_data(req->bio), 256); - return; - } - -#if (HD_DELAY > 0) - last_req = read_timer(); -#endif - hd_request(); -} - -static void recal_intr(void) -{ - check_status(); -#if (HD_DELAY > 0) - last_req = read_timer(); -#endif - hd_request(); -} - -/* - * This is another of the error-routines I don't know what to do with. The - * best idea seems to just set reset, and start all over again. - */ -static void hd_times_out(unsigned long dummy) -{ - char *name; - - do_hd = NULL; - - if (!hd_req) - return; - - spin_lock_irq(hd_queue->queue_lock); - reset = 1; - name = hd_req->rq_disk->disk_name; - printk("%s: timeout\n", name); - if (++hd_req->errors >= MAX_ERRORS) { -#ifdef DEBUG - printk("%s: too many errors\n", name); -#endif - hd_end_request_cur(-EIO); - } - hd_request(); - spin_unlock_irq(hd_queue->queue_lock); -} - -static int do_special_op(struct hd_i_struct *disk, struct request *req) -{ - if (disk->recalibrate) { - disk->recalibrate = 0; - hd_out(disk, disk->sect, 0, 0, 0, ATA_CMD_RESTORE, &recal_intr); - return reset; - } - if (disk->head > 16) { - printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name); - hd_end_request_cur(-EIO); - } - disk->special_op = 0; - return 1; -} - -/* - * The driver enables interrupts as much as possible. In order to do this, - * (a) the device-interrupt is disabled before entering hd_request(), - * and (b) the timeout-interrupt is disabled before the sti(). 
- * - * Interrupts are still masked (by default) whenever we are exchanging - * data/cmds with a drive, because some drives seem to have very poor - * tolerance for latency during I/O. The IDE driver has support to unmask - * interrupts for non-broken hardware, so use that driver if required. - */ -static void hd_request(void) -{ - unsigned int block, nsect, sec, track, head, cyl; - struct hd_i_struct *disk; - struct request *req; - - if (do_hd) - return; -repeat: - del_timer(&device_timer); - - if (!hd_req) { - hd_req = blk_fetch_request(hd_queue); - if (!hd_req) { - do_hd = NULL; - return; - } - } - req = hd_req; - - if (reset) { - reset_hd(); - return; - } - disk = req->rq_disk->private_data; - block = blk_rq_pos(req); - nsect = blk_rq_sectors(req); - if (block >= get_capacity(req->rq_disk) || - ((block+nsect) > get_capacity(req->rq_disk))) { - printk("%s: bad access: block=%d, count=%d\n", - req->rq_disk->disk_name, block, nsect); - hd_end_request_cur(-EIO); - goto repeat; - } - - if (disk->special_op) { - if (do_special_op(disk, req)) - goto repeat; - return; - } - sec = block % disk->sect + 1; - track = block / disk->sect; - head = track % disk->head; - cyl = track / disk->head; -#ifdef DEBUG - printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n", - req->rq_disk->disk_name, - req_data_dir(req) == READ ? "read" : "writ", - cyl, head, sec, nsect, bio_data(req->bio)); -#endif - - switch (req_op(req)) { - case REQ_OP_READ: - hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ, - &read_intr); - if (reset) - goto repeat; - break; - case REQ_OP_WRITE: - hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE, - &write_intr); - if (reset) - goto repeat; - if (wait_DRQ()) { - bad_rw_intr(); - goto repeat; - } - outsw(HD_DATA, bio_data(req->bio), 256); - break; - default: - printk("unknown hd-command\n"); - hd_end_request_cur(-EIO); - break; - } -} - -static void do_hd_request(struct request_queue *q) -{ - hd_request(); -} - -static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct hd_i_struct *disk = bdev->bd_disk->private_data; - - geo->heads = disk->head; - geo->sectors = disk->sect; - geo->cylinders = disk->cyl; - return 0; -} - -/* - * Releasing a block device means we sync() it, so that it can safely - * be forgotten about... - */ - -static irqreturn_t hd_interrupt(int irq, void *dev_id) -{ - void (*handler)(void) = do_hd; - - spin_lock(hd_queue->queue_lock); - - do_hd = NULL; - del_timer(&device_timer); - if (!handler) - handler = unexpected_hd_interrupt; - handler(); - - spin_unlock(hd_queue->queue_lock); - - return IRQ_HANDLED; -} - -static const struct block_device_operations hd_fops = { - .getgeo = hd_getgeo, -}; - -static int __init hd_init(void) -{ - int drive; - - if (register_blkdev(HD_MAJOR, "hd")) - return -1; - - hd_queue = blk_init_queue(do_hd_request, &hd_lock); - if (!hd_queue) { - unregister_blkdev(HD_MAJOR, "hd"); - return -ENOMEM; - } - - blk_queue_max_hw_sectors(hd_queue, 255); - init_timer(&device_timer); - device_timer.function = hd_times_out; - blk_queue_logical_block_size(hd_queue, 512); - - if (!NR_HD) { - /* - * We don't know anything about the drive. This means - * that you *MUST* specify the drive parameters to the - * kernel yourself. - * - * If we were on an i386, we used to read this info from - * the BIOS or CMOS. 
This doesn't work all that well, - * since this assumes that this is a primary or secondary - * drive, and if we're using this legacy driver, it's - * probably an auxiliary controller added to recover - * legacy data off an ST-506 drive. Either way, it's - * definitely safest to have the user explicitly specify - * the information. - */ - printk("hd: no drives specified - use hd=cyl,head,sectors" - " on kernel command line\n"); - goto out; - } - - for (drive = 0 ; drive < NR_HD ; drive++) { - struct gendisk *disk = alloc_disk(64); - struct hd_i_struct *p = &hd_info[drive]; - if (!disk) - goto Enomem; - disk->major = HD_MAJOR; - disk->first_minor = drive << 6; - disk->fops = &hd_fops; - sprintf(disk->disk_name, "hd%c", 'a'+drive); - disk->private_data = p; - set_capacity(disk, p->head * p->sect * p->cyl); - disk->queue = hd_queue; - p->unit = drive; - hd_gendisk[drive] = disk; - printk("%s: %luMB, CHS=%d/%d/%d\n", - disk->disk_name, (unsigned long)get_capacity(disk)/2048, - p->cyl, p->head, p->sect); - } - - if (request_irq(HD_IRQ, hd_interrupt, 0, "hd", NULL)) { - printk("hd: unable to get IRQ%d for the hard disk driver\n", - HD_IRQ); - goto out1; - } - if (!request_region(HD_DATA, 8, "hd")) { - printk(KERN_WARNING "hd: port 0x%x busy\n", HD_DATA); - goto out2; - } - if (!request_region(HD_CMD, 1, "hd(cmd)")) { - printk(KERN_WARNING "hd: port 0x%x busy\n", HD_CMD); - goto out3; - } - - /* Let them fly */ - for (drive = 0; drive < NR_HD; drive++) - add_disk(hd_gendisk[drive]); - - return 0; - -out3: - release_region(HD_DATA, 8); -out2: - free_irq(HD_IRQ, NULL); -out1: - for (drive = 0; drive < NR_HD; drive++) - put_disk(hd_gendisk[drive]); - NR_HD = 0; -out: - del_timer(&device_timer); - unregister_blkdev(HD_MAJOR, "hd"); - blk_cleanup_queue(hd_queue); - return -1; -Enomem: - while (drive--) - put_disk(hd_gendisk[drive]); - goto out; -} - -static int __init parse_hd_setup(char *line) -{ - int ints[6]; - - (void) get_options(line, ARRAY_SIZE(ints), ints); - hd_setup(NULL, ints); - - return 1; -} -__setup("hd=", parse_hd_setup); - -late_initcall(hd_init); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 0ecb6461ed81..cc981f34e017 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1710,7 +1710,7 @@ static int loop_init_request(void *data, struct request *rq, return 0; } -static struct blk_mq_ops loop_mq_ops = { +static const struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, .init_request = loop_init_request, }; diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 286f276f586e..e88e7b06c616 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -989,9 +989,7 @@ static int mg_probe(struct platform_device *plat_dev) blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS); blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE); - init_timer(&host->timer); - host->timer.function = mg_times_out; - host->timer.data = (unsigned long)host; + setup_timer(&host->timer, mg_times_out, (unsigned long)host); host->gd = alloc_disk(MG_DISK_MAX_PART); if (!host->gd) { diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index f96ab717534c..30076e7753bc 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3889,7 +3889,7 @@ exit_handler: return BLK_EH_RESET_TIMER; } -static struct blk_mq_ops mtip_mq_ops = { +static const struct blk_mq_ops mtip_mq_ops = { .queue_rq = mtip_queue_rq, .init_request = mtip_init_cmd, .exit_request = mtip_free_cmd, @@ -4162,7 +4162,7 @@ static int 
mtip_block_remove(struct driver_data *dd) dev_info(&dd->pdev->dev, "device %s surprise removal\n", dd->disk->disk_name); - blk_mq_freeze_queue_start(dd->queue); + blk_freeze_queue_start(dd->queue); blk_mq_stop_hw_queues(dd->queue); blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index d8a23561b4cb..03ae72985c79 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1035,7 +1035,7 @@ static int nbd_init_request(void *data, struct request *rq, return 0; } -static struct blk_mq_ops nbd_mq_ops = { +static const struct blk_mq_ops nbd_mq_ops = { .queue_rq = nbd_queue_rq, .init_request = nbd_init_request, .timeout = nbd_xmit_timeout, diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 6f2e565bccc5..f93906ff31e8 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -117,6 +117,10 @@ static bool use_lightnvm; module_param(use_lightnvm, bool, S_IRUGO); MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device"); +static bool blocking; +module_param(blocking, bool, S_IRUGO); +MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); + static int irqmode = NULL_IRQ_SOFTIRQ; static int null_set_irqmode(const char *str, const struct kernel_param *kp) @@ -357,6 +361,8 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); + if (irqmode == NULL_IRQ_TIMER) { hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cmd->timer.function = null_cmd_timer_expired; @@ -392,7 +398,7 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, return 0; } -static struct blk_mq_ops null_mq_ops = { +static const struct blk_mq_ops null_mq_ops = { .queue_rq = null_queue_rq, .init_hctx = null_init_hctx, .complete = null_softirq_done_fn, @@ -724,6 +730,9 @@ static int null_add_dev(void) nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; nullb->tag_set.driver_data = nullb; + if (blocking) + nullb->tag_set.flags |= BLK_MQ_F_BLOCKING; + rv = blk_mq_alloc_tag_set(&nullb->tag_set); if (rv) goto out_cleanup_queues; diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 939641d6e262..b1267ef34d5a 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -300,6 +300,11 @@ static void pcd_init_units(void) struct gendisk *disk = alloc_disk(1); if (!disk) continue; + disk->queue = blk_init_queue(do_pcd_request, &pcd_lock); + if (!disk->queue) { + put_disk(disk); + continue; + } cd->disk = disk; cd->pi = &cd->pia; cd->present = 0; @@ -735,18 +740,36 @@ static int pcd_detect(void) } /* I/O request processing */ -static struct request_queue *pcd_queue; +static int pcd_queue; + +static int set_next_request(void) +{ + struct pcd_unit *cd; + struct request_queue *q; + int old_pos = pcd_queue; + + do { + cd = &pcd[pcd_queue]; + q = cd->present ? 
cd->disk->queue : NULL; + if (++pcd_queue == PCD_UNITS) + pcd_queue = 0; + if (q) { + pcd_req = blk_fetch_request(q); + if (pcd_req) + break; + } + } while (pcd_queue != old_pos); + + return pcd_req != NULL; +} -static void do_pcd_request(struct request_queue * q) +static void pcd_request(void) { if (pcd_busy) return; while (1) { - if (!pcd_req) { - pcd_req = blk_fetch_request(q); - if (!pcd_req) - return; - } + if (!pcd_req && !set_next_request()) + return; if (rq_data_dir(pcd_req) == READ) { struct pcd_unit *cd = pcd_req->rq_disk->private_data; @@ -766,6 +789,11 @@ static void do_pcd_request(struct request_queue * q) } } +static void do_pcd_request(struct request_queue *q) +{ + pcd_request(); +} + static inline void next_request(int err) { unsigned long saved_flags; @@ -774,7 +802,7 @@ static inline void next_request(int err) if (!__blk_end_request_cur(pcd_req, err)) pcd_req = NULL; pcd_busy = 0; - do_pcd_request(pcd_queue); + pcd_request(); spin_unlock_irqrestore(&pcd_lock, saved_flags); } @@ -849,7 +877,7 @@ static void do_pcd_read_drq(void) do_pcd_read(); spin_lock_irqsave(&pcd_lock, saved_flags); - do_pcd_request(pcd_queue); + pcd_request(); spin_unlock_irqrestore(&pcd_lock, saved_flags); } @@ -957,19 +985,10 @@ static int __init pcd_init(void) return -EBUSY; } - pcd_queue = blk_init_queue(do_pcd_request, &pcd_lock); - if (!pcd_queue) { - unregister_blkdev(major, name); - for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) - put_disk(cd->disk); - return -ENOMEM; - } - for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { if (cd->present) { register_cdrom(&cd->info); cd->disk->private_data = cd; - cd->disk->queue = pcd_queue; add_disk(cd->disk); } } @@ -988,9 +1007,9 @@ static void __exit pcd_exit(void) pi_release(cd->pi); unregister_cdrom(&cd->info); } + blk_cleanup_queue(cd->disk->queue); put_disk(cd->disk); } - blk_cleanup_queue(pcd_queue); unregister_blkdev(major, name); pi_unregister_driver(par_drv); } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 9cfd2e06a649..b05e151c9b38 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -381,12 +381,33 @@ static enum action do_pd_write_start(void); static enum action do_pd_read_drq(void); static enum action do_pd_write_done(void); -static struct request_queue *pd_queue; +static int pd_queue; static int pd_claimed; static struct pd_unit *pd_current; /* current request's drive */ static PIA *pi_current; /* current request's PIA */ +static int set_next_request(void) +{ + struct gendisk *disk; + struct request_queue *q; + int old_pos = pd_queue; + + do { + disk = pd[pd_queue].gd; + q = disk ? disk->queue : NULL; + if (++pd_queue == PD_UNITS) + pd_queue = 0; + if (q) { + pd_req = blk_fetch_request(q); + if (pd_req) + break; + } + } while (pd_queue != old_pos); + + return pd_req != NULL; +} + static void run_fsm(void) { while (1) { @@ -418,8 +439,7 @@ static void run_fsm(void) spin_lock_irqsave(&pd_lock, saved_flags); if (!__blk_end_request_cur(pd_req, res == Ok ? 
0 : -EIO)) { - pd_req = blk_fetch_request(pd_queue); - if (!pd_req) + if (!set_next_request()) stop = 1; } spin_unlock_irqrestore(&pd_lock, saved_flags); @@ -839,7 +859,13 @@ static void pd_probe_drive(struct pd_unit *disk) p->first_minor = (disk - pd) << PD_BITS; disk->gd = p; p->private_data = disk; - p->queue = pd_queue; + p->queue = blk_init_queue(do_pd_request, &pd_lock); + if (!p->queue) { + disk->gd = NULL; + put_disk(p); + return; + } + blk_queue_max_hw_sectors(p->queue, cluster); if (disk->drive == -1) { for (disk->drive = 0; disk->drive <= 1; disk->drive++) @@ -919,26 +945,18 @@ static int __init pd_init(void) if (disable) goto out1; - pd_queue = blk_init_queue(do_pd_request, &pd_lock); - if (!pd_queue) - goto out1; - - blk_queue_max_hw_sectors(pd_queue, cluster); - if (register_blkdev(major, name)) - goto out2; + goto out1; printk("%s: %s version %s, major %d, cluster %d, nice %d\n", name, name, PD_VERSION, major, cluster, nice); if (!pd_detect()) - goto out3; + goto out2; return 0; -out3: - unregister_blkdev(major, name); out2: - blk_cleanup_queue(pd_queue); + unregister_blkdev(major, name); out1: return -ENODEV; } @@ -953,11 +971,11 @@ static void __exit pd_exit(void) if (p) { disk->gd = NULL; del_gendisk(p); + blk_cleanup_queue(p->queue); put_disk(p); pi_release(disk->pi); } } - blk_cleanup_queue(pd_queue); } MODULE_LICENSE("GPL"); diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 14c5d32f5d8b..f24ca7315ddc 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -287,6 +287,12 @@ static void __init pf_init_units(void) struct gendisk *disk = alloc_disk(1); if (!disk) continue; + disk->queue = blk_init_queue(do_pf_request, &pf_spin_lock); + if (!disk->queue) { + put_disk(disk); + return; + } + blk_queue_max_segments(disk->queue, cluster); pf->disk = disk; pf->pi = &pf->pia; pf->media_status = PF_NM; @@ -772,7 +778,28 @@ static int pf_ready(void) return (((status_reg(pf_current) & (STAT_BUSY | pf_mask)) == pf_mask)); } -static struct request_queue *pf_queue; +static int pf_queue; + +static int set_next_request(void) +{ + struct pf_unit *pf; + struct request_queue *q; + int old_pos = pf_queue; + + do { + pf = &units[pf_queue]; + q = pf->present ? 
pf->disk->queue : NULL; + if (++pf_queue == PF_UNITS) + pf_queue = 0; + if (q) { + pf_req = blk_fetch_request(q); + if (pf_req) + break; + } + } while (pf_queue != old_pos); + + return pf_req != NULL; +} static void pf_end_request(int err) { @@ -780,16 +807,13 @@ static void pf_end_request(int err) pf_req = NULL; } -static void do_pf_request(struct request_queue * q) +static void pf_request(void) { if (pf_busy) return; repeat: - if (!pf_req) { - pf_req = blk_fetch_request(q); - if (!pf_req) - return; - } + if (!pf_req && !set_next_request()) + return; pf_current = pf_req->rq_disk->private_data; pf_block = blk_rq_pos(pf_req); @@ -817,6 +841,11 @@ repeat: } } +static void do_pf_request(struct request_queue *q) +{ + pf_request(); +} + static int pf_next_buf(void) { unsigned long saved_flags; @@ -846,7 +875,7 @@ static inline void next_request(int err) spin_lock_irqsave(&pf_spin_lock, saved_flags); pf_end_request(err); pf_busy = 0; - do_pf_request(pf_queue); + pf_request(); spin_unlock_irqrestore(&pf_spin_lock, saved_flags); } @@ -972,15 +1001,6 @@ static int __init pf_init(void) put_disk(pf->disk); return -EBUSY; } - pf_queue = blk_init_queue(do_pf_request, &pf_spin_lock); - if (!pf_queue) { - unregister_blkdev(major, name); - for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) - put_disk(pf->disk); - return -ENOMEM; - } - - blk_queue_max_segments(pf_queue, cluster); for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { struct gendisk *disk = pf->disk; @@ -988,7 +1008,6 @@ static int __init pf_init(void) if (!pf->present) continue; disk->private_data = pf; - disk->queue = pf_queue; add_disk(disk); } return 0; @@ -1003,10 +1022,10 @@ static void __exit pf_exit(void) if (!pf->present) continue; del_gendisk(pf->disk); + blk_cleanup_queue(pf->disk->queue); put_disk(pf->disk); pi_release(pf->pi); } - blk_cleanup_queue(pf_queue); } MODULE_LICENSE("GPL"); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 517838b65964..f24ade333e0c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4317,7 +4317,7 @@ static int rbd_init_request(void *data, struct request *rq, return 0; } -static struct blk_mq_ops rbd_mq_ops = { +static const struct blk_mq_ops rbd_mq_ops = { .queue_rq = rbd_queue_rq, .init_request = rbd_init_request, }; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index b5afd495d482..3064be6cf375 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -211,7 +211,7 @@ enum head { struct swim_priv { struct swim __iomem *base; spinlock_t lock; - struct request_queue *queue; + int fdc_queue; int floppy_count; struct floppy_state unit[FD_MAX_UNIT]; }; @@ -525,12 +525,33 @@ static int floppy_read_sectors(struct floppy_state *fs, return 0; } -static void redo_fd_request(struct request_queue *q) +static struct request *swim_next_request(struct swim_priv *swd) { + struct request_queue *q; + struct request *rq; + int old_pos = swd->fdc_queue; + + do { + q = swd->unit[swd->fdc_queue].disk->queue; + if (++swd->fdc_queue == swd->floppy_count) + swd->fdc_queue = 0; + if (q) { + rq = blk_fetch_request(q); + if (rq) + return rq; + } + } while (swd->fdc_queue != old_pos); + + return NULL; +} + +static void do_fd_request(struct request_queue *q) +{ + struct swim_priv *swd = q->queuedata; struct request *req; struct floppy_state *fs; - req = blk_fetch_request(q); + req = swim_next_request(swd); while (req) { int err = -EIO; @@ -554,15 +575,10 @@ static void redo_fd_request(struct request_queue *q) } done: if (!__blk_end_request_cur(req, err)) - req = 
blk_fetch_request(q); + req = swim_next_request(swd); } } -static void do_fd_request(struct request_queue *q) -{ - redo_fd_request(q); -} - static struct floppy_struct floppy_type[4] = { { 0, 0, 0, 0, 0, 0x00, 0x00, 0x00, 0x00, NULL }, /* no testing */ { 720, 9, 1, 80, 0, 0x2A, 0x02, 0xDF, 0x50, NULL }, /* 360KB SS 3.5"*/ @@ -833,22 +849,25 @@ static int swim_floppy_init(struct swim_priv *swd) return -EBUSY; } + spin_lock_init(&swd->lock); + for (drive = 0; drive < swd->floppy_count; drive++) { swd->unit[drive].disk = alloc_disk(1); if (swd->unit[drive].disk == NULL) { err = -ENOMEM; goto exit_put_disks; } + swd->unit[drive].disk->queue = blk_init_queue(do_fd_request, + &swd->lock); + if (!swd->unit[drive].disk->queue) { + err = -ENOMEM; + put_disk(swd->unit[drive].disk); + goto exit_put_disks; + } + swd->unit[drive].disk->queue->queuedata = swd; swd->unit[drive].swd = swd; } - spin_lock_init(&swd->lock); - swd->queue = blk_init_queue(do_fd_request, &swd->lock); - if (!swd->queue) { - err = -ENOMEM; - goto exit_put_disks; - } - for (drive = 0; drive < swd->floppy_count; drive++) { swd->unit[drive].disk->flags = GENHD_FL_REMOVABLE; swd->unit[drive].disk->major = FLOPPY_MAJOR; @@ -856,7 +875,6 @@ static int swim_floppy_init(struct swim_priv *swd) sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive); swd->unit[drive].disk->fops = &floppy_fops; swd->unit[drive].disk->private_data = &swd->unit[drive]; - swd->unit[drive].disk->queue = swd->queue; set_capacity(swd->unit[drive].disk, 2880); add_disk(swd->unit[drive].disk); } @@ -943,13 +961,12 @@ static int swim_remove(struct platform_device *dev) for (drive = 0; drive < swd->floppy_count; drive++) { del_gendisk(swd->unit[drive].disk); + blk_cleanup_queue(swd->unit[drive].disk->queue); put_disk(swd->unit[drive].disk); } unregister_blkdev(FLOPPY_MAJOR, "fd"); - blk_cleanup_queue(swd->queue); - /* eject floppies */ for (drive = 0; drive < swd->floppy_count; drive++) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1d4c9f8bc1e1..2d8290169271 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -597,7 +597,7 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set) return blk_mq_virtio_map_queues(set, vblk->vdev, 0); } -static struct blk_mq_ops virtio_mq_ops = { +static const struct blk_mq_ops virtio_mq_ops = { .queue_rq = virtio_queue_rq, .complete = virtblk_request_done, .init_request = virtblk_init_request, diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5067a0a952cb..d137ef8a72be 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -907,7 +907,7 @@ out_busy: return BLK_MQ_RQ_QUEUE_BUSY; } -static struct blk_mq_ops blkfront_mq_ops = { +static const struct blk_mq_ops blkfront_mq_ops = { .queue_rq = blkif_queue_rq, }; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 0b081d170087..d19af1d21f4c 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -762,7 +762,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_MQ_RQ_QUEUE_OK; } -static struct blk_mq_ops dm_mq_ops = { +static const struct blk_mq_ops dm_mq_ops = { .queue_rq = dm_mq_queue_rq, .complete = dm_softirq_done, .init_request = dm_mq_init_request, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dfb75979e455..cd93a3b9ceca 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -810,7 +810,6 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ - 
trace_block_bio_complete(md->queue, bio, io_error); bio->bi_error = io_error; bio_endio(bio); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ed5cd705b985..7aeb9691c2e1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5031,8 +5031,6 @@ static void raid5_align_endio(struct bio *bi) rdev_dec_pending(rdev, conf->mddev); if (!error) { - trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), - raid_bi, 0); bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index c80869e60909..51f2be8889b5 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -347,7 +347,7 @@ static int ubiblock_init_request(void *data, struct request *req, return 0; } -static struct blk_mq_ops ubiblock_mq_ops = { +static const struct blk_mq_ops ubiblock_mq_ops = { .queue_rq = ubiblock_queue_rq, .init_request = ubiblock_init_request, }; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9583a5f58a1d..14fbe9d80067 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -49,10 +49,9 @@ unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); -unsigned int nvme_max_retries = 5; -module_param_named(max_retries, nvme_max_retries, uint, 0644); +static u8 nvme_max_retries = 5; +module_param_named(max_retries, nvme_max_retries, byte, 0644); MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); -EXPORT_SYMBOL_GPL(nvme_max_retries); static int nvme_char_major; module_param(nvme_char_major, int, 0); @@ -67,6 +66,41 @@ static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; +static inline bool nvme_req_needs_retry(struct request *req) +{ + if (blk_noretry_request(req)) + return false; + if (req->errors & NVME_SC_DNR) + return false; + if (jiffies - req->start_time >= req->timeout) + return false; + if (nvme_req(req)->retries >= nvme_max_retries) + return false; + return true; +} + +void nvme_complete_rq(struct request *req) +{ + int error = 0; + + if (unlikely(req->errors)) { + if (nvme_req_needs_retry(req)) { + nvme_req(req)->retries++; + blk_mq_requeue_request(req, + !blk_mq_queue_stopped(req->q)); + return; + } + + if (blk_rq_is_passthrough(req)) + error = req->errors; + else + error = nvme_error_status(req->errors); + } + + blk_mq_end_request(req, error); +} +EXPORT_SYMBOL_GPL(nvme_complete_rq); + void nvme_cancel_request(struct request *req, void *data, bool reserved) { int status; @@ -205,12 +239,6 @@ fail: return NULL; } -void nvme_requeue_req(struct request *req) -{ - blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q)); -} -EXPORT_SYMBOL_GPL(nvme_requeue_req); - struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags, int qid) { @@ -327,6 +355,11 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, { int ret = BLK_MQ_RQ_QUEUE_OK; + if (!(req->rq_flags & RQF_DONTPREP)) { + nvme_req(req)->retries = 0; + req->rq_flags |= RQF_DONTPREP; + } + switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: @@ -2386,7 +2419,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl) mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) - blk_mq_freeze_queue_start(ns->queue); + blk_freeze_queue_start(ns->queue); mutex_unlock(&ctrl->namespaces_mutex); } EXPORT_SYMBOL_GPL(nvme_start_freeze); diff --git 
a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 5b7386f69f4d..990e6fb32a63 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -471,6 +471,16 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) } EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); +bool nvmf_should_reconnect(struct nvme_ctrl *ctrl) +{ + if (ctrl->opts->max_reconnects != -1 && + ctrl->opts->nr_reconnects < ctrl->opts->max_reconnects) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(nvmf_should_reconnect); + /** * nvmf_register_transport() - NVMe Fabrics Library registration function. * @ops: Transport ops instance to be registered to the @@ -533,6 +543,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_QUEUE_SIZE, "queue_size=%d" }, { NVMF_OPT_NR_IO_QUEUES, "nr_io_queues=%d" }, { NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" }, + { NVMF_OPT_CTRL_LOSS_TMO, "ctrl_loss_tmo=%d" }, { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, @@ -546,6 +557,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, char *options, *o, *p; int token, ret = 0; size_t nqnlen = 0; + int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO; /* Set defaults */ opts->queue_size = NVMF_DEF_QUEUE_SIZE; @@ -655,6 +667,16 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->kato = token; break; + case NVMF_OPT_CTRL_LOSS_TMO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (token < 0) + pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n"); + ctrl_loss_tmo = token; + break; case NVMF_OPT_HOSTNQN: if (opts->host) { pr_err("hostnqn already user-assigned: %s\n", @@ -710,6 +732,12 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } } + if (ctrl_loss_tmo < 0) + opts->max_reconnects = -1; + else + opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, + opts->reconnect_delay); + if (!opts->host) { kref_get(&nvmf_default_host->ref); opts->host = nvmf_default_host; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 156018182ce4..f5a9c1fb186f 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -21,6 +21,8 @@ #define NVMF_MAX_QUEUE_SIZE 1024 #define NVMF_DEF_QUEUE_SIZE 128 #define NVMF_DEF_RECONNECT_DELAY 10 +/* default to 600 seconds of reconnect attempts before giving up */ +#define NVMF_DEF_CTRL_LOSS_TMO 600 /* * Define a host as seen by the target. We allocate one at boot, but also @@ -53,6 +55,7 @@ enum { NVMF_OPT_HOSTNQN = 1 << 8, NVMF_OPT_RECONNECT_DELAY = 1 << 9, NVMF_OPT_HOST_TRADDR = 1 << 10, + NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, }; /** @@ -77,6 +80,10 @@ enum { * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. * @kato: Keep-alive timeout. * @host: Virtual NVMe host, contains the NQN and Host ID. 
+ * @nr_reconnects: number of reconnect attempted since the last ctrl failure + * @max_reconnects: maximum number of allowed reconnect attempts before removing + * the controller, (-1) means reconnect forever, zero means remove + * immediately; */ struct nvmf_ctrl_options { unsigned mask; @@ -91,6 +98,8 @@ struct nvmf_ctrl_options { bool discovery_nqn; unsigned int kato; struct nvmf_host *host; + int nr_reconnects; + int max_reconnects; }; /* @@ -133,5 +142,6 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); +bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); #endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 9690beb15e69..fc42172c796a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -848,11 +848,12 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl, /* validate the ACC response */ if (assoc_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC) fcret = VERR_LSACC; - if (assoc_acc->hdr.desc_list_len != + else if (assoc_acc->hdr.desc_list_len != fcnvme_lsdesc_len( sizeof(struct fcnvme_ls_cr_assoc_acc))) fcret = VERR_CR_ASSOC_ACC_LEN; - if (assoc_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST)) + else if (assoc_acc->hdr.rqst.desc_tag != + cpu_to_be32(FCNVME_LSDESC_RQST)) fcret = VERR_LSDESC_RQST; else if (assoc_acc->hdr.rqst.desc_len != fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst))) @@ -955,10 +956,10 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, /* validate the ACC response */ if (conn_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC) fcret = VERR_LSACC; - if (conn_acc->hdr.desc_list_len != + else if (conn_acc->hdr.desc_list_len != fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc))) fcret = VERR_CR_CONN_ACC_LEN; - if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST)) + else if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST)) fcret = VERR_LSDESC_RQST; else if (conn_acc->hdr.rqst.desc_len != fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst))) @@ -1146,7 +1147,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) struct nvme_fc_ctrl *ctrl = op->ctrl; struct nvme_fc_queue *queue = op->queue; struct nvme_completion *cqe = &op->rsp_iu.cqe; - u16 status; + u16 status = NVME_SC_SUCCESS; /* * WARNING: @@ -1182,8 +1183,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) if (atomic_read(&op->state) == FCPOP_STATE_ABORTED) status = NVME_SC_ABORT_REQ | NVME_SC_DNR; - else - status = freq->status; + else if (freq->status) + status = NVME_SC_FC_TRANSPORT_ERROR; /* * For the linux implementation, if we have an unsuccesful @@ -1211,7 +1212,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) */ if (freq->transferred_length != be32_to_cpu(op->cmd_iu.data_len)) { - status = -EIO; + status = NVME_SC_FC_TRANSPORT_ERROR; goto done; } op->nreq.result.u64 = 0; @@ -1226,8 +1227,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) (freq->rcv_rsplen / 4) || be32_to_cpu(op->rsp_iu.xfrd_len) != freq->transferred_length || + op->rsp_iu.status_code || op->rqno != le16_to_cpu(cqe->command_id))) { - status = -EIO; + status = NVME_SC_FC_TRANSPORT_ERROR; goto done; } op->nreq.result = cqe->result; @@ -1235,7 +1237,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) break; default: - status = -EIO; + status = NVME_SC_FC_TRANSPORT_ERROR; goto done; } @@ -1761,7 +1763,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl 
*ctrl, struct nvme_fc_queue *queue, op->fcp_req.io_dir = io_dir; op->fcp_req.transferred_length = 0; op->fcp_req.rcv_rsplen = 0; - op->fcp_req.status = 0; + op->fcp_req.status = NVME_SC_SUCCESS; op->fcp_req.sqid = cpu_to_le16(queue->qnum); /* @@ -1923,32 +1925,18 @@ nvme_fc_complete_rq(struct request *rq) { struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); struct nvme_fc_ctrl *ctrl = op->ctrl; - int error = 0, state; + int state; state = atomic_xchg(&op->state, FCPOP_STATE_IDLE); nvme_cleanup_cmd(rq); - nvme_fc_unmap_data(ctrl, rq, op); - - if (unlikely(rq->errors)) { - if (nvme_req_needs_retry(rq, rq->errors)) { - nvme_requeue_req(rq); - return; - } - - if (blk_rq_is_passthrough(rq)) - error = rq->errors; - else - error = nvme_error_status(rq->errors); - } - + nvme_complete_rq(rq); nvme_fc_ctrl_put(ctrl); - blk_mq_end_request(rq, error); } -static struct blk_mq_ops nvme_fc_mq_ops = { +static const struct blk_mq_ops nvme_fc_mq_ops = { .queue_rq = nvme_fc_queue_rq, .complete = nvme_fc_complete_rq, .init_request = nvme_fc_init_request, @@ -1959,7 +1947,7 @@ static struct blk_mq_ops nvme_fc_mq_ops = { .timeout = nvme_fc_timeout, }; -static struct blk_mq_ops nvme_fc_admin_mq_ops = { +static const struct blk_mq_ops nvme_fc_admin_mq_ops = { .queue_rq = nvme_fc_queue_rq, .complete = nvme_fc_complete_rq, .init_request = nvme_fc_init_admin_request, @@ -2546,11 +2534,20 @@ static struct nvmf_transport_ops nvme_fc_transport = { static int __init nvme_fc_init_module(void) { + int ret; + nvme_fc_wq = create_workqueue("nvme_fc_wq"); if (!nvme_fc_wq) return -ENOMEM; - return nvmf_register_transport(&nvme_fc_transport); + ret = nvmf_register_transport(&nvme_fc_transport); + if (ret) + goto err; + + return 0; +err: + destroy_workqueue(nvme_fc_wq); + return ret; } static void __exit nvme_fc_exit_module(void) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2aa20e3e5675..9eecb67177df 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -43,8 +43,6 @@ extern unsigned char shutdown_timeout; #define NVME_DEFAULT_KATO 5 #define NVME_KATO_GRACE 10 -extern unsigned int nvme_max_retries; - enum { NVME_NS_LBA = 0, NVME_NS_LIGHTNVM = 1, @@ -92,6 +90,7 @@ enum nvme_quirks { struct nvme_request { struct nvme_command *cmd; union nvme_result result; + u8 retries; }; static inline struct nvme_request *nvme_req(struct request *req) @@ -261,13 +260,7 @@ static inline int nvme_error_status(u16 status) } } -static inline bool nvme_req_needs_retry(struct request *req, u16 status) -{ - return !(status & NVME_SC_DNR || blk_noretry_request(req)) && - (jiffies - req->start_time) < req->timeout && - req->retries < nvme_max_retries; -} - +void nvme_complete_rq(struct request *req); void nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); @@ -302,7 +295,6 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags, int qid); -void nvme_requeue_req(struct request *req); int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 26a5fd05fe88..3818ab15a26e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -326,10 +326,6 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) 
iod->nents = 0; iod->length = size; - if (!(rq->rq_flags & RQF_DONTPREP)) { - rq->retries = 0; - rq->rq_flags |= RQF_DONTPREP; - } return BLK_MQ_RQ_QUEUE_OK; } @@ -628,34 +624,12 @@ out_free_cmd: return ret; } -static void nvme_complete_rq(struct request *req) +static void nvme_pci_complete_rq(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_dev *dev = iod->nvmeq->dev; - int error = 0; - - nvme_unmap_data(dev, req); - - if (unlikely(req->errors)) { - if (nvme_req_needs_retry(req, req->errors)) { - req->retries++; - nvme_requeue_req(req); - return; - } - - if (blk_rq_is_passthrough(req)) - error = req->errors; - else - error = nvme_error_status(req->errors); - } - - if (unlikely(iod->aborted)) { - dev_warn(dev->ctrl.device, - "completing aborted command with status: %04x\n", - req->errors); - } - blk_mq_end_request(req, error); + nvme_unmap_data(iod->nvmeq->dev, req); + nvme_complete_rq(req); } /* We read the CQE phase first to check if the rest of the entry is valid */ @@ -1129,18 +1103,18 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) return result; } -static struct blk_mq_ops nvme_mq_admin_ops = { +static const struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_queue_rq, - .complete = nvme_complete_rq, + .complete = nvme_pci_complete_rq, .init_hctx = nvme_admin_init_hctx, .exit_hctx = nvme_admin_exit_hctx, .init_request = nvme_admin_init_request, .timeout = nvme_timeout, }; -static struct blk_mq_ops nvme_mq_ops = { +static const struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, - .complete = nvme_complete_rq, + .complete = nvme_pci_complete_rq, .init_hctx = nvme_init_hctx, .init_request = nvme_init_request, .map_queues = nvme_pci_map_queues, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 47a479f26e5d..4aae363943e3 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -34,7 +34,7 @@ #include "fabrics.h" -#define NVME_RDMA_CONNECT_TIMEOUT_MS 1000 /* 1 second */ +#define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 second */ #define NVME_RDMA_MAX_SEGMENT_SIZE 0xffffff /* 24-bit SGL field */ @@ -118,7 +118,6 @@ struct nvme_rdma_ctrl { struct nvme_rdma_qe async_event_sqe; - int reconnect_delay; struct delayed_work reconnect_work; struct list_head list; @@ -129,14 +128,8 @@ struct nvme_rdma_ctrl { u64 cap; u32 max_fr_pages; - union { - struct sockaddr addr; - struct sockaddr_in addr_in; - }; - union { - struct sockaddr src_addr; - struct sockaddr_in src_addr_in; - }; + struct sockaddr_storage addr; + struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; }; @@ -569,11 +562,12 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, return PTR_ERR(queue->cm_id); } - queue->cm_error = -ETIMEDOUT; if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) - src_addr = &ctrl->src_addr; + src_addr = (struct sockaddr *)&ctrl->src_addr; - ret = rdma_resolve_addr(queue->cm_id, src_addr, &ctrl->addr, + queue->cm_error = -ETIMEDOUT; + ret = rdma_resolve_addr(queue->cm_id, src_addr, + (struct sockaddr *)&ctrl->addr, NVME_RDMA_CONNECT_TIMEOUT_MS); if (ret) { dev_info(ctrl->ctrl.device, @@ -712,6 +706,26 @@ free_ctrl: kfree(ctrl); } +static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) +{ + /* If we are resetting/deleting then do nothing */ + if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) { + WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || + ctrl->ctrl.state == NVME_CTRL_LIVE); + return; + } + + if (nvmf_should_reconnect(&ctrl->ctrl)) { + dev_info(ctrl->ctrl.device, "Reconnecting 
in %d seconds...\n", + ctrl->ctrl.opts->reconnect_delay); + queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, + ctrl->ctrl.opts->reconnect_delay * HZ); + } else { + dev_info(ctrl->ctrl.device, "Removing controller...\n"); + queue_work(nvme_rdma_wq, &ctrl->delete_work); + } +} + static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), @@ -719,6 +733,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) bool changed; int ret; + ++ctrl->ctrl.opts->nr_reconnects; + if (ctrl->queue_count > 1) { nvme_rdma_free_io_queues(ctrl); @@ -763,6 +779,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); + ctrl->ctrl.opts->nr_reconnects = 0; if (ctrl->queue_count > 1) { nvme_start_queues(&ctrl->ctrl); @@ -777,13 +794,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) stop_admin_q: blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); requeue: - /* Make sure we are not resetting/deleting */ - if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) { - dev_info(ctrl->ctrl.device, - "Failed reconnect attempt, requeueing...\n"); - queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, - ctrl->reconnect_delay * HZ); - } + dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", + ctrl->ctrl.opts->nr_reconnects); + nvme_rdma_reconnect_or_remove(ctrl); } static void nvme_rdma_error_recovery_work(struct work_struct *work) @@ -810,11 +823,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); - dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n", - ctrl->reconnect_delay); - - queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, - ctrl->reconnect_delay * HZ); + nvme_rdma_reconnect_or_remove(ctrl); } static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) @@ -1509,27 +1518,12 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) static void nvme_rdma_complete_rq(struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_rdma_queue *queue = req->queue; - int error = 0; - - nvme_rdma_unmap_data(queue, rq); - if (unlikely(rq->errors)) { - if (nvme_req_needs_retry(rq, rq->errors)) { - nvme_requeue_req(rq); - return; - } - - if (blk_rq_is_passthrough(rq)) - error = rq->errors; - else - error = nvme_error_status(rq->errors); - } - - blk_mq_end_request(rq, error); + nvme_rdma_unmap_data(req->queue, rq); + nvme_complete_rq(rq); } -static struct blk_mq_ops nvme_rdma_mq_ops = { +static const struct blk_mq_ops nvme_rdma_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, .init_request = nvme_rdma_init_request, @@ -1540,7 +1534,7 @@ static struct blk_mq_ops nvme_rdma_mq_ops = { .timeout = nvme_rdma_timeout, }; -static struct blk_mq_ops nvme_rdma_admin_mq_ops = { +static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, .init_request = nvme_rdma_init_admin_request, @@ -1857,27 +1851,13 @@ out_free_io_queues: return ret; } -static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p) -{ - u8 *addr = (u8 *)&in_addr->sin_addr.s_addr; - size_t buflen = strlen(p); - - /* XXX: handle IPv6 addresses */ - - if (buflen > INET_ADDRSTRLEN) - return -EINVAL; - if (in4_pton(p, buflen, addr, '\0', NULL) == 0) - return -EINVAL; - in_addr->sin_family = 
AF_INET; - return 0; -} - static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { struct nvme_rdma_ctrl *ctrl; int ret; bool changed; + char *port; ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); if (!ctrl) @@ -1885,40 +1865,33 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, ctrl->ctrl.opts = opts; INIT_LIST_HEAD(&ctrl->list); - ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr); + if (opts->mask & NVMF_OPT_TRSVCID) + port = opts->trsvcid; + else + port = __stringify(NVME_RDMA_IP_PORT); + + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->traddr, port, &ctrl->addr); if (ret) { - pr_err("malformed IP address passed: %s\n", opts->traddr); + pr_err("malformed address passed: %s:%s\n", opts->traddr, port); goto out_free_ctrl; } if (opts->mask & NVMF_OPT_HOST_TRADDR) { - ret = nvme_rdma_parse_ipaddr(&ctrl->src_addr_in, - opts->host_traddr); + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->host_traddr, NULL, &ctrl->src_addr); if (ret) { - pr_err("malformed src IP address passed: %s\n", + pr_err("malformed src address passed: %s\n", opts->host_traddr); goto out_free_ctrl; } } - if (opts->mask & NVMF_OPT_TRSVCID) { - u16 port; - - ret = kstrtou16(opts->trsvcid, 0, &port); - if (ret) - goto out_free_ctrl; - - ctrl->addr_in.sin_port = cpu_to_be16(port); - } else { - ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT); - } - ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, 0 /* no quirks, we're perfect! */); if (ret) goto out_free_ctrl; - ctrl->reconnect_delay = opts->reconnect_delay; INIT_DELAYED_WORK(&ctrl->reconnect_work, nvme_rdma_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); @@ -1977,7 +1950,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); kref_get(&ctrl->ctrl.kref); @@ -2013,7 +1986,7 @@ static struct nvmf_transport_ops nvme_rdma_transport = { .name = "rdma", .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | - NVMF_OPT_HOST_TRADDR, + NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, .create_ctrl = nvme_rdma_create_ctrl, }; @@ -2055,12 +2028,20 @@ static int __init nvme_rdma_init_module(void) return -ENOMEM; ret = ib_register_client(&nvme_rdma_ib_client); - if (ret) { - destroy_workqueue(nvme_rdma_wq); - return ret; - } + if (ret) + goto err_destroy_wq; + + ret = nvmf_register_transport(&nvme_rdma_transport); + if (ret) + goto err_unreg_client; + + return 0; - return nvmf_register_transport(&nvme_rdma_transport); +err_unreg_client: + ib_unregister_client(&nvme_rdma_ib_client); +err_destroy_wq: + destroy_workqueue(nvme_rdma_wq); + return ret; } static void __exit nvme_rdma_cleanup_module(void) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 76450b0c55f1..ff1f97006322 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -121,7 +121,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) } switch (req->cmd->get_log_page.lid) { - case 0x01: + case NVME_LOG_ERROR: /* * We currently never set the More bit in the status field, * so all error log entries are invalid and can be zeroed out. 
@@ -129,7 +129,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) * mandatory log page. */ break; - case 0x02: + case NVME_LOG_SMART: /* * XXX: fill out actual smart log * @@ -149,7 +149,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) goto err; } break; - case 0x03: + case NVME_LOG_FW_SLOT: /* * We only support a single firmware slot which always is * active, so we can zero out the whole firmware slot log and @@ -480,31 +480,25 @@ static void nvmet_execute_keep_alive(struct nvmet_req *req) nvmet_req_complete(req, 0); } -int nvmet_parse_admin_cmd(struct nvmet_req *req) +u16 nvmet_parse_admin_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; + u16 ret; req->ns = NULL; - if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { - pr_err("nvmet: got admin cmd %d while CC.EN == 0\n", - cmd->common.opcode); - return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; - } - if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { - pr_err("nvmet: got admin cmd %d while CSTS.RDY == 0\n", - cmd->common.opcode); - return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; - } + ret = nvmet_check_ctrl_status(req, cmd); + if (unlikely(ret)) + return ret; switch (cmd->common.opcode) { case nvme_admin_get_log_page: req->data_len = nvmet_get_log_page_len(cmd); switch (cmd->get_log_page.lid) { - case 0x01: - case 0x02: - case 0x03: + case NVME_LOG_ERROR: + case NVME_LOG_SMART: + case NVME_LOG_FW_SLOT: req->execute = nvmet_execute_get_log_page; return 0; } @@ -545,6 +539,7 @@ int nvmet_parse_admin_cmd(struct nvmet_req *req) return 0; } - pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, + req->sq->qid); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 798653b329b2..cf90713043da 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -273,8 +273,8 @@ int nvmet_ns_enable(struct nvmet_ns *ns) ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE, NULL); if (IS_ERR(ns->bdev)) { - pr_err("nvmet: failed to open block device %s: (%ld)\n", - ns->device_path, PTR_ERR(ns->bdev)); + pr_err("failed to open block device %s: (%ld)\n", + ns->device_path, PTR_ERR(ns->bdev)); ret = PTR_ERR(ns->bdev); ns->bdev = NULL; goto out_unlock; @@ -661,6 +661,23 @@ out: return status; } +u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd) +{ + if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { + pr_err("got io cmd %d while CC.EN == 0 on qid = %d\n", + cmd->common.opcode, req->sq->qid); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("got io cmd %d while CSTS.RDY == 0 on qid = %d\n", + cmd->common.opcode, req->sq->qid); + req->ns = NULL; + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + return 0; +} + static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn) { diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index af8aabf05335..1aaf597e81fc 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -159,15 +159,15 @@ out: nvmet_req_complete(req, status); } -int nvmet_parse_discovery_cmd(struct nvmet_req *req) +u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; req->ns = NULL; if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { - pr_err("nvmet: got cmd %d while not ready\n", - cmd->common.opcode); + pr_err("got cmd %d 
while not ready\n", + cmd->common.opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } @@ -180,8 +180,8 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req) req->execute = nvmet_execute_get_disc_log_page; return 0; default: - pr_err("nvmet: unsupported get_log_page lid %d\n", - cmd->get_log_page.lid); + pr_err("unsupported get_log_page lid %d\n", + cmd->get_log_page.lid); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } case nvme_admin_identify: @@ -192,17 +192,16 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req) nvmet_execute_identify_disc_ctrl; return 0; default: - pr_err("nvmet: unsupported identify cns %d\n", - cmd->identify.cns); + pr_err("unsupported identify cns %d\n", + cmd->identify.cns); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } default: - pr_err("nvmet: unsupported cmd %d\n", - cmd->common.opcode); + pr_err("unsupported cmd %d\n", cmd->common.opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } - pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + pr_err("unhandled cmd %d\n", cmd->common.opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 8bd022af3df6..2a3c15b57f6e 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -73,7 +73,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req) nvmet_req_complete(req, status); } -int nvmet_parse_fabrics_cmd(struct nvmet_req *req) +u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; @@ -214,7 +214,7 @@ out_ctrl_put: goto out; } -int nvmet_parse_connect_cmd(struct nvmet_req *req) +u16 nvmet_parse_connect_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 8f483ee7868c..2c0709f0a7d3 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1189,8 +1189,8 @@ nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport, validation_errors[ret]); iod->lsreq->rsplen = nvmet_fc_format_rjt(acc, NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd, - ELS_RJT_LOGIC, - ELS_EXPL_NONE, 0); + FCNVME_RJT_RC_LOGIC, + FCNVME_RJT_EXP_NONE, 0); return; } @@ -1281,8 +1281,9 @@ nvmet_fc_ls_create_connection(struct nvmet_fc_tgtport *tgtport, iod->lsreq->rsplen = nvmet_fc_format_rjt(acc, NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd, (ret == VERR_NO_ASSOC) ? - ELS_RJT_PROT : ELS_RJT_LOGIC, - ELS_EXPL_NONE, 0); + FCNVME_RJT_RC_INV_ASSOC : + FCNVME_RJT_RC_LOGIC, + FCNVME_RJT_EXP_NONE, 0); return; } @@ -1369,8 +1370,12 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport, validation_errors[ret]); iod->lsreq->rsplen = nvmet_fc_format_rjt(acc, NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd, - (ret == 8) ? ELS_RJT_PROT : ELS_RJT_LOGIC, - ELS_EXPL_NONE, 0); + (ret == VERR_NO_ASSOC) ? + FCNVME_RJT_RC_INV_ASSOC : + (ret == VERR_NO_CONN) ? 
+ FCNVME_RJT_RC_INV_CONN : + FCNVME_RJT_RC_LOGIC, + FCNVME_RJT_EXP_NONE, 0); return; } @@ -1479,7 +1484,7 @@ nvmet_fc_handle_ls_rqst(struct nvmet_fc_tgtport *tgtport, default: iod->lsreq->rsplen = nvmet_fc_format_rjt(iod->rspbuf, NVME_FC_MAX_LS_BUFFER_SIZE, w0->ls_cmd, - ELS_RJT_INVAL, ELS_EXPL_NONE, 0); + FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0); } nvmet_fc_xmt_ls_rsp(tgtport, iod); @@ -1619,6 +1624,8 @@ nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod) for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count) __free_page(sg_page(sg)); kfree(fod->data_sg); + fod->data_sg = NULL; + fod->data_sg_cnt = 0; } diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 6b0baa9caab9..a4b455156bb5 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -196,26 +196,19 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req) } } -int nvmet_parse_io_cmd(struct nvmet_req *req) +u16 nvmet_parse_io_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; + u16 ret; - if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { - pr_err("nvmet: got io cmd %d while CC.EN == 0\n", - cmd->common.opcode); + ret = nvmet_check_ctrl_status(req, cmd); + if (unlikely(ret)) { req->ns = NULL; - return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; - } - - if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { - pr_err("nvmet: got io cmd %d while CSTS.RDY == 0\n", - cmd->common.opcode); - req->ns = NULL; - return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + return ret; } req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); - if (!req->ns) + if (unlikely(!req->ns)) return NVME_SC_INVALID_NS | NVME_SC_DNR; switch (cmd->common.opcode) { @@ -237,7 +230,8 @@ int nvmet_parse_io_cmd(struct nvmet_req *req) req->execute = nvmet_execute_write_zeroes; return 0; default: - pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, + req->sq->qid); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 22f7bc6bac7f..33b431e4eec3 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -13,12 +13,10 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/scatterlist.h> -#include <linux/delay.h> #include <linux/blk-mq.h> #include <linux/nvme.h> #include <linux/module.h> #include <linux/parser.h> -#include <linux/t10-pi.h> #include "nvmet.h" #include "../host/nvme.h" #include "../host/fabrics.h" @@ -93,31 +91,26 @@ static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue) static void nvme_loop_complete_rq(struct request *req) { struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); - int error = 0; nvme_cleanup_cmd(req); sg_free_table_chained(&iod->sg_table, true); + nvme_complete_rq(req); +} - if (unlikely(req->errors)) { - if (nvme_req_needs_retry(req, req->errors)) { - nvme_requeue_req(req); - return; - } - - if (blk_rq_is_passthrough(req)) - error = req->errors; - else - error = nvme_error_status(req->errors); - } +static struct blk_mq_tags *nvme_loop_tagset(struct nvme_loop_queue *queue) +{ + u32 queue_idx = nvme_loop_queue_idx(queue); - blk_mq_end_request(req, error); + if (queue_idx == 0) + return queue->ctrl->admin_tag_set.tags[queue_idx]; + return queue->ctrl->tag_set.tags[queue_idx - 1]; } static void nvme_loop_queue_response(struct nvmet_req *req) { - struct nvme_loop_iod *iod = - container_of(req, struct nvme_loop_iod, req); - struct nvme_completion *cqe = &iod->rsp; + struct nvme_loop_queue *queue = + 
container_of(req->sq, struct nvme_loop_queue, nvme_sq); + struct nvme_completion *cqe = req->rsp; /* * AEN requests are special as they don't time out and can @@ -125,13 +118,23 @@ static void nvme_loop_queue_response(struct nvmet_req *req) * aborts. We don't even bother to allocate a struct request * for them but rather special case them here. */ - if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 && + if (unlikely(nvme_loop_queue_idx(queue) == 0 && cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) { - nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe->status, + nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); } else { - struct request *rq = blk_mq_rq_from_pdu(iod); + struct request *rq; + struct nvme_loop_iod *iod; + + rq = blk_mq_tag_to_rq(nvme_loop_tagset(queue), cqe->command_id); + if (!rq) { + dev_err(queue->ctrl->ctrl.device, + "tag 0x%x on queue %d not found\n", + cqe->command_id, nvme_loop_queue_idx(queue)); + return; + } + iod = blk_mq_rq_to_pdu(rq); iod->nvme_req.result = cqe->result; blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1); } @@ -268,7 +271,7 @@ static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, return 0; } -static struct blk_mq_ops nvme_loop_mq_ops = { +static const struct blk_mq_ops nvme_loop_mq_ops = { .queue_rq = nvme_loop_queue_rq, .complete = nvme_loop_complete_rq, .init_request = nvme_loop_init_request, @@ -276,7 +279,7 @@ static struct blk_mq_ops nvme_loop_mq_ops = { .timeout = nvme_loop_timeout, }; -static struct blk_mq_ops nvme_loop_admin_mq_ops = { +static const struct blk_mq_ops nvme_loop_admin_mq_ops = { .queue_rq = nvme_loop_queue_rq, .complete = nvme_loop_complete_rq, .init_request = nvme_loop_init_admin_request, @@ -349,6 +352,19 @@ out_destroy_queues: return ret; } +static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl) +{ + int i, ret; + + for (i = 1; i < ctrl->queue_count; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + return ret; + } + + return 0; +} + static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) { int error; @@ -490,7 +506,7 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) struct nvme_loop_ctrl *ctrl = container_of(work, struct nvme_loop_ctrl, reset_work); bool changed; - int i, ret; + int ret; nvme_loop_shutdown_ctrl(ctrl); @@ -502,11 +518,9 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) if (ret) goto out_destroy_admin; - for (i = 1; i < ctrl->queue_count; i++) { - ret = nvmf_connect_io_queue(&ctrl->ctrl, i); - if (ret) - goto out_destroy_io; - } + ret = nvme_loop_connect_io_queues(ctrl); + if (ret) + goto out_destroy_io; changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); @@ -559,7 +573,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) { - int ret, i; + int ret; ret = nvme_loop_init_io_queues(ctrl); if (ret) @@ -588,11 +602,9 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) goto out_free_tagset; } - for (i = 1; i < ctrl->queue_count; i++) { - ret = nvmf_connect_io_queue(&ctrl->ctrl, i); - if (ret) - goto out_cleanup_connect_q; - } + ret = nvme_loop_connect_io_queues(ctrl); + if (ret) + goto out_cleanup_connect_q; return 0; @@ -736,7 +748,12 @@ static int __init nvme_loop_init_module(void) ret = nvmet_register_transport(&nvme_loop_ops); if (ret) return ret; - return nvmf_register_transport(&nvme_loop_transport); + + ret = 
nvmf_register_transport(&nvme_loop_transport); + if (ret) + nvmet_unregister_transport(&nvme_loop_ops); + + return ret; } static void __exit nvme_loop_cleanup_module(void) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index f7ff15f17ca9..7cb77ba5993b 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -253,11 +253,11 @@ struct nvmet_async_event { u8 log_page; }; -int nvmet_parse_connect_cmd(struct nvmet_req *req); -int nvmet_parse_io_cmd(struct nvmet_req *req); -int nvmet_parse_admin_cmd(struct nvmet_req *req); -int nvmet_parse_discovery_cmd(struct nvmet_req *req); -int nvmet_parse_fabrics_cmd(struct nvmet_req *req); +u16 nvmet_parse_connect_cmd(struct nvmet_req *req); +u16 nvmet_parse_io_cmd(struct nvmet_req *req); +u16 nvmet_parse_admin_cmd(struct nvmet_req *req); +u16 nvmet_parse_discovery_cmd(struct nvmet_req *req); +u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops); @@ -278,6 +278,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, struct nvmet_req *req, struct nvmet_ctrl **ret); void nvmet_ctrl_put(struct nvmet_ctrl *ctrl); +u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd); struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, enum nvme_subsys_type type); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index ecc4fe862561..99c69018a35f 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1199,6 +1199,11 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, } queue->port = cm_id->context; + if (queue->host_qid == 0) { + /* Let inflight controller teardown complete */ + flush_scheduled_work(); + } + ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); if (ret) goto release_queue; @@ -1427,12 +1432,16 @@ restart: static int nvmet_rdma_add_port(struct nvmet_port *port) { struct rdma_cm_id *cm_id; - struct sockaddr_in addr_in; - u16 port_in; + struct sockaddr_storage addr = { }; + __kernel_sa_family_t af; int ret; switch (port->disc_addr.adrfam) { case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; break; default: pr_err("address family %d not supported\n", @@ -1440,13 +1449,13 @@ static int nvmet_rdma_add_port(struct nvmet_port *port) return -EINVAL; } - ret = kstrtou16(port->disc_addr.trsvcid, 0, &port_in); - if (ret) + ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, + port->disc_addr.trsvcid, &addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + port->disc_addr.traddr, port->disc_addr.trsvcid); return ret; - - addr_in.sin_family = AF_INET; - addr_in.sin_addr.s_addr = in_aton(port->disc_addr.traddr); - addr_in.sin_port = htons(port_in); + } cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, RDMA_PS_TCP, IB_QPT_RC); @@ -1455,20 +1464,32 @@ static int nvmet_rdma_add_port(struct nvmet_port *port) return PTR_ERR(cm_id); } - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr_in); + /* + * Allow both IPv4 and IPv6 sockets to bind a single port + * at the same time. 
+ */ + ret = rdma_set_afonly(cm_id, 1); + if (ret) { + pr_err("rdma_set_afonly failed (%d)\n", ret); + goto out_destroy_id; + } + + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr); if (ret) { - pr_err("binding CM ID to %pISpc failed (%d)\n", &addr_in, ret); + pr_err("binding CM ID to %pISpcs failed (%d)\n", + (struct sockaddr *)&addr, ret); goto out_destroy_id; } ret = rdma_listen(cm_id, 128); if (ret) { - pr_err("listening to %pISpc failed (%d)\n", &addr_in, ret); + pr_err("listening to %pISpcs failed (%d)\n", + (struct sockaddr *)&addr, ret); goto out_destroy_id; } - pr_info("enabling port %d (%pISpc)\n", - le16_to_cpu(port->disc_addr.portid), &addr_in); + pr_info("enabling port %d (%pISpcs)\n", + le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr); port->priv = cm_id; return 0; diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index 6ff61dad5e21..62fed9dc893e 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -183,11 +183,33 @@ static void jsfd_read(char *buf, unsigned long p, size_t togo) { } } -static void jsfd_do_request(struct request_queue *q) +static int jsfd_queue; + +static struct request *jsfd_next_request(void) +{ + struct request_queue *q; + struct request *rq; + int old_pos = jsfd_queue; + + do { + q = jsfd_disk[jsfd_queue]->queue; + if (++jsfd_queue == JSF_MAX) + jsfd_queue = 0; + if (q) { + rq = blk_fetch_request(q); + if (rq) + return rq; + } + } while (jsfd_queue != old_pos); + + return NULL; +} + +static void jsfd_request(void) { struct request *req; - req = blk_fetch_request(q); + req = jsfd_next_request(); while (req) { struct jsfd_part *jdp = req->rq_disk->private_data; unsigned long offset = blk_rq_pos(req) << 9; @@ -211,10 +233,15 @@ static void jsfd_do_request(struct request_queue *q) err = 0; end: if (!__blk_end_request_cur(req, err)) - req = blk_fetch_request(q); + req = jsfd_next_request(); } } +static void jsfd_do_request(struct request_queue *q) +{ + jsfd_request(); +} + /* * The memory devices use the full 32/64 bits of the offset, and so we cannot * check against negative addresses: they are ok. 
The return value is weird, @@ -544,8 +571,6 @@ static int jsflash_init(void) return 0; } -static struct request_queue *jsf_queue; - static int jsfd_init(void) { static DEFINE_SPINLOCK(lock); @@ -562,6 +587,11 @@ static int jsfd_init(void) struct gendisk *disk = alloc_disk(1); if (!disk) goto out; + disk->queue = blk_init_queue(jsfd_do_request, &lock); + if (!disk->queue) { + put_disk(disk); + goto out; + } jsfd_disk[i] = disk; } @@ -570,13 +600,6 @@ static int jsfd_init(void) goto out; } - jsf_queue = blk_init_queue(jsfd_do_request, &lock); - if (!jsf_queue) { - err = -ENOMEM; - unregister_blkdev(JSFD_MAJOR, "jsfd"); - goto out; - } - for (i = 0; i < JSF_MAX; i++) { struct gendisk *disk = jsfd_disk[i]; if ((i & JSF_PART_MASK) >= JSF_NPART) continue; @@ -589,7 +612,6 @@ static int jsfd_init(void) disk->fops = &jsfd_fops; set_capacity(disk, jdp->dsize >> 9); disk->private_data = jdp; - disk->queue = jsf_queue; add_disk(disk); set_disk_ro(disk, 1); } @@ -619,6 +641,7 @@ static void __exit jsflash_cleanup_module(void) for (i = 0; i < JSF_MAX; i++) { if ((i & JSF_PART_MASK) >= JSF_NPART) continue; del_gendisk(jsfd_disk[i]); + blk_cleanup_queue(jsfd_disk[i]->queue); put_disk(jsfd_disk[i]); } if (jsf0.busy) @@ -628,7 +651,6 @@ static void __exit jsflash_cleanup_module(void) misc_deregister(&jsf_dev); unregister_blkdev(JSFD_MAJOR, "jsfd"); - blk_cleanup_queue(jsf_queue); } module_init(jsflash_init_module); diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 6903f03c88af..9d0727b2bdec 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1602,7 +1602,7 @@ static int _init_blk_request(struct osd_request *or, req->rq_flags |= RQF_QUIET; req->timeout = or->timeout; - req->retries = or->retries; + scsi_req(req)->retries = or->retries; if (has_out) { or->out.req = req; diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index c47f4b349bac..41bc1d64bf86 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -414,7 +414,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd, memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */ memcpy(rq->cmd, cmd, rq->cmd_len); req->timeout = timeout; - req->retries = retries; + rq->retries = retries; req->end_io_data = SRpnt; blk_execute_rq_nowait(req->q, NULL, req, 1, osst_end_async); diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f2cafae150bc..2db412dd4b44 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1988,7 +1988,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) req->rq_flags |= RQF_QUIET; req->timeout = 10 * HZ; - req->retries = 5; + rq->retries = 5; blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done); } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index e5a2d590a104..7bc4513bf4e4 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -256,7 +256,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, rq->cmd_len = COMMAND_SIZE(cmd[0]); memcpy(rq->cmd, cmd, rq->cmd_len); - req->retries = retries; + rq->retries = retries; req->timeout = timeout; req->cmd_flags |= flags; req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT; @@ -1177,7 +1177,7 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) cmd->cmd_len = scsi_req(req)->cmd_len; cmd->cmnd = scsi_req(req)->cmd; cmd->transfersize = blk_rq_bytes(req); - cmd->allowed = req->retries; + cmd->allowed = scsi_req(req)->retries; return BLKPREP_OK; } 
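The SCSI hunks in this part of the diff (osd, osst, scsi_error, scsi_lib, and the sg/st/pscsi changes that follow) all apply the same mechanical conversion: the per-command retry count now lives in struct scsi_request rather than struct request, so passthrough submitters reach it through the scsi_req() accessor while the timeout stays on the request itself. A minimal sketch of the resulting access pattern, not taken from the patch and using a hypothetical function name, might look like this:

	/*
	 * Sketch only (hypothetical helper): set up a passthrough request
	 * after the retries field has moved into struct scsi_request.
	 */
	static void example_setup_pt_request(struct request *req)
	{
		struct scsi_request *rq = scsi_req(req);	/* driver PDU of @req */

		rq->retries  = 5;		/* was: req->retries = 5; */
		req->timeout = 10 * HZ;		/* timeout stays in struct request */
	}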
@@ -2154,7 +2154,7 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev) return q; } -static struct blk_mq_ops scsi_mq_ops = { +static const struct blk_mq_ops scsi_mq_ops = { .queue_rq = scsi_queue_rq, .complete = scsi_softirq_done, .timeout = scsi_timeout, diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 29b86505f796..b61cc3c512d3 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1716,7 +1716,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) srp->rq = rq; rq->end_io_data = srp; - rq->retries = SG_DEFAULT_RETRIES; + req->retries = SG_DEFAULT_RETRIES; if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE)) return 0; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index e5ef78a6848e..5408643431bb 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -579,7 +579,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, memset(rq->cmd, 0, BLK_MAX_CDB); memcpy(rq->cmd, cmd, rq->cmd_len); req->timeout = timeout; - req->retries = retries; + rq->retries = retries; req->end_io_data = SRpnt; blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end); diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c index bf40f03755dd..f30c27b83c5e 100644 --- a/drivers/target/iscsi/iscsi_target_configfs.c +++ b/drivers/target/iscsi/iscsi_target_configfs.c @@ -167,10 +167,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg( struct iscsi_portal_group *tpg; struct iscsi_tpg_np *tpg_np; char *str, *str2, *ip_str, *port_str; - struct sockaddr_storage sockaddr; - struct sockaddr_in *sock_in; - struct sockaddr_in6 *sock_in6; - unsigned long port; + struct sockaddr_storage sockaddr = { }; int ret; char buf[MAX_PORTAL_LEN + 1]; @@ -182,21 +179,19 @@ static struct se_tpg_np *lio_target_call_addnptotpg( memset(buf, 0, MAX_PORTAL_LEN + 1); snprintf(buf, MAX_PORTAL_LEN + 1, "%s", name); - memset(&sockaddr, 0, sizeof(struct sockaddr_storage)); - str = strstr(buf, "["); if (str) { - const char *end; - str2 = strstr(str, "]"); if (!str2) { pr_err("Unable to locate trailing \"]\"" " in IPv6 iSCSI network portal address\n"); return ERR_PTR(-EINVAL); } - str++; /* Skip over leading "[" */ + + ip_str = str + 1; /* Skip over leading "[" */ *str2 = '\0'; /* Terminate the unbracketed IPv6 address */ str2++; /* Skip over the \0 */ + port_str = strstr(str2, ":"); if (!port_str) { pr_err("Unable to locate \":port\"" @@ -205,23 +200,8 @@ static struct se_tpg_np *lio_target_call_addnptotpg( } *port_str = '\0'; /* Terminate string for IP */ port_str++; /* Skip over ":" */ - - ret = kstrtoul(port_str, 0, &port); - if (ret < 0) { - pr_err("kstrtoul() failed for port_str: %d\n", ret); - return ERR_PTR(ret); - } - sock_in6 = (struct sockaddr_in6 *)&sockaddr; - sock_in6->sin6_family = AF_INET6; - sock_in6->sin6_port = htons((unsigned short)port); - ret = in6_pton(str, -1, - (void *)&sock_in6->sin6_addr.in6_u, -1, &end); - if (ret <= 0) { - pr_err("in6_pton returned: %d\n", ret); - return ERR_PTR(-EINVAL); - } } else { - str = ip_str = &buf[0]; + ip_str = &buf[0]; port_str = strstr(ip_str, ":"); if (!port_str) { pr_err("Unable to locate \":port\"" @@ -230,17 +210,15 @@ static struct se_tpg_np *lio_target_call_addnptotpg( } *port_str = '\0'; /* Terminate string for IP */ port_str++; /* Skip over ":" */ + } - ret = kstrtoul(port_str, 0, &port); - if (ret < 0) { - pr_err("kstrtoul() failed for port_str: %d\n", ret); - return ERR_PTR(ret); - } - sock_in = (struct sockaddr_in *)&sockaddr; - sock_in->sin_family = AF_INET; - 
sock_in->sin_port = htons((unsigned short)port); - sock_in->sin_addr.s_addr = in_aton(ip_str); + ret = inet_pton_with_scope(&init_net, AF_UNSPEC, ip_str, + port_str, &sockaddr); + if (ret) { + pr_err("malformed ip/port passed: %s\n", name); + return ERR_PTR(ret); } + tpg = container_of(se_tpg, struct iscsi_portal_group, tpg_se_tpg); ret = iscsit_get_tpg(tpg); if (ret < 0) diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 94cda7991e80..c7fa372c527a 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1008,7 +1008,7 @@ pscsi_execute_cmd(struct se_cmd *cmd) req->timeout = PS_TIMEOUT_DISK; else req->timeout = PS_TIMEOUT_OTHER; - req->retries = PS_RETRY; + scsi_req(req)->retries = PS_RETRY; blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req, (cmd->sam_task_attr == TCM_HEAD_TAG), diff --git a/fs/block_dev.c b/fs/block_dev.c index 2eca00ec4370..f2d59f143ef4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -885,6 +885,8 @@ static void bdev_evict_inode(struct inode *inode) spin_lock(&bdev_lock); list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); + /* Detach inode from wb early as bdi_put() may free bdi->wb */ + inode_detach_wb(inode); if (bdev->bd_bdi != &noop_backing_dev_info) { bdi_put(bdev->bd_bdi); bdev->bd_bdi = &noop_backing_dev_info; @@ -1556,8 +1558,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); if (!partno) { ret = -ENXIO; @@ -1622,6 +1622,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); } + + if (bdev->bd_bdi == &noop_backing_dev_info) + bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { if (bdev->bd_contains == bdev) { ret = 0; @@ -1653,8 +1656,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; - bdi_put(bdev->bd_bdi); - bdev->bd_bdi = &noop_backing_dev_info; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1876,12 +1877,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) kill_bdev(bdev); bdev_write_inode(bdev); - /* - * Detaching bdev inode from its wb in __destroy_inode() - * is too late: the queue which embeds its bdi (along with - * root wb) can be gone as soon as we put_disk() below. - */ - inode_detach_wb(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index ad955817916d..e66d4722db8e 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -21,6 +21,7 @@ struct dentry; */ enum wb_state { WB_registered, /* bdi_register() was done */ + WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ }; @@ -54,7 +55,9 @@ struct bdi_writeback_congested { atomic_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK - struct backing_dev_info *bdi; /* the associated bdi */ + struct backing_dev_info *__bdi; /* the associated bdi, set to NULL + * on bdi unregistration. For memcg-wb + * internal use only! 
*/ int blkcg_id; /* ID of the associated blkcg */ struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ #endif @@ -161,7 +164,6 @@ struct backing_dev_info { #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ - atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ #else struct bdi_writeback_congested *wb_congested; #endif diff --git a/include/linux/bio.h b/include/linux/bio.h index 8e521194f6fc..4931756d86d9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -383,7 +383,7 @@ extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); extern mempool_t *biovec_create_pool(int pool_entries); -extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); +extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9382c5da7a2e..b90c3d5766cd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -82,7 +82,6 @@ struct blk_mq_tag_set { struct blk_mq_queue_data { struct request *rq; - struct list_head *list; bool last; }; @@ -153,7 +152,6 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, - BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, @@ -246,7 +244,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); -void blk_mq_freeze_queue_start(struct request_queue *q); +void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d703acb55d0f..72aa9519167e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -17,6 +17,10 @@ struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); +struct blk_issue_stat { + u64 stat; +}; + /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) @@ -29,7 +33,7 @@ struct bio { * top bits REQ_OP. Use * accessors. */ - unsigned short bi_flags; /* status, command, etc */ + unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; struct bvec_iter bi_iter; @@ -58,6 +62,10 @@ struct bio { */ struct io_context *bi_ioc; struct cgroup_subsys_state *bi_css; +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + void *bi_cg_private; + struct blk_issue_stat bi_issue_stat; +#endif #endif union { #if defined(CONFIG_BLK_DEV_INTEGRITY) @@ -102,12 +110,9 @@ struct bio { #define BIO_REFFED 8 /* bio has elevated ->bi_cnt */ #define BIO_THROTTLED 9 /* This bio has already been subjected to * throttling rules. Don't do it again. */ - -/* - * Flags starting here get preserved by bio_reset() - this includes - * BVEC_POOL_IDX() - */ -#define BIO_RESET_BITS 10 +#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion + * of this bio. 
*/ +/* See BVEC_POOL_OFFSET below before adding new flags */ /* * We support 6 different bvec pools, the last one is magic in that it @@ -117,13 +122,22 @@ struct bio { #define BVEC_POOL_MAX (BVEC_POOL_NR - 1) /* - * Top 4 bits of bio flags indicate the pool the bvecs came from. We add + * Top 3 bits of bio flags indicate the pool the bvecs came from. We add * 1 to the actual index so that 0 indicates that there are no bvecs to be * freed. */ -#define BVEC_POOL_BITS (4) +#define BVEC_POOL_BITS (3) #define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) #define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) +#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1) +# error "BVEC_POOL_BITS is too small" +#endif + +/* + * Flags starting here get preserved by bio_reset() - this includes + * only BVEC_POOL_IDX() + */ +#define BIO_RESET_BITS BVEC_POOL_OFFSET /* * Operations and flags common to the bio and request structures. @@ -283,12 +297,6 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) return (cookie & BLK_QC_T_INTERNAL) != 0; } -struct blk_issue_stat { - u64 time; -}; - -#define BLK_RQ_STAT_BATCH 64 - struct blk_rq_stat { s64 mean; u64 min; @@ -296,7 +304,6 @@ struct blk_rq_stat { s32 nr_samples; s32 nr_batch; u64 batch; - s64 time; }; #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7548f332121a..d76bebbc632e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -40,6 +40,8 @@ struct blkcg_gq; struct blk_flush_queue; struct pr_ops; struct rq_wb; +struct blk_queue_stats; +struct blk_stat_callback; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -213,6 +215,8 @@ struct request { unsigned short ioprio; + unsigned int timeout; + void *special; /* opaque pointer available for LLD use */ int errors; @@ -221,8 +225,6 @@ struct request { unsigned long deadline; struct list_head timeout_list; - unsigned int timeout; - int retries; /* * completion callback. @@ -388,6 +390,7 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct blk_queue_stats *stats; struct rq_wb *rq_wb; /* @@ -505,8 +508,6 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; - struct blk_rq_stat rq_stats[2]; - /* * Number of active block driver functions for which blk_drain_queue() * must wait. 
Must be incremented around functions that unlock the @@ -516,6 +517,10 @@ struct request_queue { unsigned int rq_timeout; int poll_nsec; + + struct blk_stat_callback *poll_cb; + struct blk_rq_stat poll_stat[2]; + struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; @@ -610,6 +615,8 @@ struct request_queue { #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ +#define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */ +#define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 76f39754e7b0..9e11082c7f9b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -159,11 +159,11 @@ struct badblocks; #if defined(CONFIG_BLK_DEV_INTEGRITY) struct blk_integrity { - struct blk_integrity_profile *profile; - unsigned char flags; - unsigned char tuple_size; - unsigned char interval_exp; - unsigned char tag_size; + const struct blk_integrity_profile *profile; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ diff --git a/include/linux/inet.h b/include/linux/inet.h index 4cca05c9678e..636ebe87e6f8 100644 --- a/include/linux/inet.h +++ b/include/linux/inet.h @@ -43,6 +43,8 @@ #define _LINUX_INET_H #include <linux/types.h> +#include <net/net_namespace.h> +#include <linux/socket.h> /* * These mimic similar macros defined in user-space for inet_ntop(3). @@ -54,4 +56,8 @@ extern __be32 in_aton(const char *str); extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); + +extern int inet_pton_with_scope(struct net *net, unsigned short af, + const char *src, const char *port, struct sockaddr_storage *addr); + #endif /* _LINUX_INET_H */ diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e6284591599e..ca85cb80e99a 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -108,6 +108,8 @@ extern int __must_check kobject_rename(struct kobject *, const char *new_name); extern int __must_check kobject_move(struct kobject *, struct kobject *); extern struct kobject *kobject_get(struct kobject *kobj); +extern struct kobject * __must_check kobject_get_unless_zero( + struct kobject *kobj); extern void kobject_put(struct kobject *kobj); extern const void *kobject_namespace(struct kobject *kobj); diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index f21471f7ee40..16eb264980c2 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -137,9 +137,9 @@ enum nvmefc_fcp_datadir { * transferred. Should equal payload_length on success. * @rcv_rsplen: length, in bytes, of the FCP RSP IU received. * @status: Completion status of the FCP operation. must be 0 upon success, - * NVME_SC_FC_xxx value upon failure. Note: this is NOT a - * reflection of the NVME CQE completion status. Only the status - * of the FCP operation at the NVME-FC level. + * negative errno value upon failure (ex: -EIO). Note: this is + * NOT a reflection of the NVME CQE completion status. Only the + * status of the FCP operation at the NVME-FC level. 
*/ struct nvmefc_fcp_req { void *cmdaddr; diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index 4b45226bd604..e997c4a49a88 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -16,8 +16,7 @@ */ /* - * This file contains definitions relative to FC-NVME r1.11 and a few - * newer items + * This file contains definitions relative to FC-NVME r1.14 (16-020vB). */ #ifndef _NVME_FC_H @@ -47,8 +46,15 @@ struct nvme_fc_cmd_iu { #define NVME_FC_SIZEOF_ZEROS_RSP 12 +enum { + FCNVME_SC_SUCCESS = 0, + FCNVME_SC_INVALID_FIELD = 1, + FCNVME_SC_INVALID_CONNID = 2, +}; + struct nvme_fc_ersp_iu { - __u8 rsvd0[2]; + __u8 status_code; + __u8 rsvd1; __be16 iu_len; __be32 rsn; __be32 xfrd_len; @@ -58,7 +64,7 @@ struct nvme_fc_ersp_iu { }; -/* FC-NVME r1.03/16-119v0 NVME Link Services */ +/* FC-NVME Link Services */ enum { FCNVME_LS_RSVD = 0, FCNVME_LS_RJT = 1, @@ -68,7 +74,7 @@ enum { FCNVME_LS_DISCONNECT = 5, }; -/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */ +/* FC-NVME Link Service Descriptors */ enum { FCNVME_LSDESC_RSVD = 0x0, FCNVME_LSDESC_RQST = 0x1, @@ -92,7 +98,6 @@ static inline __be32 fcnvme_lsdesc_len(size_t sz) return cpu_to_be32(sz - (2 * sizeof(u32))); } - struct fcnvme_ls_rqst_w0 { u8 ls_cmd; /* FCNVME_LS_xxx */ u8 zeros[3]; @@ -106,8 +111,53 @@ struct fcnvme_lsdesc_rqst { __be32 rsvd12; }; +/* FC-NVME LS RJT reason_code values */ +enum fcnvme_ls_rjt_reason { + FCNVME_RJT_RC_NONE = 0, + /* no reason - not to be sent */ + + FCNVME_RJT_RC_INVAL = 0x01, + /* invalid NVMe_LS command code */ + + FCNVME_RJT_RC_LOGIC = 0x03, + /* logical error */ + + FCNVME_RJT_RC_UNAB = 0x09, + /* unable to perform command request */ + + FCNVME_RJT_RC_UNSUP = 0x0b, + /* command not supported */ + + FCNVME_RJT_RC_INPROG = 0x0e, + /* command already in progress */ + FCNVME_RJT_RC_INV_ASSOC = 0x40, + /* Invalid Association ID*/ + FCNVME_RJT_RC_INV_CONN = 0x41, + /* Invalid Connection ID*/ + + FCNVME_RJT_RC_VENDOR = 0xff, + /* vendor specific error */ +}; + +/* FC-NVME LS RJT reason_explanation values */ +enum fcnvme_ls_rjt_explan { + FCNVME_RJT_EXP_NONE = 0x00, + /* No additional explanation */ + + FCNVME_RJT_EXP_OXID_RXID = 0x17, + /* invalid OX_ID-RX_ID combination */ + + FCNVME_RJT_EXP_INSUF_RES = 0x29, + /* insufficient resources */ + + FCNVME_RJT_EXP_UNAB_DATA = 0x2a, + /* unable to supply requested data */ + + FCNVME_RJT_EXP_INV_LEN = 0x2d, + /* Invalid payload length */ +}; /* FCNVME_LSDESC_RJT */ struct fcnvme_lsdesc_rjt { @@ -119,15 +169,15 @@ struct fcnvme_lsdesc_rjt { * Reject reason and explanaction codes are generic * to ELs's from LS-3. 
*/ - u8 reason_code; - u8 reason_explanation; + u8 reason_code; /* fcnvme_ls_rjt_reason */ + u8 reason_explanation; /* fcnvme_ls_rjt_explan */ u8 vendor; __be32 rsvd12; }; -#define FCNVME_ASSOC_HOSTID_LEN 64 +#define FCNVME_ASSOC_HOSTID_LEN 16 #define FCNVME_ASSOC_HOSTNQN_LEN 256 #define FCNVME_ASSOC_SUBNQN_LEN 256 diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 9fba9dd33544..9375d23a24e7 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -34,9 +34,9 @@ struct t10_pi_tuple { }; -extern struct blk_integrity_profile t10_pi_type1_crc; -extern struct blk_integrity_profile t10_pi_type1_ip; -extern struct blk_integrity_profile t10_pi_type3_crc; -extern struct blk_integrity_profile t10_pi_type3_ip; +extern const struct blk_integrity_profile t10_pi_type1_crc; +extern const struct blk_integrity_profile t10_pi_type1_ip; +extern const struct blk_integrity_profile t10_pi_type3_crc; +extern const struct blk_integrity_profile t10_pi_type3_ip; #endif diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a3c0cbd7c888..d5815794416c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -237,6 +237,7 @@ static inline void inode_attach_wb(struct inode *inode, struct page *page) static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { + WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h index ba0aeb980f7e..7c583a0f363a 100644 --- a/include/scsi/scsi_request.h +++ b/include/scsi/scsi_request.h @@ -11,6 +11,7 @@ struct scsi_request { unsigned short cmd_len; unsigned int sense_len; unsigned int resid_len; /* residual count */ + int retries; void *sense; }; diff --git a/lib/kobject.c b/lib/kobject.c index 445dcaeb0f56..763d70a18941 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -601,12 +601,15 @@ struct kobject *kobject_get(struct kobject *kobj) } EXPORT_SYMBOL(kobject_get); -static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) +struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) { + if (!kobj) + return NULL; if (!kref_get_unless_zero(&kobj->kref)) kobj = NULL; return kobj; } +EXPORT_SYMBOL(kobject_get_unless_zero); /* * kobject_cleanup - free kobject resources. 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c6f2a37028c2..3ea3bbd921d6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -294,6 +294,8 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, memset(wb, 0, sizeof(*wb)); + if (wb != &bdi->wb) + bdi_get(bdi); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); @@ -314,8 +316,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, wb->dirty_sleep = jiffies; wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); - if (!wb->congested) - return -ENOMEM; + if (!wb->congested) { + err = -ENOMEM; + goto out_put_bdi; + } err = fprop_local_init_percpu(&wb->completions, gfp); if (err) @@ -335,9 +339,14 @@ out_destroy_stat: fprop_local_destroy_percpu(&wb->completions); out_put_cong: wb_congested_put(wb->congested); +out_put_bdi: + if (wb != &bdi->wb) + bdi_put(bdi); return err; } +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); + /* * Remove bdi from the global list and shutdown any threads we have running */ @@ -347,10 +356,18 @@ static void wb_shutdown(struct bdi_writeback *wb) spin_lock_bh(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_bh(&wb->work_lock); + /* + * Wait for wb shutdown to finish if someone else is just + * running wb_shutdown(). Otherwise we could proceed to wb / + * bdi destruction before wb_shutdown() is finished. + */ + wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE); return; } + set_bit(WB_shutting_down, &wb->state); spin_unlock_bh(&wb->work_lock); + cgwb_remove_from_bdi_list(wb); /* * Drain work list and shutdown the delayed_work. !WB_registered * tells wb_workfn() that @wb is dying and its work_list needs to @@ -359,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb) mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); + /* + * Make sure bit gets cleared after shutdown is finished. Matches with + * the barrier provided by test_and_clear_bit() above. + */ + smp_wmb(); + clear_bit(WB_shutting_down, &wb->state); } static void wb_exit(struct bdi_writeback *wb) @@ -372,6 +395,8 @@ static void wb_exit(struct bdi_writeback *wb) fprop_local_destroy_percpu(&wb->completions); wb_congested_put(wb->congested); + if (wb != &wb->bdi->wb) + bdi_put(wb->bdi); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -381,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb) /* * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU - * protected. cgwb_release_wait is used to wait for the completion of cgwb - * releases from bdi destruction path. + * protected. 
*/ static DEFINE_SPINLOCK(cgwb_lock); -static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait); /** * wb_congested_get_create - get or create a wb_congested @@ -438,7 +461,7 @@ retry: return NULL; atomic_set(&new_congested->refcnt, 0); - new_congested->bdi = bdi; + new_congested->__bdi = bdi; new_congested->blkcg_id = blkcg_id; goto retry; @@ -466,10 +489,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested) } /* bdi might already have been destroyed leaving @congested unlinked */ - if (congested->bdi) { + if (congested->__bdi) { rb_erase(&congested->rb_node, - &congested->bdi->cgwb_congested_tree); - congested->bdi = NULL; + &congested->__bdi->cgwb_congested_tree); + congested->__bdi = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); @@ -480,11 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); - struct backing_dev_info *bdi = wb->bdi; - - spin_lock_irq(&cgwb_lock); - list_del_rcu(&wb->bdi_node); - spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); @@ -495,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work) percpu_ref_exit(&wb->refcnt); wb_exit(wb); kfree_rcu(wb, rcu); - - if (atomic_dec_and_test(&bdi->usage_cnt)) - wake_up_all(&cgwb_release_wait); } static void cgwb_release(struct percpu_ref *refcnt) @@ -517,6 +532,13 @@ static void cgwb_kill(struct bdi_writeback *wb) percpu_ref_kill(&wb->refcnt); } +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) +{ + spin_lock_irq(&cgwb_lock); + list_del_rcu(&wb->bdi_node); + spin_unlock_irq(&cgwb_lock); +} + static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { @@ -580,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi, /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { - atomic_inc(&bdi->usage_cnt); list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); @@ -670,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; - atomic_set(&bdi->usage_cnt, 1); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { @@ -680,29 +700,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) return ret; } -static void cgwb_bdi_destroy(struct backing_dev_info *bdi) +static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; + struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); - spin_unlock_irq(&cgwb_lock); - /* - * All cgwb's must be shutdown and released before returning. Drain - * the usage counter to wait for all cgwb's ever created on @bdi. - */ - atomic_dec(&bdi->usage_cnt); - wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); - /* - * Grab back our reference so that we hold it when @bdi gets - * re-registered. 
- */ - atomic_inc(&bdi->usage_cnt); + while (!list_empty(&bdi->wb_list)) { + wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, + bdi_node); + spin_unlock_irq(&cgwb_lock); + wb_shutdown(wb); + spin_lock_irq(&cgwb_lock); + } + spin_unlock_irq(&cgwb_lock); } /** @@ -752,11 +769,18 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi) rb_entry(rbn, struct bdi_writeback_congested, rb_node); rb_erase(rbn, &bdi->cgwb_congested_tree); - congested->bdi = NULL; /* mark @congested unlinked */ + congested->__bdi = NULL; /* mark @congested unlinked */ } spin_unlock_irq(&cgwb_lock); } +static void cgwb_bdi_register(struct backing_dev_info *bdi) +{ + spin_lock_irq(&cgwb_lock); + list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); + spin_unlock_irq(&cgwb_lock); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) @@ -777,13 +801,23 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) return 0; } -static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } +static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } static void cgwb_bdi_exit(struct backing_dev_info *bdi) { wb_congested_put(bdi->wb_congested); } +static void cgwb_bdi_register(struct backing_dev_info *bdi) +{ + list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); +} + +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) +{ + list_del_rcu(&wb->bdi_node); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) @@ -802,8 +836,6 @@ int bdi_init(struct backing_dev_info *bdi) ret = cgwb_bdi_init(bdi); - list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); - return ret; } EXPORT_SYMBOL(bdi_init); @@ -839,6 +871,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, if (IS_ERR(dev)) return PTR_ERR(dev); + cgwb_bdi_register(bdi); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); @@ -892,7 +925,7 @@ void bdi_unregister(struct backing_dev_info *bdi) /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); - cgwb_bdi_destroy(bdi); + cgwb_bdi_unregister(bdi); if (bdi->dev) { bdi_debug_unregister(bdi); diff --git a/net/core/utils.c b/net/core/utils.c index 6592d7bbed39..f96cf527bb8f 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -26,9 +26,11 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/ratelimit.h> +#include <linux/socket.h> #include <net/sock.h> #include <net/net_ratelimit.h> +#include <net/ipv6.h> #include <asm/byteorder.h> #include <linux/uaccess.h> @@ -300,6 +302,107 @@ out: } EXPORT_SYMBOL(in6_pton); +static int inet4_pton(const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + int srclen = strlen(src); + + if (srclen > INET_ADDRSTRLEN) + return -EINVAL; + + if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr, + '\n', NULL) == 0) + return -EINVAL; + + addr4->sin_family = AF_INET; + addr4->sin_port = htons(port_num); + + return 0; +} + +static int inet6_pton(struct net *net, const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + const char *scope_delim; + int srclen = strlen(src); + + if (srclen > INET6_ADDRSTRLEN) + return -EINVAL; + + if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr, + '%', &scope_delim) == 0) + return -EINVAL; + + if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL && + src + srclen != scope_delim && *scope_delim == '%') { + struct net_device *dev; + 
char scope_id[16]; + size_t scope_len = min_t(size_t, sizeof(scope_id), + src + srclen - scope_delim - 1); + + memcpy(scope_id, scope_delim + 1, scope_len); + scope_id[scope_len] = '\0'; + + dev = dev_get_by_name(net, scope_id); + if (dev) { + addr6->sin6_scope_id = dev->ifindex; + dev_put(dev); + } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) { + return -EINVAL; + } + } + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(port_num); + + return 0; +} + +/** + * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address + * @net: net namespace (used for scope handling) + * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either + * @src: the start of the address string + * @port: the start of the port string (or NULL for none) + * @addr: output socket address + * + * Return zero on success, return errno when any error occurs. + */ +int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, + const char *src, const char *port, struct sockaddr_storage *addr) +{ + u16 port_num; + int ret = -EINVAL; + + if (port) { + if (kstrtou16(port, 0, &port_num)) + return -EINVAL; + } else { + port_num = 0; + } + + switch (af) { + case AF_INET: + ret = inet4_pton(src, port_num, addr); + break; + case AF_INET6: + ret = inet6_pton(net, src, port_num, addr); + break; + case AF_UNSPEC: + ret = inet4_pton(src, port_num, addr); + if (ret) + ret = inet6_pton(net, src, port_num, addr); + break; + default: + pr_err("unexpected address family %d\n", af); + }; + + return ret; +} +EXPORT_SYMBOL(inet_pton_with_scope); + void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr) { |
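The net/core/utils.c addition above is the helper that the nvmet-rdma and iSCSI target hunks earlier in this diff switch to: it parses an address string plus an optional port string into a struct sockaddr_storage, covering IPv4, IPv6, and IPv6 link-local addresses with a %scope suffix in one place. A minimal caller sketch, where the address, port, and error message are illustrative values rather than anything from the patch:

	struct sockaddr_storage ss = { };
	int ret;

	/* Example values; AF_UNSPEC tries IPv4 first, then falls back to IPv6. */
	ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
				   "fe80::1%eth0", "4420", &ss);
	if (ret)
		pr_err("could not parse address/port\n");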