-rw-r--r--  Documentation/block/queue-sysfs.txt | 6
-rw-r--r--  block/Kconfig | 12
-rw-r--r--  block/bio.c | 19
-rw-r--r--  block/blk-cgroup.c | 123
-rw-r--r--  block/blk-core.c | 39
-rw-r--r--  block/blk-flush.c | 3
-rw-r--r--  block/blk-integrity.c | 2
-rw-r--r--  block/blk-lib.c | 2
-rw-r--r--  block/blk-mq-debugfs.c | 99
-rw-r--r--  block/blk-mq-pci.c | 2
-rw-r--r--  block/blk-mq.c | 370
-rw-r--r--  block/blk-mq.h | 1
-rw-r--r--  block/blk-stat.c | 327
-rw-r--r--  block/blk-stat.h | 214
-rw-r--r--  block/blk-sysfs.c | 55
-rw-r--r--  block/blk-throttle.c | 977
-rw-r--r--  block/blk-wbt.c | 61
-rw-r--r--  block/blk-wbt.h | 12
-rw-r--r--  block/blk.h | 12
-rw-r--r--  block/cfq-iosched.c | 17
-rw-r--r--  block/genhd.c | 2
-rw-r--r--  block/scsi_ioctl.c | 8
-rw-r--r--  block/sed-opal.c | 2
-rw-r--r--  block/t10-pi.c | 8
-rw-r--r--  drivers/block/Kconfig | 12
-rw-r--r--  drivers/block/Makefile | 1
-rw-r--r--  drivers/block/floppy.c | 4
-rw-r--r--  drivers/block/hd.c | 803
-rw-r--r--  drivers/block/loop.c | 2
-rw-r--r--  drivers/block/mg_disk.c | 4
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c | 4
-rw-r--r--  drivers/block/nbd.c | 2
-rw-r--r--  drivers/block/null_blk.c | 11
-rw-r--r--  drivers/block/paride/pcd.c | 57
-rw-r--r--  drivers/block/paride/pd.c | 50
-rw-r--r--  drivers/block/paride/pf.c | 57
-rw-r--r--  drivers/block/rbd.c | 2
-rw-r--r--  drivers/block/swim.c | 55
-rw-r--r--  drivers/block/virtio_blk.c | 2
-rw-r--r--  drivers/block/xen-blkfront.c | 2
-rw-r--r--  drivers/md/dm-rq.c | 2
-rw-r--r--  drivers/md/dm.c | 1
-rw-r--r--  drivers/md/raid5.c | 2
-rw-r--r--  drivers/mtd/ubi/block.c | 2
-rw-r--r--  drivers/nvme/host/core.c | 53
-rw-r--r--  drivers/nvme/host/fabrics.c | 28
-rw-r--r--  drivers/nvme/host/fabrics.h | 10
-rw-r--r--  drivers/nvme/host/fc.c | 57
-rw-r--r--  drivers/nvme/host/nvme.h | 12
-rw-r--r--  drivers/nvme/host/pci.c | 40
-rw-r--r--  drivers/nvme/host/rdma.c | 149
-rw-r--r--  drivers/nvme/target/admin-cmd.c | 31
-rw-r--r--  drivers/nvme/target/core.c | 21
-rw-r--r--  drivers/nvme/target/discovery.c | 19
-rw-r--r--  drivers/nvme/target/fabrics-cmd.c | 4
-rw-r--r--  drivers/nvme/target/fc.c | 21
-rw-r--r--  drivers/nvme/target/io-cmd.c | 22
-rw-r--r--  drivers/nvme/target/loop.c | 89
-rw-r--r--  drivers/nvme/target/nvmet.h | 11
-rw-r--r--  drivers/nvme/target/rdma.c | 47
-rw-r--r--  drivers/sbus/char/jsflash.c | 50
-rw-r--r--  drivers/scsi/osd/osd_initiator.c | 2
-rw-r--r--  drivers/scsi/osst.c | 2
-rw-r--r--  drivers/scsi/scsi_error.c | 2
-rw-r--r--  drivers/scsi/scsi_lib.c | 6
-rw-r--r--  drivers/scsi/sg.c | 2
-rw-r--r--  drivers/scsi/st.c | 2
-rw-r--r--  drivers/target/iscsi/iscsi_target_configfs.c | 46
-rw-r--r--  drivers/target/target_core_pscsi.c | 2
-rw-r--r--  fs/block_dev.c | 15
-rw-r--r--  include/linux/backing-dev-defs.h | 6
-rw-r--r--  include/linux/bio.h | 2
-rw-r--r--  include/linux/blk-mq.h | 4
-rw-r--r--  include/linux/blk_types.h | 39
-rw-r--r--  include/linux/blkdev.h | 15
-rw-r--r--  include/linux/genhd.h | 10
-rw-r--r--  include/linux/inet.h | 6
-rw-r--r--  include/linux/kobject.h | 2
-rw-r--r--  include/linux/nvme-fc-driver.h | 6
-rw-r--r--  include/linux/nvme-fc.h | 68
-rw-r--r--  include/linux/t10-pi.h | 8
-rw-r--r--  include/linux/writeback.h | 1
-rw-r--r--  include/scsi/scsi_request.h | 1
-rw-r--r--  lib/kobject.c | 5
-rw-r--r--  mm/backing-dev.c | 107
-rw-r--r--  net/core/utils.c | 103
86 files changed, 2566 insertions(+), 2008 deletions(-)
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index c0a3bb5a6e4e..b7f6bdc96d73 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -192,5 +192,11 @@ scaling back writes. Writing a value of '0' to this file disables the
feature. Writing a value of '-1' to this file resets the value to the
default setting.
+throttle_sample_time (RW)
+-------------------------
+This is the time window over which blk-throttle samples data, in milliseconds.
+blk-throttle makes decisions based on these samples. A lower value gives cgroups
+smoother throughput, but at a higher CPU overhead. This file exists only when
+CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
Jens Axboe <[email protected]>, February 2009
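
As a rough illustration of how this new knob would be exercised from user space (this helper is not part of the patch; the "sda" device name is a placeholder, and the attribute is only present on kernels built with CONFIG_BLK_DEV_THROTTLING_LOW):

/* Hypothetical userspace helper: read, and optionally set, the sampling window. */
#include <stdio.h>

int main(int argc, char **argv)
{
        const char *path = "/sys/block/sda/queue/throttle_sample_time";
        char buf[64];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("current sample time: %s", buf);  /* value is in milliseconds */
        fclose(f);

        if (argc > 1) {                 /* e.g. "./tune 50" to request a 50 ms window */
                f = fopen(path, "w");
                if (!f) {
                        perror(path);
                        return 1;
                }
                fprintf(f, "%s\n", argv[1]);
                fclose(f);
        }
        return 0;
}
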
diff --git a/block/Kconfig b/block/Kconfig
index e9f780f815f5..89cd28f8d051 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -115,6 +115,18 @@ config BLK_DEV_THROTTLING
See Documentation/cgroups/blkio-controller.txt for more information.
+config BLK_DEV_THROTTLING_LOW
+ bool "Block throttling .low limit interface support (EXPERIMENTAL)"
+ depends on BLK_DEV_THROTTLING
+ default n
+ ---help---
+ Add .low limit interface for block throttling. The low limit is a best
+ effort limit to prioritize cgroups. Depending on the setting, the limit
+ can be used to protect cgroups in terms of bandwidth/iops and better
+ utilize disk resources.
+
+ Note, this is an experimental interface and could be changed someday.
+
config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
default n
diff --git a/block/bio.c b/block/bio.c
index e75878f8b14a..f4d207180266 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -30,6 +30,7 @@
#include <linux/cgroup.h>
#include <trace/events/block.h>
+#include "blk.h"
/*
* Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -427,7 +428,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
* RETURNS:
* Pointer to new bio on success, NULL on failure.
*/
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
+struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
+ struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
unsigned front_pad;
@@ -1824,6 +1826,11 @@ static inline bool bio_remaining_done(struct bio *bio)
* bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
* way to end I/O on a bio. No one should call bi_end_io() directly on a
* bio unless they own it and thus know that it has an end_io function.
+ *
+ * bio_endio() can be called several times on a bio that has been chained
+ * using bio_chain(). The ->bi_end_io() function will only be called the
+ * last time. At this point the BLK_TA_COMPLETE tracing event will be
+ * generated if BIO_TRACE_COMPLETION is set.
**/
void bio_endio(struct bio *bio)
{
@@ -1844,6 +1851,13 @@ again:
goto again;
}
+ if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+ trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
+ bio, bio->bi_error);
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+ }
+
+ blk_throtl_bio_endio(bio);
if (bio->bi_end_io)
bio->bi_end_io(bio);
}
@@ -1882,6 +1896,9 @@ struct bio *bio_split(struct bio *bio, int sectors,
bio_advance(bio, split->bi_iter.bi_size);
+ if (bio_flagged(bio, BIO_TRACE_COMPLETION))
+ bio_set_flag(split, BIO_TRACE_COMPLETION);
+
return split;
}
EXPORT_SYMBOL(bio_split);
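
The completion-tracing comment added to bio_endio() above relies on the bio_chain() completion rules. The following sketch is not taken from the patch (the function name and the setup details are illustrative); it only shows the pattern the comment describes: the child bio carries no ->bi_end_io of its own, the parent's ->bi_end_io runs exactly once after both bios have completed, and with this change a single BLK_TA_COMPLETE event is emitted for the parent at that point.

#include <linux/bio.h>

/* Illustrative only: split an I/O across two bios with bio_chain(). */
static void issue_in_two_parts(struct bio *parent)
{
        struct bio *child = bio_alloc(GFP_NOIO, 1);

        /* set up child->bi_bdev, sector and pages for the first half (omitted) */

        bio_chain(child, parent);       /* parent now also waits for child */
        submit_bio(child);

        /* parent describes the remainder and carries the real ->bi_end_io */
        submit_bio(parent);
}
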
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bbe7ee00bd3d..7c2947128f58 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
}
EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
+/* Performs queue bypass and policy enabled checks then looks up blkg. */
+static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
+ const struct blkcg_policy *pol,
+ struct request_queue *q)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ lockdep_assert_held(q->queue_lock);
+
+ if (!blkcg_policy_enabled(q, pol))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /*
+ * This could be the first entry point of blkcg implementation and
+ * we shouldn't allow anything to go through for a bypassing queue.
+ */
+ if (unlikely(blk_queue_bypass(q)))
+ return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
+
+ return __blkg_lookup(blkcg, q, true /* update_hint */);
+}
+
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
* @blkcg: target block cgroup
@@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
__acquires(rcu) __acquires(disk->queue->queue_lock)
{
struct gendisk *disk;
+ struct request_queue *q;
struct blkcg_gq *blkg;
struct module *owner;
unsigned int major, minor;
@@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (!disk)
return -ENODEV;
if (part) {
- owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
- return -ENODEV;
+ ret = -ENODEV;
+ goto fail;
}
- rcu_read_lock();
- spin_lock_irq(disk->queue->queue_lock);
+ q = disk->queue;
- if (blkcg_policy_enabled(disk->queue, pol))
- blkg = blkg_lookup_create(blkcg, disk->queue);
- else
- blkg = ERR_PTR(-EOPNOTSUPP);
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+ blkg = blkg_lookup_check(blkcg, pol, q);
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
+ goto fail_unlock;
+ }
+
+ if (blkg)
+ goto success;
+
+ /*
+ * Create blkgs walking down from blkcg_root to @blkcg, so that all
+ * non-root blkgs have access to their parents.
+ */
+ while (true) {
+ struct blkcg *pos = blkcg;
+ struct blkcg *parent;
+ struct blkcg_gq *new_blkg;
+
+ parent = blkcg_parent(blkcg);
+ while (parent && !__blkg_lookup(parent, q, false)) {
+ pos = parent;
+ parent = blkcg_parent(parent);
+ }
+
+ /* Drop locks to do new blkg allocation with GFP_KERNEL. */
+ spin_unlock_irq(q->queue_lock);
rcu_read_unlock();
- spin_unlock_irq(disk->queue->queue_lock);
- owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
- /*
- * If queue was bypassing, we should retry. Do so after a
- * short msleep(). It isn't strictly necessary but queue
- * can be bypassing for some time and it's always nice to
- * avoid busy looping.
- */
- if (ret == -EBUSY) {
- msleep(10);
- ret = restart_syscall();
+
+ new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
+ if (unlikely(!new_blkg)) {
+ ret = -ENOMEM;
+ goto fail;
}
- return ret;
- }
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+
+ blkg = blkg_lookup_check(pos, pol, q);
+ if (IS_ERR(blkg)) {
+ ret = PTR_ERR(blkg);
+ goto fail_unlock;
+ }
+
+ if (blkg) {
+ blkg_free(new_blkg);
+ } else {
+ blkg = blkg_create(pos, q, new_blkg);
+ if (unlikely(IS_ERR(blkg))) {
+ ret = PTR_ERR(blkg);
+ goto fail_unlock;
+ }
+ }
+
+ if (pos == blkcg)
+ goto success;
+ }
+success:
ctx->disk = disk;
ctx->blkg = blkg;
ctx->body = body;
return 0;
+
+fail_unlock:
+ spin_unlock_irq(q->queue_lock);
+ rcu_read_unlock();
+fail:
+ owner = disk->fops->owner;
+ put_disk(disk);
+ module_put(owner);
+ /*
+ * If queue was bypassing, we should retry. Do so after a
+ * short msleep(). It isn't strictly necessary but queue
+ * can be bypassing for some time and it's always nice to
+ * avoid busy looping.
+ */
+ if (ret == -EBUSY) {
+ msleep(10);
+ ret = restart_syscall();
+ }
+ return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);
diff --git a/block/blk-core.c b/block/blk-core.c
index d772c221cc17..8654aa0cef6d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -500,6 +500,13 @@ void blk_set_queue_dying(struct request_queue *q)
queue_flag_set(QUEUE_FLAG_DYING, q);
spin_unlock_irq(q->queue_lock);
+ /*
+ * When the queue DYING flag is set, we need to block new requests
+ * from entering the queue, so we call blk_freeze_queue_start() to
+ * prevent I/O from crossing blk_queue_enter().
+ */
+ blk_freeze_queue_start(q);
+
if (q->mq_ops)
blk_mq_wake_waiters(q);
else {
@@ -669,6 +676,15 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
if (nowait)
return -EBUSY;
+ /*
+ * This is the read pair of the barrier in blk_freeze_queue_start();
+ * we need to order reading the __PERCPU_REF_DEAD flag of
+ * .q_usage_counter against reading .mq_freeze_depth or the
+ * queue dying flag, otherwise the following wait may
+ * never return if the two reads are reordered.
+ */
+ smp_rmb();
+
ret = wait_event_interruptible(q->mq_freeze_wq,
!atomic_read(&q->mq_freeze_depth) ||
blk_queue_dying(q));
@@ -720,6 +736,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q->backing_dev_info)
goto fail_split;
+ q->stats = blk_alloc_queue_stats();
+ if (!q->stats)
+ goto fail_stats;
+
q->backing_dev_info->ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
@@ -776,6 +796,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
fail_ref:
percpu_ref_exit(&q->q_usage_counter);
fail_bdi:
+ blk_free_queue_stats(q->stats);
+fail_stats:
bdi_put(q->backing_dev_info);
fail_split:
bioset_free(q->bio_split);
@@ -889,7 +911,6 @@ out_exit_flush_rq:
q->exit_rq_fn(q, q->fq->flush_rq);
out_free_flush_queue:
blk_free_flush_queue(q->fq);
- wbt_exit(q);
return -ENOMEM;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1128,7 +1149,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
blk_rq_init(q, rq);
blk_rq_set_rl(rq, rl);
- blk_rq_set_prio(rq, ioc);
rq->cmd_flags = op;
rq->rq_flags = rq_flags;
@@ -1615,6 +1635,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
req->errors = 0;
req->__sector = bio->bi_iter.bi_sector;
+ blk_rq_set_prio(req, rq_ioc(bio));
if (ioprio_valid(bio_prio(bio)))
req->ioprio = bio_prio(bio);
blk_rq_bio_prep(req->q, req, bio);
@@ -1936,7 +1957,13 @@ generic_make_request_checks(struct bio *bio)
if (!blkcg_bio_issue_check(q, bio))
return false;
- trace_block_bio_queue(q, bio);
+ if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+ trace_block_bio_queue(q, bio);
+ /* Now that enqueuing has been traced, we need to trace
+ * completion as well.
+ */
+ bio_set_flag(bio, BIO_TRACE_COMPLETION);
+ }
return true;
not_supported:
@@ -2478,7 +2505,7 @@ void blk_start_request(struct request *req)
blk_dequeue_request(req);
if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
- blk_stat_set_issue_time(&req->issue_stat);
+ blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req));
req->rq_flags |= RQF_STATS;
wbt_issue(req->q->rq_wb, &req->issue_stat);
}
@@ -2601,6 +2628,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
if (bio_bytes == bio->bi_iter.bi_size)
req->bio = bio->bi_next;
+ /* Completion has already been traced */
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
req_bio_endio(req, bio, bio_bytes, error);
total_bytes += bio_bytes;
@@ -2699,7 +2728,7 @@ void blk_finish_request(struct request *req, int error)
struct request_queue *q = req->q;
if (req->rq_flags & RQF_STATS)
- blk_stat_add(&q->rq_stats[rq_data_dir(req)], req);
+ blk_stat_add(req);
if (req->rq_flags & RQF_QUEUED)
blk_queue_end_tag(q, req);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 0d5a9c1da1fc..4e951d3bf548 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -497,8 +497,7 @@ void blk_insert_flush(struct request *rq)
* Description:
* Issue a flush for the block device in question. Caller can supply
* room for storing the error offset in case of a flush error, if they
- * wish to. If WAIT flag is not passed then caller may check only what
- * request was pushed in some internal queue for later handling.
+ * wish to.
*/
int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
sector_t *error_sector)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 9f0ff5ba4f84..b3622cb00fc2 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter *iter)
return 0;
}
-static struct blk_integrity_profile nop_profile = {
+static const struct blk_integrity_profile nop_profile = {
.name = "nop",
.generate_fn = blk_integrity_nop_fn,
.verify_fn = blk_integrity_nop_fn,
diff --git a/block/blk-lib.c b/block/blk-lib.c
index ed1e78e24db0..e5b853f2b8a2 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -109,7 +109,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard);
* @sector: start sector
* @nr_sects: number of sectors to discard
* @gfp_mask: memory allocation flags (for bio_alloc)
- * @flags: BLKDEV_IFL_* flags to control behaviour
+ * @flags: BLKDEV_DISCARD_* flags to control behaviour
*
* Description:
* Issue a discard request for the sectors in question.
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f6d917977b33..4b3f962a9c7a 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -43,6 +43,42 @@ static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file,
return ret;
}
+static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
+{
+ if (stat->nr_samples) {
+ seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
+ stat->nr_samples, stat->mean, stat->min, stat->max);
+ } else {
+ seq_puts(m, "samples=0");
+ }
+}
+
+static int queue_poll_stat_show(struct seq_file *m, void *v)
+{
+ struct request_queue *q = m->private;
+
+ seq_puts(m, "read: ");
+ print_stat(m, &q->poll_stat[READ]);
+ seq_puts(m, "\n");
+
+ seq_puts(m, "write: ");
+ print_stat(m, &q->poll_stat[WRITE]);
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static int queue_poll_stat_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, queue_poll_stat_show, inode->i_private);
+}
+
+static const struct file_operations queue_poll_stat_fops = {
+ .open = queue_poll_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static int hctx_state_show(struct seq_file *m, void *v)
{
struct blk_mq_hw_ctx *hctx = m->private;
@@ -322,60 +358,6 @@ static const struct file_operations hctx_io_poll_fops = {
.release = single_release,
};
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
- seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
- stat->nr_samples, stat->mean, stat->min, stat->max);
-}
-
-static int hctx_stats_show(struct seq_file *m, void *v)
-{
- struct blk_mq_hw_ctx *hctx = m->private;
- struct blk_rq_stat stat[2];
-
- blk_stat_init(&stat[BLK_STAT_READ]);
- blk_stat_init(&stat[BLK_STAT_WRITE]);
-
- blk_hctx_stat_get(hctx, stat);
-
- seq_puts(m, "read: ");
- print_stat(m, &stat[BLK_STAT_READ]);
- seq_puts(m, "\n");
-
- seq_puts(m, "write: ");
- print_stat(m, &stat[BLK_STAT_WRITE]);
- seq_puts(m, "\n");
- return 0;
-}
-
-static int hctx_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, hctx_stats_show, inode->i_private);
-}
-
-static ssize_t hctx_stats_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct seq_file *m = file->private_data;
- struct blk_mq_hw_ctx *hctx = m->private;
- struct blk_mq_ctx *ctx;
- int i;
-
- hctx_for_each_ctx(hctx, ctx, i) {
- blk_stat_init(&ctx->stat[BLK_STAT_READ]);
- blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
- }
- return count;
-}
-
-static const struct file_operations hctx_stats_fops = {
- .open = hctx_stats_open,
- .read = seq_read,
- .write = hctx_stats_write,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int hctx_dispatched_show(struct seq_file *m, void *v)
{
struct blk_mq_hw_ctx *hctx = m->private;
@@ -636,6 +618,11 @@ static const struct file_operations ctx_completed_fops = {
.release = single_release,
};
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
+ {"poll_stat", 0400, &queue_poll_stat_fops},
+ {},
+};
+
static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"state", 0400, &hctx_state_fops},
{"flags", 0400, &hctx_flags_fops},
@@ -646,7 +633,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"sched_tags", 0400, &hctx_sched_tags_fops},
{"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops},
{"io_poll", 0600, &hctx_io_poll_fops},
- {"stats", 0600, &hctx_stats_fops},
{"dispatched", 0600, &hctx_dispatched_fops},
{"queued", 0600, &hctx_queued_fops},
{"run", 0600, &hctx_run_fops},
@@ -753,6 +739,9 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
if (!q->mq_debugfs_dir)
goto err;
+ if (!debugfs_create_files(q->mq_debugfs_dir, q, blk_mq_debugfs_queue_attrs))
+ goto err;
+
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_debugfs_register_hctx(q, hctx))
goto err;
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index 966c2169762e..0c3354cf3552 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -23,7 +23,7 @@
* @pdev: PCI device associated with @set.
*
* This function assumes the PCI device @pdev has at least as many available
- * interrupt vetors as @set has queues. It will then queuery the vector
+ * interrupt vectors as @set has queues. It will then query the vector
* corresponding to each queue for it's affinity mask and built queue mapping
* that maps a queue to the CPUs that have irq affinity for the corresponding
* vector.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 572966f49596..724bcec0ca4f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -39,6 +39,9 @@
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
+static void blk_mq_poll_stats_start(struct request_queue *q);
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
+
/*
* Check if any of the ctx's have pending work in this hardware queue
*/
@@ -65,7 +68,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
}
-void blk_mq_freeze_queue_start(struct request_queue *q)
+void blk_freeze_queue_start(struct request_queue *q)
{
int freeze_depth;
@@ -75,7 +78,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
blk_mq_run_hw_queues(q, false);
}
}
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
+EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
void blk_mq_freeze_queue_wait(struct request_queue *q)
{
@@ -105,7 +108,7 @@ void blk_freeze_queue(struct request_queue *q)
* no blk_unfreeze_queue(), and blk_freeze_queue() is not
* exported to drivers as the only user for unfreeze is blk_mq.
*/
- blk_mq_freeze_queue_start(q);
+ blk_freeze_queue_start(q);
blk_mq_freeze_queue_wait(q);
}
@@ -431,15 +434,8 @@ static void blk_mq_ipi_complete_request(struct request *rq)
static void blk_mq_stat_add(struct request *rq)
{
if (rq->rq_flags & RQF_STATS) {
- /*
- * We could rq->mq_ctx here, but there's less of a risk
- * of races if we have the completion event add the stats
- * to the local software queue.
- */
- struct blk_mq_ctx *ctx;
-
- ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
- blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
+ blk_mq_poll_stats_start(rq->q);
+ blk_stat_add(rq);
}
}
@@ -491,7 +487,7 @@ void blk_mq_start_request(struct request *rq)
trace_block_rq_issue(q, rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
- blk_stat_set_issue_time(&rq->issue_stat);
+ blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
rq->rq_flags |= RQF_STATS;
wbt_issue(q->rq_wb, &rq->issue_stat);
}
@@ -526,6 +522,15 @@ void blk_mq_start_request(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_start_request);
+/*
+ * When we reach here because the queue is busy, the REQ_ATOM_COMPLETE
+ * flag isn't set yet, so there may be a race with the timeout handler.
+ * But given that rq->deadline has just been set in .queue_rq() in this
+ * situation, the race won't happen in practice, because rq->timeout
+ * should be big enough to cover the window between blk_mq_start_request()
+ * being called from .queue_rq() and REQ_ATOM_STARTED being cleared here.
+ */
static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
@@ -666,7 +671,7 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
* just be ignored. This can happen due to the bitflag ordering.
* Timeout first checks if STARTED is set, and if it is, assumes
* the request is active. But if we race with completion, then
- * we both flags will get cleared. So check here again, and ignore
+ * both flags will get cleared. So check here again, and ignore
* a timeout event with a request that isn't active.
*/
if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
@@ -699,6 +704,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
return;
+ /*
+ * The rq being checked may already have been freed and reallocated
+ * by the time we get here; we avoid this race by checking rq->deadline
+ * and the REQ_ATOM_COMPLETE flag together:
+ *
+ * - if rq->deadline is observed as the new value because the rq was
+ *   reused, the rq won't be timed out, since the new deadline has not
+ *   expired yet.
+ * - if rq->deadline is observed as the previous value, the
+ *   REQ_ATOM_COMPLETE flag won't be cleared in the reuse path, because
+ *   we put a barrier between setting rq->deadline and clearing the
+ *   flag in blk_mq_start_request(), so this rq won't be timed out
+ *   either.
+ */
if (time_after_eq(jiffies, rq->deadline)) {
if (!blk_mark_rq_complete(rq))
blk_mq_rq_timed_out(rq, reserved);
@@ -727,7 +745,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
* percpu_ref_tryget directly, because we need to be able to
* obtain a reference even in the short window between the queue
* starting to freeze, by dropping the first reference in
- * blk_mq_freeze_queue_start, and the moment the last request is
+ * blk_freeze_queue_start, and the moment the last request is
* consumed, marked by the instant q_usage_counter reaches
* zero.
*/
@@ -964,20 +982,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
{
struct blk_mq_hw_ctx *hctx;
struct request *rq;
- LIST_HEAD(driver_list);
- struct list_head *dptr;
int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
if (list_empty(list))
return false;
/*
- * Start off with dptr being NULL, so we start the first request
- * immediately, even if we have more pending.
- */
- dptr = NULL;
-
- /*
* Now process all the entries, sending them to the driver.
*/
errors = queued = 0;
@@ -1009,7 +1019,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
list_del_init(&rq->queuelist);
bd.rq = rq;
- bd.list = dptr;
/*
* Flag last if we have no more requests, or if we have more
@@ -1045,13 +1054,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
if (ret == BLK_MQ_RQ_QUEUE_BUSY)
break;
-
- /*
- * We've done the first request. If we have more than 1
- * left in the list, set dptr to defer issue.
- */
- if (!dptr && list->next != list->prev)
- dptr = &driver_list;
} while (!list_empty(list));
hctx->dispatched[queued_to_index(queued)]++;
@@ -1104,6 +1106,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
blk_mq_sched_dispatch_requests(hctx);
rcu_read_unlock();
} else {
+ might_sleep();
+
srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
blk_mq_sched_dispatch_requests(hctx);
srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
@@ -1453,13 +1457,12 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
}
-static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
+static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
bool may_sleep)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
.rq = rq,
- .list = NULL,
.last = 1
};
struct blk_mq_hw_ctx *hctx;
@@ -1485,8 +1488,6 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
return;
}
- __blk_mq_requeue_request(rq);
-
if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
*cookie = BLK_QC_T_NONE;
rq->errors = -EIO;
@@ -1494,22 +1495,36 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
return;
}
+ __blk_mq_requeue_request(rq);
insert:
blk_mq_sched_insert_request(rq, false, true, false, may_sleep);
}
-/*
- * Multiple hardware queue variant. This will not use per-process plugs,
- * but will attempt to bypass the hctx queueing if we can go straight to
- * hardware for SYNC IO.
- */
+static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq, blk_qc_t *cookie)
+{
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+ rcu_read_lock();
+ __blk_mq_try_issue_directly(rq, cookie, false);
+ rcu_read_unlock();
+ } else {
+ unsigned int srcu_idx;
+
+ might_sleep();
+
+ srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+ __blk_mq_try_issue_directly(rq, cookie, true);
+ srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+ }
+}
+
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
struct blk_mq_alloc_data data = { .flags = 0 };
struct request *rq;
- unsigned int request_count = 0, srcu_idx;
+ unsigned int request_count = 0;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
@@ -1545,145 +1560,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
cookie = request_to_qc_t(data.hctx, rq);
- if (unlikely(is_flush_fua)) {
- if (q->elevator)
- goto elv_insert;
- blk_mq_bio_to_request(rq, bio);
- blk_insert_flush(rq);
- goto run_queue;
- }
-
plug = current->plug;
- /*
- * If the driver supports defer issued based on 'last', then
- * queue it up like normal since we can potentially save some
- * CPU this way.
- */
- if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
- !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
- struct request *old_rq = NULL;
-
+ if (unlikely(is_flush_fua)) {
blk_mq_bio_to_request(rq, bio);
-
- /*
- * We do limited plugging. If the bio can be merged, do that.
- * Otherwise the existing request in the plug list will be
- * issued. So the plug list will have one request at most
- */
- if (plug) {
- /*
- * The plug list might get flushed before this. If that
- * happens, same_queue_rq is invalid and plug list is
- * empty
- */
- if (same_queue_rq && !list_empty(&plug->mq_list)) {
- old_rq = same_queue_rq;
- list_del_init(&old_rq->queuelist);
- }
- list_add_tail(&rq->queuelist, &plug->mq_list);
- } else /* is_sync */
- old_rq = rq;
- blk_mq_put_ctx(data.ctx);
- if (!old_rq)
- goto done;
-
- if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
- rcu_read_lock();
- blk_mq_try_issue_directly(old_rq, &cookie, false);
- rcu_read_unlock();
+ if (q->elevator) {
+ blk_mq_sched_insert_request(rq, false, true, true,
+ true);
} else {
- srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
- blk_mq_try_issue_directly(old_rq, &cookie, true);
- srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
+ blk_insert_flush(rq);
+ blk_mq_run_hw_queue(data.hctx, true);
}
- goto done;
- }
-
- if (q->elevator) {
-elv_insert:
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
- blk_mq_sched_insert_request(rq, false, true,
- !is_sync || is_flush_fua, true);
- goto done;
- }
- if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
- /*
- * For a SYNC request, send it to the hardware immediately. For
- * an ASYNC request, just ensure that we run it later on. The
- * latter allows for merging opportunities and more efficient
- * dispatching.
- */
-run_queue:
- blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
- }
- blk_mq_put_ctx(data.ctx);
-done:
- return cookie;
-}
-
-/*
- * Single hardware queue variant. This will attempt to use any per-process
- * plug for merging and IO deferral.
- */
-static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
-{
- const int is_sync = op_is_sync(bio->bi_opf);
- const int is_flush_fua = op_is_flush(bio->bi_opf);
- struct blk_plug *plug;
- unsigned int request_count = 0;
- struct blk_mq_alloc_data data = { .flags = 0 };
- struct request *rq;
- blk_qc_t cookie;
- unsigned int wb_acct;
-
- blk_queue_bounce(q, &bio);
-
- if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio_io_error(bio);
- return BLK_QC_T_NONE;
- }
-
- blk_queue_split(q, &bio, q->bio_split);
-
- if (!is_flush_fua && !blk_queue_nomerges(q)) {
- if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
- return BLK_QC_T_NONE;
- } else
- request_count = blk_plug_queued_count(q);
-
- if (blk_mq_sched_bio_merge(q, bio))
- return BLK_QC_T_NONE;
-
- wb_acct = wbt_wait(q->rq_wb, bio, NULL);
-
- trace_block_getrq(q, bio, bio->bi_opf);
-
- rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
- if (unlikely(!rq)) {
- __wbt_done(q->rq_wb, wb_acct);
- return BLK_QC_T_NONE;
- }
-
- wbt_track(&rq->issue_stat, wb_acct);
-
- cookie = request_to_qc_t(data.hctx, rq);
-
- if (unlikely(is_flush_fua)) {
- if (q->elevator)
- goto elv_insert;
- blk_mq_bio_to_request(rq, bio);
- blk_insert_flush(rq);
- goto run_queue;
- }
-
- /*
- * A task plug currently exists. Since this is completely lockless,
- * utilize that to temporarily store requests until the task is
- * either done or scheduled away.
- */
- plug = current->plug;
- if (plug) {
+ } else if (plug && q->nr_hw_queues == 1) {
struct request *last = NULL;
blk_mq_bio_to_request(rq, bio);
@@ -1694,13 +1581,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
*/
if (list_empty(&plug->mq_list))
request_count = 0;
+ else if (blk_queue_nomerges(q))
+ request_count = blk_plug_queued_count(q);
+
if (!request_count)
trace_block_plug(q);
else
last = list_entry_rq(plug->mq_list.prev);
- blk_mq_put_ctx(data.ctx);
-
if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
blk_flush_plug_list(plug, false);
@@ -1708,30 +1596,41 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
}
list_add_tail(&rq->queuelist, &plug->mq_list);
- return cookie;
- }
-
- if (q->elevator) {
-elv_insert:
- blk_mq_put_ctx(data.ctx);
+ } else if (plug && !blk_queue_nomerges(q)) {
blk_mq_bio_to_request(rq, bio);
- blk_mq_sched_insert_request(rq, false, true,
- !is_sync || is_flush_fua, true);
- goto done;
- }
- if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+
/*
- * For a SYNC request, send it to the hardware immediately. For
- * an ASYNC request, just ensure that we run it later on. The
- * latter allows for merging opportunities and more efficient
- * dispatching.
+ * We do limited plugging. If the bio can be merged, do that.
+ * Otherwise the existing request in the plug list will be
+ * issued. So the plug list will have one request at most.
+ * The plug list might get flushed before this. If that happens,
+ * the plug list is empty, and same_queue_rq is invalid.
*/
-run_queue:
- blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
- }
+ if (list_empty(&plug->mq_list))
+ same_queue_rq = NULL;
+ if (same_queue_rq)
+ list_del_init(&same_queue_rq->queuelist);
+ list_add_tail(&rq->queuelist, &plug->mq_list);
+
+ blk_mq_put_ctx(data.ctx);
+
+ if (same_queue_rq)
+ blk_mq_try_issue_directly(data.hctx, same_queue_rq,
+ &cookie);
+
+ return cookie;
+ } else if (q->nr_hw_queues > 1 && is_sync) {
+ blk_mq_put_ctx(data.ctx);
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_try_issue_directly(data.hctx, rq, &cookie);
+ return cookie;
+ } else if (q->elevator) {
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_sched_insert_request(rq, false, true, true, true);
+ } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio))
+ blk_mq_run_hw_queue(data.hctx, true);
blk_mq_put_ctx(data.ctx);
-done:
return cookie;
}
@@ -2067,8 +1966,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
spin_lock_init(&__ctx->lock);
INIT_LIST_HEAD(&__ctx->rq_list);
__ctx->queue = q;
- blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
- blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpu_online(i))
@@ -2364,6 +2261,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/* mark the queue as mq asap */
q->mq_ops = set->ops;
+ q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
+ blk_stat_rq_ddir, 2, q);
+ if (!q->poll_cb)
+ goto err_exit;
+
q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
if (!q->queue_ctx)
goto err_exit;
@@ -2398,10 +2300,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
- if (q->nr_hw_queues > 1)
- blk_queue_make_request(q, blk_mq_make_request);
- else
- blk_queue_make_request(q, blk_sq_make_request);
+ blk_queue_make_request(q, blk_mq_make_request);
/*
* Do this after blk_queue_make_request() overrides it...
@@ -2456,8 +2355,6 @@ void blk_mq_free_queue(struct request_queue *q)
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);
- wbt_exit(q);
-
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
@@ -2502,7 +2399,7 @@ static void blk_mq_queue_reinit_work(void)
* take place in parallel.
*/
list_for_each_entry(q, &all_q_list, all_q_node)
- blk_mq_freeze_queue_start(q);
+ blk_freeze_queue_start(q);
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_wait(q);
@@ -2755,16 +2652,6 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
-
- /*
- * Manually set the make_request_fn as blk_queue_make_request
- * resets a lot of the queue settings.
- */
- if (q->nr_hw_queues > 1)
- q->make_request_fn = blk_mq_make_request;
- else
- q->make_request_fn = blk_sq_make_request;
-
blk_mq_queue_reinit(q, cpu_online_mask);
}
@@ -2773,28 +2660,53 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
+/* Enable polling stats and return whether they were already enabled. */
+static bool blk_poll_stats_enable(struct request_queue *q)
+{
+ if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+ return true;
+ blk_stat_add_callback(q, q->poll_cb);
+ return false;
+}
+
+static void blk_mq_poll_stats_start(struct request_queue *q)
+{
+ /*
+ * We don't arm the callback if polling stats are not enabled or the
+ * callback is already active.
+ */
+ if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+ blk_stat_is_active(q->poll_cb))
+ return;
+
+ blk_stat_activate_msecs(q->poll_cb, 100);
+}
+
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
+{
+ struct request_queue *q = cb->data;
+
+ if (cb->stat[READ].nr_samples)
+ q->poll_stat[READ] = cb->stat[READ];
+ if (cb->stat[WRITE].nr_samples)
+ q->poll_stat[WRITE] = cb->stat[WRITE];
+}
+
static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- struct blk_rq_stat stat[2];
unsigned long ret = 0;
/*
* If stats collection isn't on, don't sleep but turn it on for
* future users
*/
- if (!blk_stat_enable(q))
+ if (!blk_poll_stats_enable(q))
return 0;
/*
- * We don't have to do this once per IO, should optimize this
- * to just use the current window of stats until it changes
- */
- memset(&stat, 0, sizeof(stat));
- blk_hctx_stat_get(hctx, stat);
-
- /*
* As an optimistic guess, use half of the mean service time
* for this type of request. We can (and should) make this smarter.
* For instance, if the completion latencies are tight, we can
@@ -2802,10 +2714,10 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
* important on devices where the completion latencies are longer
* than ~10 usec.
*/
- if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples)
- ret = (stat[BLK_STAT_READ].mean + 1) / 2;
- else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples)
- ret = (stat[BLK_STAT_WRITE].mean + 1) / 2;
+ if (req_op(rq) == REQ_OP_READ && q->poll_stat[READ].nr_samples)
+ ret = (q->poll_stat[READ].mean + 1) / 2;
+ else if (req_op(rq) == REQ_OP_WRITE && q->poll_stat[WRITE].nr_samples)
+ ret = (q->poll_stat[WRITE].mean + 1) / 2;
return ret;
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 660a17e1d033..7e6f2e467696 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -20,7 +20,6 @@ struct blk_mq_ctx {
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
- struct blk_rq_stat stat[2];
struct request_queue *queue;
struct kobject kobj;
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 186fcb981e9b..e77ec52f5bb5 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -4,10 +4,33 @@
* Copyright (C) 2016 Jens Axboe
*/
#include <linux/kernel.h>
+#include <linux/rculist.h>
#include <linux/blk-mq.h>
#include "blk-stat.h"
#include "blk-mq.h"
+#include "blk.h"
+
+#define BLK_RQ_STAT_BATCH 64
+
+struct blk_queue_stats {
+ struct list_head callbacks;
+ spinlock_t lock;
+ bool enable_accounting;
+};
+
+unsigned int blk_stat_rq_ddir(const struct request *rq)
+{
+ return rq_data_dir(rq);
+}
+EXPORT_SYMBOL_GPL(blk_stat_rq_ddir);
+
+static void blk_stat_init(struct blk_rq_stat *stat)
+{
+ stat->min = -1ULL;
+ stat->max = stat->nr_samples = stat->mean = 0;
+ stat->batch = stat->nr_batch = 0;
+}
static void blk_stat_flush_batch(struct blk_rq_stat *stat)
{
@@ -48,209 +71,183 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
dst->nr_samples += src->nr_samples;
}
-static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
{
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
- uint64_t latest = 0;
- int i, j, nr;
-
- blk_stat_init(&dst[BLK_STAT_READ]);
- blk_stat_init(&dst[BLK_STAT_WRITE]);
-
- nr = 0;
- do {
- uint64_t newest = 0;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx_for_each_ctx(hctx, ctx, j) {
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
-
- if (!ctx->stat[BLK_STAT_READ].nr_samples &&
- !ctx->stat[BLK_STAT_WRITE].nr_samples)
- continue;
- if (ctx->stat[BLK_STAT_READ].time > newest)
- newest = ctx->stat[BLK_STAT_READ].time;
- if (ctx->stat[BLK_STAT_WRITE].time > newest)
- newest = ctx->stat[BLK_STAT_WRITE].time;
- }
- }
+ stat->min = min(stat->min, value);
+ stat->max = max(stat->max, value);
- /*
- * No samples
- */
- if (!newest)
- break;
-
- if (newest > latest)
- latest = newest;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx_for_each_ctx(hctx, ctx, j) {
- if (ctx->stat[BLK_STAT_READ].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_READ],
- &ctx->stat[BLK_STAT_READ]);
- nr++;
- }
- if (ctx->stat[BLK_STAT_WRITE].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_WRITE],
- &ctx->stat[BLK_STAT_WRITE]);
- nr++;
- }
- }
- }
- /*
- * If we race on finding an entry, just loop back again.
- * Should be very rare.
- */
- } while (!nr);
+ if (stat->batch + value < stat->batch ||
+ stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
+ blk_stat_flush_batch(stat);
- dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest;
+ stat->batch += value;
+ stat->nr_batch++;
}
-void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+void blk_stat_add(struct request *rq)
{
- if (q->mq_ops)
- blk_mq_stat_get(q, dst);
- else {
- blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]);
- blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]);
- memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ],
- sizeof(struct blk_rq_stat));
- memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE],
- sizeof(struct blk_rq_stat));
+ struct request_queue *q = rq->q;
+ struct blk_stat_callback *cb;
+ struct blk_rq_stat *stat;
+ int bucket;
+ s64 now, value;
+
+ now = __blk_stat_time(ktime_to_ns(ktime_get()));
+ if (now < blk_stat_time(&rq->issue_stat))
+ return;
+
+ value = now - blk_stat_time(&rq->issue_stat);
+
+ blk_throtl_stat_add(rq, value);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
+ if (blk_stat_is_active(cb)) {
+ bucket = cb->bucket_fn(rq);
+ stat = &this_cpu_ptr(cb->cpu_stat)[bucket];
+ __blk_stat_add(stat, value);
+ }
}
+ rcu_read_unlock();
}
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
+static void blk_stat_timer_fn(unsigned long data)
{
- struct blk_mq_ctx *ctx;
- unsigned int i, nr;
-
- nr = 0;
- do {
- uint64_t newest = 0;
+ struct blk_stat_callback *cb = (void *)data;
+ unsigned int bucket;
+ int cpu;
- hctx_for_each_ctx(hctx, ctx, i) {
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
+ for (bucket = 0; bucket < cb->buckets; bucket++)
+ blk_stat_init(&cb->stat[bucket]);
- if (!ctx->stat[BLK_STAT_READ].nr_samples &&
- !ctx->stat[BLK_STAT_WRITE].nr_samples)
- continue;
+ for_each_online_cpu(cpu) {
+ struct blk_rq_stat *cpu_stat;
- if (ctx->stat[BLK_STAT_READ].time > newest)
- newest = ctx->stat[BLK_STAT_READ].time;
- if (ctx->stat[BLK_STAT_WRITE].time > newest)
- newest = ctx->stat[BLK_STAT_WRITE].time;
+ cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
+ for (bucket = 0; bucket < cb->buckets; bucket++) {
+ blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
+ blk_stat_init(&cpu_stat[bucket]);
}
+ }
- if (!newest)
- break;
-
- hctx_for_each_ctx(hctx, ctx, i) {
- if (ctx->stat[BLK_STAT_READ].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_READ],
- &ctx->stat[BLK_STAT_READ]);
- nr++;
- }
- if (ctx->stat[BLK_STAT_WRITE].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_WRITE],
- &ctx->stat[BLK_STAT_WRITE]);
- nr++;
- }
- }
- /*
- * If we race on finding an entry, just loop back again.
- * Should be very rare, as the window is only updated
- * occasionally
- */
- } while (!nr);
+ cb->timer_fn(cb);
}
-static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
+ unsigned int (*bucket_fn)(const struct request *),
+ unsigned int buckets, void *data)
{
- stat->min = -1ULL;
- stat->max = stat->nr_samples = stat->mean = 0;
- stat->batch = stat->nr_batch = 0;
- stat->time = time_now & BLK_STAT_NSEC_MASK;
-}
+ struct blk_stat_callback *cb;
-void blk_stat_init(struct blk_rq_stat *stat)
-{
- __blk_stat_init(stat, ktime_to_ns(ktime_get()));
-}
+ cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+ if (!cb)
+ return NULL;
-static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
-{
- return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
+ cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat),
+ GFP_KERNEL);
+ if (!cb->stat) {
+ kfree(cb);
+ return NULL;
+ }
+ cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat),
+ __alignof__(struct blk_rq_stat));
+ if (!cb->cpu_stat) {
+ kfree(cb->stat);
+ kfree(cb);
+ return NULL;
+ }
+
+ cb->timer_fn = timer_fn;
+ cb->bucket_fn = bucket_fn;
+ cb->data = data;
+ cb->buckets = buckets;
+ setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb);
+
+ return cb;
}
+EXPORT_SYMBOL_GPL(blk_stat_alloc_callback);
-bool blk_stat_is_current(struct blk_rq_stat *stat)
+void blk_stat_add_callback(struct request_queue *q,
+ struct blk_stat_callback *cb)
{
- return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
+ unsigned int bucket;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct blk_rq_stat *cpu_stat;
+
+ cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
+ for (bucket = 0; bucket < cb->buckets; bucket++)
+ blk_stat_init(&cpu_stat[bucket]);
+ }
+
+ spin_lock(&q->stats->lock);
+ list_add_tail_rcu(&cb->list, &q->stats->callbacks);
+ set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+ spin_unlock(&q->stats->lock);
}
+EXPORT_SYMBOL_GPL(blk_stat_add_callback);
-void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
+void blk_stat_remove_callback(struct request_queue *q,
+ struct blk_stat_callback *cb)
{
- s64 now, value;
+ spin_lock(&q->stats->lock);
+ list_del_rcu(&cb->list);
+ if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
+ clear_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+ spin_unlock(&q->stats->lock);
- now = __blk_stat_time(ktime_to_ns(ktime_get()));
- if (now < blk_stat_time(&rq->issue_stat))
- return;
-
- if (!__blk_stat_is_current(stat, now))
- __blk_stat_init(stat, now);
+ del_timer_sync(&cb->timer);
+}
+EXPORT_SYMBOL_GPL(blk_stat_remove_callback);
- value = now - blk_stat_time(&rq->issue_stat);
- if (value > stat->max)
- stat->max = value;
- if (value < stat->min)
- stat->min = value;
+static void blk_stat_free_callback_rcu(struct rcu_head *head)
+{
+ struct blk_stat_callback *cb;
- if (stat->batch + value < stat->batch ||
- stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
- blk_stat_flush_batch(stat);
+ cb = container_of(head, struct blk_stat_callback, rcu);
+ free_percpu(cb->cpu_stat);
+ kfree(cb->stat);
+ kfree(cb);
+}
- stat->batch += value;
- stat->nr_batch++;
+void blk_stat_free_callback(struct blk_stat_callback *cb)
+{
+ if (cb)
+ call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
}
+EXPORT_SYMBOL_GPL(blk_stat_free_callback);
-void blk_stat_clear(struct request_queue *q)
+void blk_stat_enable_accounting(struct request_queue *q)
{
- if (q->mq_ops) {
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
- int i, j;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx_for_each_ctx(hctx, ctx, j) {
- blk_stat_init(&ctx->stat[BLK_STAT_READ]);
- blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
- }
- }
- } else {
- blk_stat_init(&q->rq_stats[BLK_STAT_READ]);
- blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]);
- }
+ spin_lock(&q->stats->lock);
+ q->stats->enable_accounting = true;
+ set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+ spin_unlock(&q->stats->lock);
}
-void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+struct blk_queue_stats *blk_alloc_queue_stats(void)
{
- stat->time = (stat->time & BLK_STAT_MASK) |
- (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
+ struct blk_queue_stats *stats;
+
+ stats = kmalloc(sizeof(*stats), GFP_KERNEL);
+ if (!stats)
+ return NULL;
+
+ INIT_LIST_HEAD(&stats->callbacks);
+ spin_lock_init(&stats->lock);
+ stats->enable_accounting = false;
+
+ return stats;
}
-/*
- * Enable stat tracking, return whether it was enabled
- */
-bool blk_stat_enable(struct request_queue *q)
+void blk_free_queue_stats(struct blk_queue_stats *stats)
{
- if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
- set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
- return false;
- }
+ if (!stats)
+ return;
+
+ WARN_ON(!list_empty(&stats->callbacks));
- return true;
+ kfree(stats);
}
diff --git a/block/blk-stat.h b/block/blk-stat.h
index a2050a0a5314..53f08a63bf15 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -1,33 +1,84 @@
#ifndef BLK_STAT_H
#define BLK_STAT_H
-/*
- * ~0.13s window as a power-of-2 (2^27 nsecs)
- */
-#define BLK_STAT_NSEC 134217728ULL
-#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1)
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/ktime.h>
+#include <linux/rcupdate.h>
+#include <linux/timer.h>
/*
- * Upper 3 bits can be used elsewhere
+ * from upper:
+ * 3 bits: reserved for other usage
+ * 12 bits: size
+ * 49 bits: time
*/
#define BLK_STAT_RES_BITS 3
-#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS)
-#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1)
-#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK
+#define BLK_STAT_SIZE_BITS 12
+#define BLK_STAT_RES_SHIFT (64 - BLK_STAT_RES_BITS)
+#define BLK_STAT_SIZE_SHIFT (BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS)
+#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SIZE_SHIFT) - 1)
+#define BLK_STAT_SIZE_MASK \
+ (((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT)
+#define BLK_STAT_RES_MASK (~((1ULL << BLK_STAT_RES_SHIFT) - 1))
+
+/**
+ * struct blk_stat_callback - Block statistics callback.
+ *
+ * A &struct blk_stat_callback is associated with a &struct request_queue. While
+ * @timer is active, that queue's request completion latencies are sorted into
+ * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the
+ * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked.
+ */
+struct blk_stat_callback {
+ /*
+ * @list: RCU list of callbacks for a &struct request_queue.
+ */
+ struct list_head list;
+
+ /**
+ * @timer: Timer for the next callback invocation.
+ */
+ struct timer_list timer;
+
+ /**
+ * @cpu_stat: Per-cpu statistics buckets.
+ */
+ struct blk_rq_stat __percpu *cpu_stat;
-enum {
- BLK_STAT_READ = 0,
- BLK_STAT_WRITE,
+ /**
+ * @bucket_fn: Given a request, returns which statistics bucket it
+ * should be accounted under.
+ */
+ unsigned int (*bucket_fn)(const struct request *);
+
+ /**
+ * @buckets: Number of statistics buckets.
+ */
+ unsigned int buckets;
+
+ /**
+ * @stat: Array of statistics buckets.
+ */
+ struct blk_rq_stat *stat;
+
+ /**
+ * @fn: Callback function.
+ */
+ void (*timer_fn)(struct blk_stat_callback *);
+
+ /**
+ * @data: Private pointer for the user.
+ */
+ void *data;
+
+ struct rcu_head rcu;
};
-void blk_stat_add(struct blk_rq_stat *, struct request *);
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
-void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
-void blk_stat_clear(struct request_queue *);
-void blk_stat_init(struct blk_rq_stat *);
-bool blk_stat_is_current(struct blk_rq_stat *);
-void blk_stat_set_issue_time(struct blk_issue_stat *);
-bool blk_stat_enable(struct request_queue *);
+struct blk_queue_stats *blk_alloc_queue_stats(void);
+void blk_free_queue_stats(struct blk_queue_stats *);
+
+void blk_stat_add(struct request *);
static inline u64 __blk_stat_time(u64 time)
{
@@ -36,7 +87,128 @@ static inline u64 __blk_stat_time(u64 time)
static inline u64 blk_stat_time(struct blk_issue_stat *stat)
{
- return __blk_stat_time(stat->time);
+ return __blk_stat_time(stat->stat);
+}
+
+static inline sector_t blk_capped_size(sector_t size)
+{
+ return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1);
+}
+
+static inline sector_t blk_stat_size(struct blk_issue_stat *stat)
+{
+ return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT;
+}
+
+static inline void blk_stat_set_issue(struct blk_issue_stat *stat,
+ sector_t size)
+{
+ stat->stat = (stat->stat & BLK_STAT_RES_MASK) |
+ (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) |
+ (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT);
+}
+
+/* record time/size info in the request but do not add a callback */
+void blk_stat_enable_accounting(struct request_queue *q);
+
+/**
+ * blk_stat_rq_ddir() - Bucket callback function for the request data direction.
+ * @rq: Request.
+ *
+ * This is the same as rq_data_dir() but as a function so it can be used as
+ * @bucket_fn for blk_stat_alloc_callback().
+ *
+ * Return: Data direction of the request, either READ or WRITE.
+ */
+unsigned int blk_stat_rq_ddir(const struct request *rq);
+
+/**
+ * blk_stat_alloc_callback() - Allocate a block statistics callback.
+ * @timer_fn: Timer callback function.
+ * @bucket_fn: Bucket callback function.
+ * @buckets: Number of statistics buckets.
+ * @data: Value for the @data field of the &struct blk_stat_callback.
+ *
+ * See &struct blk_stat_callback for details on the callback functions.
+ *
+ * Return: &struct blk_stat_callback on success or NULL on ENOMEM.
+ */
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
+ unsigned int (*bucket_fn)(const struct request *),
+ unsigned int buckets, void *data);
+
+/**
+ * blk_stat_add_callback() - Add a block statistics callback to be run on a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * Note that a single &struct blk_stat_callback can only be added to a single
+ * &struct request_queue.
+ */
+void blk_stat_add_callback(struct request_queue *q,
+ struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_remove_callback() - Remove a block statistics callback from a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * When this returns, the callback is not running on any CPUs and will not be
+ * called again unless readded.
+ */
+void blk_stat_remove_callback(struct request_queue *q,
+ struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_free_callback() - Free a block statistics callback.
+ * @cb: The callback.
+ *
+ * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must
+ * not be associated with a request queue. I.e., if it was previously added with
+ * blk_stat_add_callback(), it must also have been removed since then with
+ * blk_stat_remove_callback().
+ */
+void blk_stat_free_callback(struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_is_active() - Check if a block statistics callback is currently
+ * gathering statistics.
+ * @cb: The callback.
+ */
+static inline bool blk_stat_is_active(struct blk_stat_callback *cb)
+{
+ return timer_pending(&cb->timer);
+}
+
+/**
+ * blk_stat_activate_nsecs() - Gather block statistics during a time window in
+ * nanoseconds.
+ * @cb: The callback.
+ * @nsecs: Number of nanoseconds to gather statistics for.
+ *
+ * The timer callback will be called when the window expires.
+ */
+static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
+ u64 nsecs)
+{
+ mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs));
+}
+
+/**
+ * blk_stat_activate_msecs() - Gather block statistics during a time window in
+ * milliseconds.
+ * @cb: The callback.
+ * @msecs: Number of milliseconds to gather statistics for.
+ *
+ * The timer callback will be called when the window expires.
+ */
+static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
+ unsigned int msecs)
+{
+ mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
}
#endif
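
To make the new callback interface concrete, here is a minimal sketch of a consumer, modeled on the poll-stats user added to blk-mq.c in this series. The example_* names, the two-bucket split by data direction, and the 100 ms window are illustrative choices, not part of the patch.

#include "blk-stat.h"

static struct blk_stat_callback *example_cb;

static void example_timer_fn(struct blk_stat_callback *cb)
{
        /* cb->stat[] holds the aggregated buckets for the expired window. */
        if (cb->stat[READ].nr_samples)
                pr_debug("read: mean=%lld ns over %d samples\n",
                         cb->stat[READ].mean, cb->stat[READ].nr_samples);
        if (cb->stat[WRITE].nr_samples)
                pr_debug("write: mean=%lld ns over %d samples\n",
                         cb->stat[WRITE].mean, cb->stat[WRITE].nr_samples);
}

static int example_attach(struct request_queue *q)
{
        /* Two buckets, split by data direction, as the poll stats do. */
        example_cb = blk_stat_alloc_callback(example_timer_fn,
                                             blk_stat_rq_ddir, 2, q);
        if (!example_cb)
                return -ENOMEM;

        blk_stat_add_callback(q, example_cb);     /* sets QUEUE_FLAG_STATS */
        blk_stat_activate_msecs(example_cb, 100); /* sample for ~100 ms */
        return 0;
}

static void example_detach(struct request_queue *q)
{
        blk_stat_remove_callback(q, example_cb);  /* timer is stopped here */
        blk_stat_free_callback(example_cb);       /* freed after an RCU grace period */
}
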
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 37f0b3ad635e..c47db43a40cc 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -503,26 +503,6 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
return queue_var_show(blk_queue_dax(q), page);
}
-static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
-{
- return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
- pre, (long long) stat->nr_samples,
- (long long) stat->mean, (long long) stat->min,
- (long long) stat->max);
-}
-
-static ssize_t queue_stats_show(struct request_queue *q, char *page)
-{
- struct blk_rq_stat stat[2];
- ssize_t ret;
-
- blk_queue_stat_get(q, stat);
-
- ret = print_stat(page, &stat[BLK_STAT_READ], "read :");
- ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:");
- return ret;
-}
-
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
@@ -691,17 +671,20 @@ static struct queue_sysfs_entry queue_dax_entry = {
.show = queue_dax_show,
};
-static struct queue_sysfs_entry queue_stats_entry = {
- .attr = {.name = "stats", .mode = S_IRUGO },
- .show = queue_stats_show,
-};
-
static struct queue_sysfs_entry queue_wb_lat_entry = {
.attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
.show = queue_wb_lat_show,
.store = queue_wb_lat_store,
};
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static struct queue_sysfs_entry throtl_sample_time_entry = {
+ .attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR },
+ .show = blk_throtl_sample_time_show,
+ .store = blk_throtl_sample_time_store,
+};
+#endif
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -733,9 +716,11 @@ static struct attribute *default_attrs[] = {
&queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_dax_entry.attr,
- &queue_stats_entry.attr,
&queue_wb_lat_entry.attr,
&queue_poll_delay_entry.attr,
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ &throtl_sample_time_entry.attr,
+#endif
NULL,
};
@@ -810,7 +795,9 @@ static void blk_release_queue(struct kobject *kobj)
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
- wbt_exit(q);
+ if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+ blk_stat_remove_callback(q, q->poll_cb);
+ blk_stat_free_callback(q->poll_cb);
bdi_put(q->backing_dev_info);
blkcg_exit_queue(q);
@@ -819,6 +806,8 @@ static void blk_release_queue(struct kobject *kobj)
elevator_exit(q, q->elevator);
}
+ blk_free_queue_stats(q->stats);
+
blk_exit_rl(&q->root_rl);
if (q->queue_tags)
@@ -881,6 +870,11 @@ int blk_register_queue(struct gendisk *disk)
if (WARN_ON(!q))
return -ENXIO;
+ WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
+ "%s is registering an already registered queue\n",
+ kobject_name(&dev->kobj));
+ queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
+
/*
* SCSI probing may synchronously create and destroy a lot of
* request_queues for non-existent devices. Shutting down a fully
@@ -916,6 +910,8 @@ int blk_register_queue(struct gendisk *disk)
blk_wb_init(q);
+ blk_throtl_register_queue(q);
+
if (q->request_fn || (q->mq_ops && q->elevator)) {
ret = elv_register_queue(q);
if (ret) {
@@ -939,6 +935,11 @@ void blk_unregister_queue(struct gendisk *disk)
if (WARN_ON(!q))
return;
+ queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
+
+ wbt_exit(q);
+
if (q->mq_ops)
blk_mq_unregister_dev(disk_to_dev(disk), q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8fab716e4059..c82bf9b1fe72 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -18,8 +18,17 @@ static int throtl_grp_quantum = 8;
/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;
-/* Throttling is performed over 100ms slice and after that slice is renewed */
-static unsigned long throtl_slice = HZ/10; /* 100 ms */
+/* Throttling is performed over a slice and after that slice is renewed */
+#define DFL_THROTL_SLICE_HD (HZ / 10)
+#define DFL_THROTL_SLICE_SSD (HZ / 50)
+#define MAX_THROTL_SLICE (HZ)
+#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
+#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
+#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
+/* default latency target is 0, i.e., guarantee IO latency by default */
+#define DFL_LATENCY_TARGET (0)
+
+#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
static struct blkcg_policy blkcg_policy_throtl;
@@ -83,6 +92,12 @@ enum tg_state_flags {
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
+enum {
+ LIMIT_LOW,
+ LIMIT_MAX,
+ LIMIT_CNT,
+};
+
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
@@ -119,20 +134,54 @@ struct throtl_grp {
/* are there any throtl rules between this group and td? */
bool has_rules[2];
- /* bytes per second rate limits */
- uint64_t bps[2];
+ /* internally used bytes per second rate limits */
+ uint64_t bps[2][LIMIT_CNT];
+ /* user configured bps limits */
+ uint64_t bps_conf[2][LIMIT_CNT];
- /* IOPS limits */
- unsigned int iops[2];
+ /* internally used IOPS limits */
+ unsigned int iops[2][LIMIT_CNT];
+ /* user configured IOPS limits */
+ unsigned int iops_conf[2][LIMIT_CNT];
/* Number of bytes disptached in current slice */
uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
+ unsigned long last_low_overflow_time[2];
+
+ uint64_t last_bytes_disp[2];
+ unsigned int last_io_disp[2];
+
+ unsigned long last_check_time;
+
+ unsigned long latency_target; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
+
+ unsigned long last_finish_time; /* ns / 1024 */
+ unsigned long checked_last_finish_time; /* ns / 1024 */
+ unsigned long avg_idletime; /* ns / 1024 */
+ unsigned long idletime_threshold; /* us */
+
+ unsigned int bio_cnt; /* total bios */
+ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+ unsigned long bio_cnt_reset_time;
+};
+
+/* We measure latency for request size from <= 4k to >= 1M */
+#define LATENCY_BUCKET_SIZE 9
+
+struct latency_bucket {
+ unsigned long total_latency; /* ns / 1024 */
+ int samples;
+};
+
+struct avg_latency_bucket {
+ unsigned long latency; /* ns / 1024 */
+ bool valid;
};
struct throtl_data
@@ -145,8 +194,26 @@ struct throtl_data
/* Total Number of queued bios on READ and WRITE lists */
unsigned int nr_queued[2];
+ unsigned int throtl_slice;
+
/* Work for dispatching throttled bios */
struct work_struct dispatch_work;
+ unsigned int limit_index;
+ bool limit_valid[LIMIT_CNT];
+
+ unsigned long dft_idletime_threshold; /* us */
+
+ unsigned long low_upgrade_time;
+ unsigned long low_downgrade_time;
+
+ unsigned int scale;
+
+ struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+ struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+ struct latency_bucket __percpu *latency_buckets;
+ unsigned long last_calculate_time;
+
+ bool track_bio_latency;
};
static void throtl_pending_timer_fn(unsigned long arg);
@@ -198,6 +265,76 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
return container_of(sq, struct throtl_data, service_queue);
}
+/*
+ * A cgroup's LIMIT_MAX limit is scaled if a low limit is set. The scaling
+ * makes IO dispatch smoother.
+ * Scale up: scale up linearly according to the elapsed time since the
+ *           upgrade. For every throtl_slice, the limit scales up by half of
+ *           the .low limit until it hits the .max limit.
+ * Scale down: scale down exponentially if a cgroup doesn't hit its .low limit
+ */
+static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
+{
+ /* arbitrary value to avoid too big scale */
+ if (td->scale < 4096 && time_after_eq(jiffies,
+ td->low_upgrade_time + td->scale * td->throtl_slice))
+ td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
+
+ return low + (low >> 1) * td->scale;
+}
+
+static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
+{
+ struct blkcg_gq *blkg = tg_to_blkg(tg);
+ struct throtl_data *td;
+ uint64_t ret;
+
+ if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
+ return U64_MAX;
+
+ td = tg->td;
+ ret = tg->bps[rw][td->limit_index];
+ if (ret == 0 && td->limit_index == LIMIT_LOW)
+ return tg->bps[rw][LIMIT_MAX];
+
+ if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
+ tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
+ uint64_t adjusted;
+
+ adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
+ ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
+ }
+ return ret;
+}
+
+static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
+{
+ struct blkcg_gq *blkg = tg_to_blkg(tg);
+ struct throtl_data *td;
+ unsigned int ret;
+
+ if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
+ return UINT_MAX;
+ td = tg->td;
+ ret = tg->iops[rw][td->limit_index];
+ if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
+ return tg->iops[rw][LIMIT_MAX];
+
+ if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
+ tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
+ uint64_t adjusted;
+
+ adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
+ if (adjusted > UINT_MAX)
+ adjusted = UINT_MAX;
+ ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
+ }
+ return ret;
+}
+
+#define request_bucket_index(sectors) \
+ clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
+
/**
* throtl_log - log debug message via blktrace
* @sq: the service_queue being reported
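Illustrative only, not part of the patch. To make the scale-up rule above concrete: throtl_adjusted_limit() returns low + (low >> 1) * scale, and td->scale grows by one for every throtl_slice since the upgrade, so the effective limit ramps from .low toward .max in steps of half the .low value; tg_bps_limit() and tg_iops_limit() then cap the result at .max. A standalone sketch of the arithmetic, with a made-up 10 MB/s .low limit:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t low = 10ULL * 1024 * 1024;	/* assumed .low of 10 MB/s */
	unsigned int scale;

	/* td->scale grows by one for every throtl_slice since the upgrade */
	for (scale = 0; scale <= 4; scale++)
		printf("scale=%u adjusted=%llu\n", scale,
		       (unsigned long long)(low + (low >> 1) * scale));
	/* prints 10M, 15M, 20M, 25M, 30M; the caller caps this at .max */
	return 0;
}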
@@ -334,10 +471,17 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
}
RB_CLEAR_NODE(&tg->rb_node);
- tg->bps[READ] = -1;
- tg->bps[WRITE] = -1;
- tg->iops[READ] = -1;
- tg->iops[WRITE] = -1;
+ tg->bps[READ][LIMIT_MAX] = U64_MAX;
+ tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
+ tg->iops[READ][LIMIT_MAX] = UINT_MAX;
+ tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
+ tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
+ tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
+ tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
+ tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
+ /* LIMIT_LOW will have default value 0 */
+
+ tg->latency_target = DFL_LATENCY_TARGET;
return &tg->pd;
}
@@ -366,6 +510,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
tg->td = td;
+
+ tg->idletime_threshold = td->dft_idletime_threshold;
}
/*
@@ -376,20 +522,59 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
static void tg_update_has_rules(struct throtl_grp *tg)
{
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
+ struct throtl_data *td = tg->td;
int rw;
for (rw = READ; rw <= WRITE; rw++)
tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
- (tg->bps[rw] != -1 || tg->iops[rw] != -1);
+ (td->limit_valid[td->limit_index] &&
+ (tg_bps_limit(tg, rw) != U64_MAX ||
+ tg_iops_limit(tg, rw) != UINT_MAX));
}
static void throtl_pd_online(struct blkg_policy_data *pd)
{
+ struct throtl_grp *tg = pd_to_tg(pd);
/*
* We don't want new groups to escape the limits of its ancestors.
* Update has_rules[] after a new group is brought online.
*/
- tg_update_has_rules(pd_to_tg(pd));
+ tg_update_has_rules(tg);
+}
+
+static void blk_throtl_update_limit_valid(struct throtl_data *td)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+ bool low_valid = false;
+
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
+ tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+ low_valid = true;
+ }
+ rcu_read_unlock();
+
+ td->limit_valid[LIMIT_LOW] = low_valid;
+}
+
+static void throtl_upgrade_state(struct throtl_data *td);
+static void throtl_pd_offline(struct blkg_policy_data *pd)
+{
+ struct throtl_grp *tg = pd_to_tg(pd);
+
+ tg->bps[READ][LIMIT_LOW] = 0;
+ tg->bps[WRITE][LIMIT_LOW] = 0;
+ tg->iops[READ][LIMIT_LOW] = 0;
+ tg->iops[WRITE][LIMIT_LOW] = 0;
+
+ blk_throtl_update_limit_valid(tg->td);
+
+ if (!tg->td->limit_valid[tg->td->limit_index])
+ throtl_upgrade_state(tg->td);
}
static void throtl_pd_free(struct blkg_policy_data *pd)
@@ -499,6 +684,17 @@ static void throtl_dequeue_tg(struct throtl_grp *tg)
static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
unsigned long expires)
{
+ unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice;
+
+ /*
+	 * Since we adjust the throttle limit dynamically, the sleep time
+	 * calculated according to the previous limit might be invalid. It's
+	 * possible the cgroup's sleep time is very long while no other cgroup
+	 * has IO running, so nobody notices the limit change. Cap the sleep so
+	 * the cgroup doesn't miss the notification for too long.
+ */
+ if (time_after(expires, max_expire))
+ expires = max_expire;
mod_timer(&sq->pending_timer, expires);
throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
expires - jiffies, jiffies);
@@ -556,7 +752,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
if (time_after_eq(start, tg->slice_start[rw]))
tg->slice_start[rw] = start;
- tg->slice_end[rw] = jiffies + throtl_slice;
+ tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
throtl_log(&tg->service_queue,
"[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -568,7 +764,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies;
- tg->slice_end[rw] = jiffies + throtl_slice;
+ tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -578,13 +774,13 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
- tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+ tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
}
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
- tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+ tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
throtl_log(&tg->service_queue,
"[%c] extend slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -624,19 +820,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
* is bad because it does not allow new slice to start.
*/
- throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
+ throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
time_elapsed = jiffies - tg->slice_start[rw];
- nr_slices = time_elapsed / throtl_slice;
+ nr_slices = time_elapsed / tg->td->throtl_slice;
if (!nr_slices)
return;
- tmp = tg->bps[rw] * throtl_slice * nr_slices;
+ tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
do_div(tmp, HZ);
bytes_trim = tmp;
- io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
+ io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
+ HZ;
if (!bytes_trim && !io_trim)
return;
@@ -651,7 +848,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
else
tg->io_disp[rw] = 0;
- tg->slice_start[rw] += nr_slices * throtl_slice;
+ tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
throtl_log(&tg->service_queue,
"[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
@@ -671,9 +868,9 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
/* Slice has just started. Consider one slice interval */
if (!jiffy_elapsed)
- jiffy_elapsed_rnd = throtl_slice;
+ jiffy_elapsed_rnd = tg->td->throtl_slice;
- jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
/*
* jiffy_elapsed_rnd should not be a big value as minimum iops can be
@@ -682,7 +879,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
* have been trimmed.
*/
- tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
+ tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
do_div(tmp, HZ);
if (tmp > UINT_MAX)
@@ -697,7 +894,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
}
/* Calc approx time to dispatch */
- jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
+ jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
if (jiffy_wait > jiffy_elapsed)
jiffy_wait = jiffy_wait - jiffy_elapsed;
@@ -720,11 +917,11 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* Slice has just started. Consider one slice interval */
if (!jiffy_elapsed)
- jiffy_elapsed_rnd = throtl_slice;
+ jiffy_elapsed_rnd = tg->td->throtl_slice;
- jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
- tmp = tg->bps[rw] * jiffy_elapsed_rnd;
+ tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
do_div(tmp, HZ);
bytes_allowed = tmp;
@@ -736,7 +933,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* Calc approx time to dispatch */
extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
- jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
+ jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
if (!jiffy_wait)
jiffy_wait = 1;
@@ -771,7 +968,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
- if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
+ if (tg_bps_limit(tg, rw) == U64_MAX &&
+ tg_iops_limit(tg, rw) == UINT_MAX) {
if (wait)
*wait = 0;
return true;
@@ -787,8 +985,10 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
throtl_start_new_slice(tg, rw);
else {
- if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
- throtl_extend_slice(tg, rw, jiffies + throtl_slice);
+ if (time_before(tg->slice_end[rw],
+ jiffies + tg->td->throtl_slice))
+ throtl_extend_slice(tg, rw,
+ jiffies + tg->td->throtl_slice);
}
if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
@@ -816,6 +1016,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
/* Charge the bio to the group */
tg->bytes_disp[rw] += bio->bi_iter.bi_size;
tg->io_disp[rw]++;
+ tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+ tg->last_io_disp[rw]++;
/*
* BIO_THROTTLED is used to prevent the same bio to be throttled
@@ -999,6 +1201,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
return nr_disp;
}
+static bool throtl_can_upgrade(struct throtl_data *td,
+ struct throtl_grp *this_tg);
/**
* throtl_pending_timer_fn - timer function for service_queue->pending_timer
* @arg: the throtl_service_queue being serviced
@@ -1025,6 +1229,9 @@ static void throtl_pending_timer_fn(unsigned long arg)
int ret;
spin_lock_irq(q->queue_lock);
+ if (throtl_can_upgrade(td, NULL))
+ throtl_upgrade_state(td);
+
again:
parent_sq = sq->parent_sq;
dispatched = false;
@@ -1112,7 +1319,7 @@ static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
struct throtl_grp *tg = pd_to_tg(pd);
u64 v = *(u64 *)((void *)tg + off);
- if (v == -1)
+ if (v == U64_MAX)
return 0;
return __blkg_prfill_u64(sf, pd, v);
}
@@ -1123,7 +1330,7 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
struct throtl_grp *tg = pd_to_tg(pd);
unsigned int v = *(unsigned int *)((void *)tg + off);
- if (v == -1)
+ if (v == UINT_MAX)
return 0;
return __blkg_prfill_u64(sf, pd, v);
}
@@ -1150,8 +1357,8 @@ static void tg_conf_updated(struct throtl_grp *tg)
throtl_log(&tg->service_queue,
"limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
- tg->bps[READ], tg->bps[WRITE],
- tg->iops[READ], tg->iops[WRITE]);
+ tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
+ tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));
/*
* Update has_rules[] flags for the updated tg's subtree. A tg is
@@ -1197,7 +1404,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
if (sscanf(ctx.body, "%llu", &v) != 1)
goto out_finish;
if (!v)
- v = -1;
+ v = U64_MAX;
tg = blkg_to_tg(ctx.blkg);
@@ -1228,25 +1435,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
static struct cftype throtl_legacy_files[] = {
{
.name = "throttle.read_bps_device",
- .private = offsetof(struct throtl_grp, bps[READ]),
+ .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.write_bps_device",
- .private = offsetof(struct throtl_grp, bps[WRITE]),
+ .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.read_iops_device",
- .private = offsetof(struct throtl_grp, iops[READ]),
+ .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
{
.name = "throttle.write_iops_device",
- .private = offsetof(struct throtl_grp, iops[WRITE]),
+ .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
@@ -1263,48 +1470,87 @@ static struct cftype throtl_legacy_files[] = {
{ } /* terminate */
};
-static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
struct throtl_grp *tg = pd_to_tg(pd);
const char *dname = blkg_dev_name(pd->blkg);
char bufs[4][21] = { "max", "max", "max", "max" };
+ u64 bps_dft;
+ unsigned int iops_dft;
+ char idle_time[26] = "";
+ char latency_time[26] = "";
if (!dname)
return 0;
- if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
- tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+
+ if (off == LIMIT_LOW) {
+ bps_dft = 0;
+ iops_dft = 0;
+ } else {
+ bps_dft = U64_MAX;
+ iops_dft = UINT_MAX;
+ }
+
+ if (tg->bps_conf[READ][off] == bps_dft &&
+ tg->bps_conf[WRITE][off] == bps_dft &&
+ tg->iops_conf[READ][off] == iops_dft &&
+ tg->iops_conf[WRITE][off] == iops_dft &&
+ (off != LIMIT_LOW ||
+ (tg->idletime_threshold == tg->td->dft_idletime_threshold &&
+ tg->latency_target == DFL_LATENCY_TARGET)))
return 0;
- if (tg->bps[READ] != -1)
- snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
- if (tg->bps[WRITE] != -1)
- snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
- if (tg->iops[READ] != -1)
- snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
- if (tg->iops[WRITE] != -1)
- snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
-
- seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
- dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+ if (tg->bps_conf[READ][off] != bps_dft)
+ snprintf(bufs[0], sizeof(bufs[0]), "%llu",
+ tg->bps_conf[READ][off]);
+ if (tg->bps_conf[WRITE][off] != bps_dft)
+ snprintf(bufs[1], sizeof(bufs[1]), "%llu",
+ tg->bps_conf[WRITE][off]);
+ if (tg->iops_conf[READ][off] != iops_dft)
+ snprintf(bufs[2], sizeof(bufs[2]), "%u",
+ tg->iops_conf[READ][off]);
+ if (tg->iops_conf[WRITE][off] != iops_dft)
+ snprintf(bufs[3], sizeof(bufs[3]), "%u",
+ tg->iops_conf[WRITE][off]);
+ if (off == LIMIT_LOW) {
+ if (tg->idletime_threshold == ULONG_MAX)
+ strcpy(idle_time, " idle=max");
+ else
+ snprintf(idle_time, sizeof(idle_time), " idle=%lu",
+ tg->idletime_threshold);
+
+ if (tg->latency_target == ULONG_MAX)
+ strcpy(latency_time, " latency=max");
+ else
+ snprintf(latency_time, sizeof(latency_time),
+ " latency=%lu", tg->latency_target);
+ }
+
+ seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
+ dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
+ latency_time);
return 0;
}
-static int tg_print_max(struct seq_file *sf, void *v)
+static int tg_print_limit(struct seq_file *sf, void *v)
{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
&blkcg_policy_throtl, seq_cft(sf)->private, false);
return 0;
}
-static ssize_t tg_set_max(struct kernfs_open_file *of,
+static ssize_t tg_set_limit(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct blkg_conf_ctx ctx;
struct throtl_grp *tg;
u64 v[4];
+ unsigned long idle_time;
+ unsigned long latency_time;
int ret;
+ int index = of_cft(of)->private;
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
if (ret)
@@ -1312,15 +1558,17 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
tg = blkg_to_tg(ctx.blkg);
- v[0] = tg->bps[READ];
- v[1] = tg->bps[WRITE];
- v[2] = tg->iops[READ];
- v[3] = tg->iops[WRITE];
+ v[0] = tg->bps_conf[READ][index];
+ v[1] = tg->bps_conf[WRITE][index];
+ v[2] = tg->iops_conf[READ][index];
+ v[3] = tg->iops_conf[WRITE][index];
+ idle_time = tg->idletime_threshold;
+ latency_time = tg->latency_target;
while (true) {
char tok[27]; /* wiops=18446744073709551616 */
char *p;
- u64 val = -1;
+ u64 val = U64_MAX;
int len;
if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
@@ -1348,15 +1596,43 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
v[2] = min_t(u64, val, UINT_MAX);
else if (!strcmp(tok, "wiops"))
v[3] = min_t(u64, val, UINT_MAX);
+ else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
+ idle_time = val;
+ else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
+ latency_time = val;
else
goto out_finish;
}
- tg->bps[READ] = v[0];
- tg->bps[WRITE] = v[1];
- tg->iops[READ] = v[2];
- tg->iops[WRITE] = v[3];
+ tg->bps_conf[READ][index] = v[0];
+ tg->bps_conf[WRITE][index] = v[1];
+ tg->iops_conf[READ][index] = v[2];
+ tg->iops_conf[WRITE][index] = v[3];
+ if (index == LIMIT_MAX) {
+ tg->bps[READ][index] = v[0];
+ tg->bps[WRITE][index] = v[1];
+ tg->iops[READ][index] = v[2];
+ tg->iops[WRITE][index] = v[3];
+ }
+ tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
+ tg->bps_conf[READ][LIMIT_MAX]);
+ tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
+ tg->bps_conf[WRITE][LIMIT_MAX]);
+ tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
+ tg->iops_conf[READ][LIMIT_MAX]);
+ tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
+ tg->iops_conf[WRITE][LIMIT_MAX]);
+
+ if (index == LIMIT_LOW) {
+ blk_throtl_update_limit_valid(tg->td);
+ if (tg->td->limit_valid[LIMIT_LOW])
+ tg->td->limit_index = LIMIT_LOW;
+ tg->idletime_threshold = (idle_time == ULONG_MAX) ?
+ ULONG_MAX : idle_time;
+ tg->latency_target = (latency_time == ULONG_MAX) ?
+ ULONG_MAX : latency_time;
+ }
tg_conf_updated(tg);
ret = 0;
out_finish:
@@ -1365,11 +1641,21 @@ out_finish:
}
static struct cftype throtl_files[] = {
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ {
+ .name = "low",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = tg_print_limit,
+ .write = tg_set_limit,
+ .private = LIMIT_LOW,
+ },
+#endif
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = tg_print_max,
- .write = tg_set_max,
+ .seq_show = tg_print_limit,
+ .write = tg_set_limit,
+ .private = LIMIT_MAX,
},
{ } /* terminate */
};
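Illustrative only, not part of the patch. With CONFIG_BLK_DEV_THROTTLING_LOW enabled, the "low" cftype above is exposed on the cgroup2 hierarchy (as io.low, alongside the existing io.max), and tg_set_limit() accepts the rbps/wbps/riops/wiops tokens plus the new idle and latency ones. A user-space sketch of configuring it; the cgroup path, the 8:16 device number and all values are made up for the example:

#include <stdio.h>

int main(void)
{
	/* hypothetical cgroup; the 8:16 device number is also made up */
	FILE *f = fopen("/sys/fs/cgroup/test/io.low", "w");

	if (!f)
		return 1;
	/* tokens parsed by tg_set_limit(): rbps/wbps/riops/wiops/idle/latency */
	fprintf(f, "8:16 rbps=2097152 riops=1000 idle=1000 latency=100\n");
	return fclose(f) ? 1 : 0;
}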
@@ -1388,9 +1674,362 @@ static struct blkcg_policy blkcg_policy_throtl = {
.pd_alloc_fn = throtl_pd_alloc,
.pd_init_fn = throtl_pd_init,
.pd_online_fn = throtl_pd_online,
+ .pd_offline_fn = throtl_pd_offline,
.pd_free_fn = throtl_pd_free,
};
+static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+ unsigned long rtime = jiffies, wtime = jiffies;
+
+ if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
+ rtime = tg->last_low_overflow_time[READ];
+ if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+ wtime = tg->last_low_overflow_time[WRITE];
+ return min(rtime, wtime);
+}
+
+/* tg should not be an intermediate node */
+static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+ struct throtl_service_queue *parent_sq;
+ struct throtl_grp *parent = tg;
+ unsigned long ret = __tg_last_low_overflow_time(tg);
+
+ while (true) {
+ parent_sq = parent->service_queue.parent_sq;
+ parent = sq_to_tg(parent_sq);
+ if (!parent)
+ break;
+
+ /*
+		 * If the parent doesn't have a low limit, it always reaches
+		 * its low limit, so its overflow time is useless for its
+		 * children
+ */
+ if (!parent->bps[READ][LIMIT_LOW] &&
+ !parent->iops[READ][LIMIT_LOW] &&
+ !parent->bps[WRITE][LIMIT_LOW] &&
+ !parent->iops[WRITE][LIMIT_LOW])
+ continue;
+ if (time_after(__tg_last_low_overflow_time(parent), ret))
+ ret = __tg_last_low_overflow_time(parent);
+ }
+ return ret;
+}
+
+static bool throtl_tg_is_idle(struct throtl_grp *tg)
+{
+ /*
+	 * A cgroup is considered idle if:
+	 * - a single idle period is too long: longer than a fixed value (in
+	 *   case the user configures too big a threshold) or 4 slices
+	 * - the average think time is above the threshold
+	 * - IO latency is largely below the latency target
+ */
+ unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
+
+ time = min_t(unsigned long, MAX_IDLE_TIME, time);
+ return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+ tg->avg_idletime > tg->idletime_threshold ||
+ (tg->latency_target && tg->bio_cnt &&
+ tg->bad_bio_cnt * 5 < tg->bio_cnt);
+}
+
+static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+{
+ struct throtl_service_queue *sq = &tg->service_queue;
+ bool read_limit, write_limit;
+
+ /*
+	 * If the cgroup reaches its low limit (a low limit of 0 is always
+	 * reached), it's ok to upgrade to the next limit
+ */
+ read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
+ write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
+ if (!read_limit && !write_limit)
+ return true;
+ if (read_limit && sq->nr_queued[READ] &&
+ (!write_limit || sq->nr_queued[WRITE]))
+ return true;
+ if (write_limit && sq->nr_queued[WRITE] &&
+ (!read_limit || sq->nr_queued[READ]))
+ return true;
+
+ if (time_after_eq(jiffies,
+ tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
+ throtl_tg_is_idle(tg))
+ return true;
+ return false;
+}
+
+static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
+{
+ while (true) {
+ if (throtl_tg_can_upgrade(tg))
+ return true;
+ tg = sq_to_tg(tg->service_queue.parent_sq);
+ if (!tg || !tg_to_blkg(tg)->parent)
+ return false;
+ }
+ return false;
+}
+
+static bool throtl_can_upgrade(struct throtl_data *td,
+ struct throtl_grp *this_tg)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ if (td->limit_index != LIMIT_LOW)
+ return false;
+
+ if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
+ return false;
+
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ if (tg == this_tg)
+ continue;
+ if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+ continue;
+ if (!throtl_hierarchy_can_upgrade(tg)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+ return true;
+}
+
+static void throtl_upgrade_check(struct throtl_grp *tg)
+{
+ unsigned long now = jiffies;
+
+ if (tg->td->limit_index != LIMIT_LOW)
+ return;
+
+ if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
+ return;
+
+ tg->last_check_time = now;
+
+ if (!time_after_eq(now,
+ __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
+ return;
+
+ if (throtl_can_upgrade(tg->td, NULL))
+ throtl_upgrade_state(tg->td);
+}
+
+static void throtl_upgrade_state(struct throtl_data *td)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ td->limit_index = LIMIT_MAX;
+ td->low_upgrade_time = jiffies;
+ td->scale = 0;
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ tg->disptime = jiffies - 1;
+ throtl_select_dispatch(sq);
+ throtl_schedule_next_dispatch(sq, false);
+ }
+ rcu_read_unlock();
+ throtl_select_dispatch(&td->service_queue);
+ throtl_schedule_next_dispatch(&td->service_queue, false);
+ queue_work(kthrotld_workqueue, &td->dispatch_work);
+}
+
+static void throtl_downgrade_state(struct throtl_data *td, int new)
+{
+ td->scale /= 2;
+
+ if (td->scale) {
+ td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
+ return;
+ }
+
+ td->limit_index = new;
+ td->low_downgrade_time = jiffies;
+}
+
+static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
+{
+ struct throtl_data *td = tg->td;
+ unsigned long now = jiffies;
+
+ /*
+ * If cgroup is below low limit, consider downgrade and throttle other
+ * cgroups
+ */
+ if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
+ time_after_eq(now, tg_last_low_overflow_time(tg) +
+ td->throtl_slice) &&
+ (!throtl_tg_is_idle(tg) ||
+ !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
+ return true;
+ return false;
+}
+
+static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
+{
+ while (true) {
+ if (!throtl_tg_can_downgrade(tg))
+ return false;
+ tg = sq_to_tg(tg->service_queue.parent_sq);
+ if (!tg || !tg_to_blkg(tg)->parent)
+ break;
+ }
+ return true;
+}
+
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+ uint64_t bps;
+ unsigned int iops;
+ unsigned long elapsed_time;
+ unsigned long now = jiffies;
+
+ if (tg->td->limit_index != LIMIT_MAX ||
+ !tg->td->limit_valid[LIMIT_LOW])
+ return;
+ if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+ return;
+ if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
+ return;
+
+ elapsed_time = now - tg->last_check_time;
+ tg->last_check_time = now;
+
+ if (time_before(now, tg_last_low_overflow_time(tg) +
+ tg->td->throtl_slice))
+ return;
+
+ if (tg->bps[READ][LIMIT_LOW]) {
+ bps = tg->last_bytes_disp[READ] * HZ;
+ do_div(bps, elapsed_time);
+ if (bps >= tg->bps[READ][LIMIT_LOW])
+ tg->last_low_overflow_time[READ] = now;
+ }
+
+ if (tg->bps[WRITE][LIMIT_LOW]) {
+ bps = tg->last_bytes_disp[WRITE] * HZ;
+ do_div(bps, elapsed_time);
+ if (bps >= tg->bps[WRITE][LIMIT_LOW])
+ tg->last_low_overflow_time[WRITE] = now;
+ }
+
+ if (tg->iops[READ][LIMIT_LOW]) {
+ iops = tg->last_io_disp[READ] * HZ / elapsed_time;
+ if (iops >= tg->iops[READ][LIMIT_LOW])
+ tg->last_low_overflow_time[READ] = now;
+ }
+
+ if (tg->iops[WRITE][LIMIT_LOW]) {
+ iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
+ if (iops >= tg->iops[WRITE][LIMIT_LOW])
+ tg->last_low_overflow_time[WRITE] = now;
+ }
+
+ /*
+ * If cgroup is below low limit, consider downgrade and throttle other
+ * cgroups
+ */
+ if (throtl_hierarchy_can_downgrade(tg))
+ throtl_downgrade_state(tg->td, LIMIT_LOW);
+
+ tg->last_bytes_disp[READ] = 0;
+ tg->last_bytes_disp[WRITE] = 0;
+ tg->last_io_disp[READ] = 0;
+ tg->last_io_disp[WRITE] = 0;
+}
+
+static void blk_throtl_update_idletime(struct throtl_grp *tg)
+{
+ unsigned long now = ktime_get_ns() >> 10;
+ unsigned long last_finish_time = tg->last_finish_time;
+
+ if (now <= last_finish_time || last_finish_time == 0 ||
+ last_finish_time == tg->checked_last_finish_time)
+ return;
+
+ tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
+ tg->checked_last_finish_time = last_finish_time;
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_update_latency_buckets(struct throtl_data *td)
+{
+ struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+ int i, cpu;
+ unsigned long last_latency = 0;
+ unsigned long latency;
+
+ if (!blk_queue_nonrot(td->queue))
+ return;
+ if (time_before(jiffies, td->last_calculate_time + HZ))
+ return;
+ td->last_calculate_time = jiffies;
+
+ memset(avg_latency, 0, sizeof(avg_latency));
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+ for_each_possible_cpu(cpu) {
+ struct latency_bucket *bucket;
+
+ /* this isn't race free, but ok in practice */
+ bucket = per_cpu_ptr(td->latency_buckets, cpu);
+ tmp->total_latency += bucket[i].total_latency;
+ tmp->samples += bucket[i].samples;
+ bucket[i].total_latency = 0;
+ bucket[i].samples = 0;
+ }
+
+ if (tmp->samples >= 32) {
+ int samples = tmp->samples;
+
+ latency = tmp->total_latency;
+
+ tmp->total_latency = 0;
+ tmp->samples = 0;
+ latency /= samples;
+ if (latency == 0)
+ continue;
+ avg_latency[i].latency = latency;
+ }
+ }
+
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ if (!avg_latency[i].latency) {
+ if (td->avg_buckets[i].latency < last_latency)
+ td->avg_buckets[i].latency = last_latency;
+ continue;
+ }
+
+ if (!td->avg_buckets[i].valid)
+ latency = avg_latency[i].latency;
+ else
+ latency = (td->avg_buckets[i].latency * 7 +
+ avg_latency[i].latency) >> 3;
+
+ td->avg_buckets[i].latency = max(latency, last_latency);
+ td->avg_buckets[i].valid = true;
+ last_latency = td->avg_buckets[i].latency;
+ }
+}
+#else
+static inline void throtl_update_latency_buckets(struct throtl_data *td)
+{
+}
+#endif
+
bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct bio *bio)
{
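Illustrative only, not part of the patch. blk_throtl_update_idletime() above maintains tg->avg_idletime, the "average think time" consulted by throtl_tg_is_idle(), as an exponentially weighted moving average that keeps 7/8 of the history on each new sample. A standalone sketch of that update; the sample gaps are invented and are roughly microseconds, since the timestamps are ns >> 10:

#include <stdio.h>

int main(void)
{
	/* gaps between IO completion and the next submission, ~usec */
	unsigned long samples[] = { 800, 1200, 50, 2000, 10 };
	unsigned long avg = 0;
	int i;

	for (i = 0; i < 5; i++) {
		/* same update as blk_throtl_update_idletime() */
		avg = (avg * 7 + samples[i]) >> 3;
		printf("sample=%lu avg_idletime=%lu\n", samples[i], avg);
	}
	return 0;
}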
@@ -1399,6 +2038,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct throtl_service_queue *sq;
bool rw = bio_data_dir(bio);
bool throttled = false;
+ struct throtl_data *td = tg->td;
+ int ret;
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -1408,19 +2049,40 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
spin_lock_irq(q->queue_lock);
+ throtl_update_latency_buckets(td);
+
if (unlikely(blk_queue_bypass(q)))
goto out_unlock;
+ ret = bio_associate_current(bio);
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ if (ret == 0 || ret == -EBUSY)
+ bio->bi_cg_private = tg;
+ blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+#endif
+ blk_throtl_update_idletime(tg);
+
sq = &tg->service_queue;
+again:
while (true) {
+ if (tg->last_low_overflow_time[rw] == 0)
+ tg->last_low_overflow_time[rw] = jiffies;
+ throtl_downgrade_check(tg);
+ throtl_upgrade_check(tg);
/* throtl is FIFO - if bios are already queued, should queue */
if (sq->nr_queued[rw])
break;
/* if above limits, break to queue */
- if (!tg_may_dispatch(tg, bio, NULL))
+ if (!tg_may_dispatch(tg, bio, NULL)) {
+ tg->last_low_overflow_time[rw] = jiffies;
+ if (throtl_can_upgrade(td, tg)) {
+ throtl_upgrade_state(td);
+ goto again;
+ }
break;
+ }
/* within limits, let's charge and dispatch directly */
throtl_charge_bio(tg, bio);
@@ -1453,12 +2115,14 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
/* out-of-limit, queue to @tg */
throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
rw == READ ? 'R' : 'W',
- tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],
- tg->io_disp[rw], tg->iops[rw],
+ tg->bytes_disp[rw], bio->bi_iter.bi_size,
+ tg_bps_limit(tg, rw),
+ tg->io_disp[rw], tg_iops_limit(tg, rw),
sq->nr_queued[READ], sq->nr_queued[WRITE]);
- bio_associate_current(bio);
- tg->td->nr_queued[rw]++;
+ tg->last_low_overflow_time[rw] = jiffies;
+
+ td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
throttled = true;
@@ -1483,9 +2147,94 @@ out:
*/
if (!throttled)
bio_clear_flag(bio, BIO_THROTTLED);
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ if (throttled || !td->track_bio_latency)
+ bio->bi_issue_stat.stat |= SKIP_LATENCY;
+#endif
return throttled;
}
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_track_latency(struct throtl_data *td, sector_t size,
+ int op, unsigned long time)
+{
+ struct latency_bucket *latency;
+ int index;
+
+ if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+ !blk_queue_nonrot(td->queue))
+ return;
+
+ index = request_bucket_index(size);
+
+ latency = get_cpu_ptr(td->latency_buckets);
+ latency[index].total_latency += time;
+ latency[index].samples++;
+ put_cpu_ptr(td->latency_buckets);
+}
+
+void blk_throtl_stat_add(struct request *rq, u64 time_ns)
+{
+ struct request_queue *q = rq->q;
+ struct throtl_data *td = q->td;
+
+ throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
+ req_op(rq), time_ns >> 10);
+}
+
+void blk_throtl_bio_endio(struct bio *bio)
+{
+ struct throtl_grp *tg;
+ u64 finish_time_ns;
+ unsigned long finish_time;
+ unsigned long start_time;
+ unsigned long lat;
+
+ tg = bio->bi_cg_private;
+ if (!tg)
+ return;
+ bio->bi_cg_private = NULL;
+
+ finish_time_ns = ktime_get_ns();
+ tg->last_finish_time = finish_time_ns >> 10;
+
+ start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
+ finish_time = __blk_stat_time(finish_time_ns) >> 10;
+ if (!start_time || finish_time <= start_time)
+ return;
+
+ lat = finish_time - start_time;
+	/* this is only for bio-based drivers */
+ if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
+ throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
+ bio_op(bio), lat);
+
+ if (tg->latency_target) {
+ int bucket;
+ unsigned int threshold;
+
+ bucket = request_bucket_index(
+ blk_stat_size(&bio->bi_issue_stat));
+ threshold = tg->td->avg_buckets[bucket].latency +
+ tg->latency_target;
+ if (lat > threshold)
+ tg->bad_bio_cnt++;
+ /*
+		 * Not race free: the count could be wrong, which at worst
+		 * means the cgroup gets throttled
+ */
+ tg->bio_cnt++;
+ }
+
+ if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
+ tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
+ tg->bio_cnt /= 2;
+ tg->bad_bio_cnt /= 2;
+ }
+}
+#endif
+
/*
* Dispatch all bios from all children tg's queued on @parent_sq. On
* return, @parent_sq is guaranteed to not have any active children tg's
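Illustrative only, not part of the patch. throtl_track_latency() and blk_throtl_bio_endio() above file each latency sample into one of the LATENCY_BUCKET_SIZE buckets chosen by request_bucket_index(), i.e. order_base_2(sectors) - 3 clamped to [0, 8], so everything up to 4K lands in bucket 0 and everything from 1M up lands in bucket 8. A standalone sketch of that mapping:

#include <stdio.h>

/* same mapping as request_bucket_index(): order_base_2(sectors) - 3,
 * clamped to [0, 8] */
static int bucket_index(unsigned long sectors)
{
	int b = 0;

	while (b < 8 && (1UL << (b + 3)) < sectors)
		b++;
	return b;
}

int main(void)
{
	unsigned long kb[] = { 4, 8, 64, 256, 1024, 4096 };
	int i;

	/* 512-byte sectors, so size in kB maps to kb * 2 sectors */
	for (i = 0; i < 6; i++)
		printf("%lukB -> bucket %d\n", kb[i], bucket_index(kb[i] * 2));
	return 0;
}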
@@ -1558,6 +2307,12 @@ int blk_throtl_init(struct request_queue *q)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
if (!td)
return -ENOMEM;
+ td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+ LATENCY_BUCKET_SIZE, __alignof__(u64));
+ if (!td->latency_buckets) {
+ kfree(td);
+ return -ENOMEM;
+ }
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
throtl_service_queue_init(&td->service_queue);
@@ -1565,10 +2320,17 @@ int blk_throtl_init(struct request_queue *q)
q->td = td;
td->queue = q;
+ td->limit_valid[LIMIT_MAX] = true;
+ td->limit_index = LIMIT_MAX;
+ td->low_upgrade_time = jiffies;
+ td->low_downgrade_time = jiffies;
+
/* activate policy */
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
- if (ret)
+ if (ret) {
+ free_percpu(td->latency_buckets);
kfree(td);
+ }
return ret;
}
@@ -1577,9 +2339,74 @@ void blk_throtl_exit(struct request_queue *q)
BUG_ON(!q->td);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+ free_percpu(q->td->latency_buckets);
kfree(q->td);
}
+void blk_throtl_register_queue(struct request_queue *q)
+{
+ struct throtl_data *td;
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ td = q->td;
+ BUG_ON(!td);
+
+ if (blk_queue_nonrot(q)) {
+ td->throtl_slice = DFL_THROTL_SLICE_SSD;
+ td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
+ } else {
+ td->throtl_slice = DFL_THROTL_SLICE_HD;
+ td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD;
+ }
+#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
+ /* if no low limit, use previous default */
+ td->throtl_slice = DFL_THROTL_SLICE_HD;
+#endif
+
+ td->track_bio_latency = !q->mq_ops && !q->request_fn;
+ if (!td->track_bio_latency)
+ blk_stat_enable_accounting(q);
+
+ /*
+	 * Some tgs are created before the queue is fully initialized, e.g.
+	 * before the nonrot flag is set
+ */
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ tg->idletime_threshold = td->dft_idletime_threshold;
+ }
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
+{
+ if (!q->td)
+ return -EINVAL;
+ return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
+}
+
+ssize_t blk_throtl_sample_time_store(struct request_queue *q,
+ const char *page, size_t count)
+{
+ unsigned long v;
+ unsigned long t;
+
+ if (!q->td)
+ return -EINVAL;
+ if (kstrtoul(page, 10, &v))
+ return -EINVAL;
+ t = msecs_to_jiffies(v);
+ if (t == 0 || t > MAX_THROTL_SLICE)
+ return -EINVAL;
+ q->td->throtl_slice = t;
+ return count;
+}
+#endif
+
static int __init throtl_init(void)
{
kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
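Illustrative only, not part of the patch. blk_throtl_sample_time_show()/_store() above back the throttle_sample_time queue attribute added in blk-sysfs.c earlier in this patch (only present with CONFIG_BLK_DEV_THROTTLING_LOW): reads report the slice in milliseconds, and writes are rejected unless they convert to a non-zero number of jiffies no larger than MAX_THROTL_SLICE (one second). A user-space sketch; the sdb device name is an assumption:

#include <stdio.h>

int main(void)
{
	/* hypothetical device; any blk-throttle capable queue works */
	const char *path = "/sys/block/sdb/queue/throttle_sample_time";
	unsigned int ms;
	FILE *f = fopen(path, "r");

	if (!f)
		return 1;
	if (fscanf(f, "%u", &ms) == 1)
		printf("current sample time: %u ms\n", ms);
	fclose(f);

	f = fopen(path, "w");		/* shrink the window to 50 ms */
	if (!f)
		return 1;
	fprintf(f, "50\n");
	return fclose(f) ? 1 : 0;
}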
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 1aedb1f7ee0c..ffa80e11cf14 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -255,8 +255,8 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
* that it's writes impacting us, and not just some sole read on
* a device that is in a lower power state.
*/
- return stat[BLK_STAT_READ].nr_samples >= 1 &&
- stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES;
+ return (stat[READ].nr_samples >= 1 &&
+ stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}
static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
@@ -277,7 +277,7 @@ enum {
LAT_EXCEEDED,
};
-static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
u64 thislat;
@@ -293,7 +293,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
*/
thislat = rwb_sync_issue_lat(rwb);
if (thislat > rwb->cur_win_nsec ||
- (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) {
+ (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
trace_wbt_lat(bdi, thislat);
return LAT_EXCEEDED;
}
@@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
* waited or still has writes in flights, consider us doing
* just writes as well.
*/
- if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) ||
- wb_recent_wait(rwb) || wbt_inflight(rwb))
+ if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
+ wbt_inflight(rwb))
return LAT_UNKNOWN_WRITES;
return LAT_UNKNOWN;
}
@@ -317,8 +317,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
/*
* If the 'min' latency exceeds our target, step down.
*/
- if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) {
- trace_wbt_lat(bdi, stat[BLK_STAT_READ].min);
+ if (stat[READ].min > rwb->min_lat_nsec) {
+ trace_wbt_lat(bdi, stat[READ].min);
trace_wbt_stat(bdi, stat);
return LAT_EXCEEDED;
}
@@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
return LAT_OK;
}
-static int latency_exceeded(struct rq_wb *rwb)
-{
- struct blk_rq_stat stat[2];
-
- blk_queue_stat_get(rwb->queue, stat);
- return __latency_exceeded(rwb, stat);
-}
-
static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
@@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb)
rwb->scale_step--;
rwb->unknown_cnt = 0;
- blk_stat_clear(rwb->queue);
rwb->scaled_max = calc_wb_limits(rwb);
@@ -385,15 +376,12 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle)
rwb->scaled_max = false;
rwb->unknown_cnt = 0;
- blk_stat_clear(rwb->queue);
calc_wb_limits(rwb);
rwb_trace_step(rwb, "step down");
}
static void rwb_arm_timer(struct rq_wb *rwb)
{
- unsigned long expires;
-
if (rwb->scale_step > 0) {
/*
* We should speed this up, using some variant of a fast
@@ -411,17 +399,16 @@ static void rwb_arm_timer(struct rq_wb *rwb)
rwb->cur_win_nsec = rwb->win_nsec;
}
- expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
- mod_timer(&rwb->window_timer, expires);
+ blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}
-static void wb_timer_fn(unsigned long data)
+static void wb_timer_fn(struct blk_stat_callback *cb)
{
- struct rq_wb *rwb = (struct rq_wb *) data;
+ struct rq_wb *rwb = cb->data;
unsigned int inflight = wbt_inflight(rwb);
int status;
- status = latency_exceeded(rwb);
+ status = latency_exceeded(rwb, cb->stat);
trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
inflight);
@@ -614,7 +601,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
__wbt_wait(rwb, bio->bi_opf, lock);
- if (!timer_pending(&rwb->window_timer))
+ if (!blk_stat_is_active(rwb->cb))
rwb_arm_timer(rwb);
if (current_is_kswapd())
@@ -675,7 +662,7 @@ void wbt_disable_default(struct request_queue *q)
struct rq_wb *rwb = q->rq_wb;
if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) {
- del_timer_sync(&rwb->window_timer);
+ blk_stat_remove_callback(q, rwb->cb);
rwb->win_nsec = rwb->min_lat_nsec = 0;
wbt_update_limits(rwb);
}
@@ -699,24 +686,23 @@ int wbt_init(struct request_queue *q)
struct rq_wb *rwb;
int i;
- /*
- * For now, we depend on the stats window being larger than
- * our monitoring window. Ensure that this isn't inadvertently
- * violated.
- */
- BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
if (!rwb)
return -ENOMEM;
+ rwb->cb = blk_stat_alloc_callback(wb_timer_fn, blk_stat_rq_ddir, 2, rwb);
+ if (!rwb->cb) {
+ kfree(rwb);
+ return -ENOMEM;
+ }
+
for (i = 0; i < WBT_NUM_RWQ; i++) {
atomic_set(&rwb->rq_wait[i].inflight, 0);
init_waitqueue_head(&rwb->rq_wait[i].wait);
}
- setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
rwb->wc = 1;
rwb->queue_depth = RWB_DEF_DEPTH;
rwb->last_comp = rwb->last_issue = jiffies;
@@ -726,10 +712,10 @@ int wbt_init(struct request_queue *q)
wbt_update_limits(rwb);
/*
- * Assign rwb, and turn on stats tracking for this queue
+ * Assign rwb and add the stats callback.
*/
q->rq_wb = rwb;
- blk_stat_enable(q);
+ blk_stat_add_callback(q, rwb->cb);
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
@@ -744,7 +730,8 @@ void wbt_exit(struct request_queue *q)
struct rq_wb *rwb = q->rq_wb;
if (rwb) {
- del_timer_sync(&rwb->window_timer);
+ blk_stat_remove_callback(q, rwb->cb);
+ blk_stat_free_callback(rwb->cb);
q->rq_wb = NULL;
kfree(rwb);
}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 65f1de519f67..ad6c78507c3a 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -32,27 +32,27 @@ enum {
static inline void wbt_clear_state(struct blk_issue_stat *stat)
{
- stat->time &= BLK_STAT_TIME_MASK;
+ stat->stat &= ~BLK_STAT_RES_MASK;
}
static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
{
- return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT;
+ return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT;
}
static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
{
- stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT;
+ stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT;
}
static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
{
- return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED;
+ return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED;
}
static inline bool wbt_is_read(struct blk_issue_stat *stat)
{
- return (stat->time >> BLK_STAT_SHIFT) & WBT_READ;
+ return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ;
}
struct rq_wait {
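Illustrative only, not part of the patch. The helpers above now keep the wbt tracking flags in the reserved top bits of blk_issue_stat::stat, above BLK_STAT_RES_SHIFT, rather than in the old time field. A standalone sketch of the same pack/extract/clear pattern; the shift and the flag values are assumptions for the demo, not the real BLK_STAT_RES_SHIFT or wbt_flags definitions:

#include <stdio.h>
#include <stdint.h>

/* assumed layout: low bits hold the issue time, top bits hold flags */
#define RES_SHIFT	59
#define RES_MASK	(~((1ULL << RES_SHIFT) - 1))

enum { WBT_TRACKED = 1, WBT_READ = 2 };	/* values assumed for the demo */

int main(void)
{
	uint64_t stat = 123456789ULL;		/* pretend issue timestamp */

	/* wbt_track(): set the flag bits above the time */
	stat |= (uint64_t)(WBT_TRACKED | WBT_READ) << RES_SHIFT;
	/* wbt_stat_to_mask(): extract the flags again */
	printf("flags=%llu\n",
	       (unsigned long long)((stat & RES_MASK) >> RES_SHIFT));
	/* wbt_clear_state(): drop the flags, keep the time */
	stat &= ~RES_MASK;
	printf("time back to %llu\n", (unsigned long long)stat);
	return 0;
}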
@@ -81,7 +81,7 @@ struct rq_wb {
u64 win_nsec; /* default window size */
u64 cur_win_nsec; /* current window size */
- struct timer_list window_timer;
+ struct blk_stat_callback *cb;
s64 sync_issue;
void *sync_cookie;
diff --git a/block/blk.h b/block/blk.h
index d1ea4bd9b9a3..07d375183f31 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -319,10 +319,22 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
extern void blk_throtl_drain(struct request_queue *q);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
+extern void blk_throtl_register_queue(struct request_queue *q);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline void blk_throtl_drain(struct request_queue *q) { }
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void blk_throtl_register_queue(struct request_queue *q) { }
#endif /* CONFIG_BLK_DEV_THROTTLING */
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
+extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
+ const char *page, size_t count);
+extern void blk_throtl_bio_endio(struct bio *bio);
+extern void blk_throtl_stat_add(struct request *rq, u64 time);
+#else
+static inline void blk_throtl_bio_endio(struct bio *bio) { }
+static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
+#endif
#endif /* BLK_INTERNAL_H */
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 440b95ee593c..da69b079725f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3761,16 +3761,14 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
+static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
struct cfq_data *cfqd = cic_to_cfqd(cic);
struct cfq_queue *cfqq;
uint64_t serial_nr;
- bool nonroot_cg;
rcu_read_lock();
serial_nr = bio_blkcg(bio)->css.serial_nr;
- nonroot_cg = bio_blkcg(bio) != &blkcg_root;
rcu_read_unlock();
/*
@@ -3778,7 +3776,7 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
* spuriously on a newly created cic but there's no harm.
*/
if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
- return nonroot_cg;
+ return;
/*
* Drop reference to queues. New queues will be assigned in new
@@ -3799,12 +3797,10 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
}
cic->blkcg_serial_nr = serial_nr;
- return nonroot_cg;
}
#else
-static inline bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
+static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
- return false;
}
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -4449,12 +4445,11 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
const int rw = rq_data_dir(rq);
const bool is_sync = rq_is_sync(rq);
struct cfq_queue *cfqq;
- bool disable_wbt;
spin_lock_irq(q->queue_lock);
check_ioprio_changed(cic, bio);
- disable_wbt = check_blkcg_changed(cic, bio);
+ check_blkcg_changed(cic, bio);
new_queue:
cfqq = cic_to_cfqq(cic, is_sync);
if (!cfqq || cfqq == &cfqd->oom_cfqq) {
@@ -4491,9 +4486,6 @@ new_queue:
rq->elv.priv[1] = cfqq->cfqg;
spin_unlock_irq(q->queue_lock);
- if (disable_wbt)
- wbt_disable_default(q);
-
return 0;
}
@@ -4706,6 +4698,7 @@ static void cfq_registered_queue(struct request_queue *q)
*/
if (blk_queue_nonrot(q))
cfqd->cfq_slice_idle = 0;
+ wbt_disable_default(q);
}
/*
diff --git a/block/genhd.c b/block/genhd.c
index a9c516a8b37d..510aac1486cb 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1352,7 +1352,7 @@ struct kobject *get_disk(struct gendisk *disk)
owner = disk->fops->owner;
if (owner && !try_module_get(owner))
return NULL;
- kobj = kobject_get(&disk_to_dev(disk)->kobj);
+ kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj);
if (kobj == NULL) {
module_put(owner);
return NULL;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 2a2fc768b27a..82a43bb19967 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -362,7 +362,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
goto out_free_cdb;
bio = rq->bio;
- rq->retries = 0;
+ req->retries = 0;
start_time = jiffies;
@@ -476,13 +476,13 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
goto error;
/* default. possible overriden later */
- rq->retries = 5;
+ req->retries = 5;
switch (opcode) {
case SEND_DIAGNOSTIC:
case FORMAT_UNIT:
rq->timeout = FORMAT_UNIT_TIMEOUT;
- rq->retries = 1;
+ req->retries = 1;
break;
case START_STOP:
rq->timeout = START_STOP_TIMEOUT;
@@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
break;
case READ_DEFECT_DATA:
rq->timeout = READ_DEFECT_DATA_TIMEOUT;
- rq->retries = 1;
+ req->retries = 1;
break;
default:
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 14035f826b5e..6736c7873d4a 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1831,7 +1831,7 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev)
/* 0x08 is Manufacured Inactive */
/* 0x09 is Manufactured */
if (lc_status != OPAL_MANUFACTURED_INACTIVE) {
- pr_err("Couldn't determine the status of the Lifcycle state\n");
+ pr_err("Couldn't determine the status of the Lifecycle state\n");
return -ENODEV;
}
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 2c97912335a9..680c6d636298 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
return t10_pi_verify(iter, t10_pi_ip_fn, 3);
}
-struct blk_integrity_profile t10_pi_type1_crc = {
+const struct blk_integrity_profile t10_pi_type1_crc = {
.name = "T10-DIF-TYPE1-CRC",
.generate_fn = t10_pi_type1_generate_crc,
.verify_fn = t10_pi_type1_verify_crc,
};
EXPORT_SYMBOL(t10_pi_type1_crc);
-struct blk_integrity_profile t10_pi_type1_ip = {
+const struct blk_integrity_profile t10_pi_type1_ip = {
.name = "T10-DIF-TYPE1-IP",
.generate_fn = t10_pi_type1_generate_ip,
.verify_fn = t10_pi_type1_verify_ip,
};
EXPORT_SYMBOL(t10_pi_type1_ip);
-struct blk_integrity_profile t10_pi_type3_crc = {
+const struct blk_integrity_profile t10_pi_type3_crc = {
.name = "T10-DIF-TYPE3-CRC",
.generate_fn = t10_pi_type3_generate_crc,
.verify_fn = t10_pi_type3_verify_crc,
};
EXPORT_SYMBOL(t10_pi_type3_crc);
-struct blk_integrity_profile t10_pi_type3_ip = {
+const struct blk_integrity_profile t10_pi_type3_ip = {
.name = "T10-DIF-TYPE3-IP",
.generate_fn = t10_pi_type3_generate_ip,
.verify_fn = t10_pi_type3_verify_ip,
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index f744de7a0f9b..a1c2e816128f 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -514,18 +514,6 @@ config VIRTIO_BLK_SCSI
virtio protocol and not enabled by default by any hypervisor.
Your probably want to virtio-scsi instead.
-config BLK_DEV_HD
- bool "Very old hard disk (MFM/RLL/IDE) driver"
- depends on HAVE_IDE
- depends on !ARM || ARCH_RPC || BROKEN
- help
- This is a very old hard disk driver that lacks the enhanced
- functionality of the newer ones.
-
- It is required for systems with ancient MFM/RLL/ESDI drives.
-
- If unsure, say N.
-
config BLK_DEV_RBD
tristate "Rados block device (RBD)"
depends on INET && BLOCK
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1e9661e26f29..b12c772bbeb3 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
-obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 45b4384f650c..ce102ec47ef2 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4207,9 +4207,7 @@ static int __init do_floppy_init(void)
disks[drive]->fops = &floppy_fops;
sprintf(disks[drive]->disk_name, "fd%d", drive);
- init_timer(&motor_off_timer[drive]);
- motor_off_timer[drive].data = drive;
- motor_off_timer[drive].function = motor_off_callback;
+ setup_timer(&motor_off_timer[drive], motor_off_callback, drive);
}
err = register_blkdev(FLOPPY_MAJOR, "fd");
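
The floppy.c hunk (and the matching mg_disk.c hunk further down) folds the open-coded timer setup into setup_timer(), which on this kernel is shorthand for the removed three-line sequence:

	/* setup_timer(timer, fn, data) is equivalent to: */
	init_timer(timer);
	timer->function = fn;
	timer->data = data;
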
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
deleted file mode 100644
index 6043648da1e8..000000000000
--- a/drivers/block/hd.c
+++ /dev/null
@@ -1,803 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This is the low-level hd interrupt support. It traverses the
- * request-list, using interrupts to jump between functions. As
- * all the functions are called within interrupts, we may not
- * sleep. Special care is recommended.
- *
- * modified by Drew Eckhardt to check nr of hd's from the CMOS.
- *
- * Thanks to Branko Lankester, [email protected], who found a bug
- * in the early extended-partition checks and added DM partitions
- *
- * IRQ-unmask, drive-id, multiple-mode, support for ">16 heads",
- * and general streamlining by Mark Lord.
- *
- * Removed 99% of above. Use Mark's ide driver for those options.
- * This is now a lightweight ST-506 driver. (Paul Gortmaker)
- *
- * Modified 1995 Russell King for ARM processor.
- *
- * Bugfix: max_sectors must be <= 255 or the wheels tend to come
- * off in a hurry once you queue things up - Paul G. 02/2001
- */
-
-/* Uncomment the following if you want verbose error reports. */
-/* #define VERBOSE_ERRORS */
-
-#include <linux/blkdev.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/interrupt.h>
-#include <linux/timer.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/genhd.h>
-#include <linux/string.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/blkpg.h>
-#include <linux/ata.h>
-#include <linux/hdreg.h>
-
-#define HD_IRQ 14
-
-#define REALLY_SLOW_IO
-#include <asm/io.h>
-#include <linux/uaccess.h>
-
-#ifdef __arm__
-#undef HD_IRQ
-#endif
-#include <asm/irq.h>
-#ifdef __arm__
-#define HD_IRQ IRQ_HARDDISK
-#endif
-
-/* Hd controller regster ports */
-
-#define HD_DATA 0x1f0 /* _CTL when writing */
-#define HD_ERROR 0x1f1 /* see err-bits */
-#define HD_NSECTOR 0x1f2 /* nr of sectors to read/write */
-#define HD_SECTOR 0x1f3 /* starting sector */
-#define HD_LCYL 0x1f4 /* starting cylinder */
-#define HD_HCYL 0x1f5 /* high byte of starting cyl */
-#define HD_CURRENT 0x1f6 /* 101dhhhh , d=drive, hhhh=head */
-#define HD_STATUS 0x1f7 /* see status-bits */
-#define HD_FEATURE HD_ERROR /* same io address, read=error, write=feature */
-#define HD_PRECOMP HD_FEATURE /* obsolete use of this port - predates IDE */
-#define HD_COMMAND HD_STATUS /* same io address, read=status, write=cmd */
-
-#define HD_CMD 0x3f6 /* used for resets */
-#define HD_ALTSTATUS 0x3f6 /* same as HD_STATUS but doesn't clear irq */
-
-/* Bits of HD_STATUS */
-#define ERR_STAT 0x01
-#define INDEX_STAT 0x02
-#define ECC_STAT 0x04 /* Corrected error */
-#define DRQ_STAT 0x08
-#define SEEK_STAT 0x10
-#define SERVICE_STAT SEEK_STAT
-#define WRERR_STAT 0x20
-#define READY_STAT 0x40
-#define BUSY_STAT 0x80
-
-/* Bits for HD_ERROR */
-#define MARK_ERR 0x01 /* Bad address mark */
-#define TRK0_ERR 0x02 /* couldn't find track 0 */
-#define ABRT_ERR 0x04 /* Command aborted */
-#define MCR_ERR 0x08 /* media change request */
-#define ID_ERR 0x10 /* ID field not found */
-#define MC_ERR 0x20 /* media changed */
-#define ECC_ERR 0x40 /* Uncorrectable ECC error */
-#define BBD_ERR 0x80 /* pre-EIDE meaning: block marked bad */
-#define ICRC_ERR 0x80 /* new meaning: CRC error during transfer */
-
-static DEFINE_SPINLOCK(hd_lock);
-static struct request_queue *hd_queue;
-static struct request *hd_req;
-
-#define TIMEOUT_VALUE (6*HZ)
-#define HD_DELAY 0
-
-#define MAX_ERRORS 16 /* Max read/write errors/sector */
-#define RESET_FREQ 8 /* Reset controller every 8th retry */
-#define RECAL_FREQ 4 /* Recalibrate every 4th retry */
-#define MAX_HD 2
-
-#define STAT_OK (READY_STAT|SEEK_STAT)
-#define OK_STATUS(s) (((s)&(STAT_OK|(BUSY_STAT|WRERR_STAT|ERR_STAT)))==STAT_OK)
-
-static void recal_intr(void);
-static void bad_rw_intr(void);
-
-static int reset;
-static int hd_error;
-
-/*
- * This struct defines the HD's and their types.
- */
-struct hd_i_struct {
- unsigned int head, sect, cyl, wpcom, lzone, ctl;
- int unit;
- int recalibrate;
- int special_op;
-};
-
-#ifdef HD_TYPE
-static struct hd_i_struct hd_info[] = { HD_TYPE };
-static int NR_HD = ARRAY_SIZE(hd_info);
-#else
-static struct hd_i_struct hd_info[MAX_HD];
-static int NR_HD;
-#endif
-
-static struct gendisk *hd_gendisk[MAX_HD];
-
-static struct timer_list device_timer;
-
-#define TIMEOUT_VALUE (6*HZ)
-
-#define SET_TIMER \
- do { \
- mod_timer(&device_timer, jiffies + TIMEOUT_VALUE); \
- } while (0)
-
-static void (*do_hd)(void) = NULL;
-#define SET_HANDLER(x) \
-if ((do_hd = (x)) != NULL) \
- SET_TIMER; \
-else \
- del_timer(&device_timer);
-
-
-#if (HD_DELAY > 0)
-
-#include <linux/i8253.h>
-
-unsigned long last_req;
-
-unsigned long read_timer(void)
-{
- unsigned long t, flags;
- int i;
-
- raw_spin_lock_irqsave(&i8253_lock, flags);
- t = jiffies * 11932;
- outb_p(0, 0x43);
- i = inb_p(0x40);
- i |= inb(0x40) << 8;
- raw_spin_unlock_irqrestore(&i8253_lock, flags);
- return(t - i);
-}
-#endif
-
-static void __init hd_setup(char *str, int *ints)
-{
- int hdind = 0;
-
- if (ints[0] != 3)
- return;
- if (hd_info[0].head != 0)
- hdind = 1;
- hd_info[hdind].head = ints[2];
- hd_info[hdind].sect = ints[3];
- hd_info[hdind].cyl = ints[1];
- hd_info[hdind].wpcom = 0;
- hd_info[hdind].lzone = ints[1];
- hd_info[hdind].ctl = (ints[2] > 8 ? 8 : 0);
- NR_HD = hdind+1;
-}
-
-static bool hd_end_request(int err, unsigned int bytes)
-{
- if (__blk_end_request(hd_req, err, bytes))
- return true;
- hd_req = NULL;
- return false;
-}
-
-static bool hd_end_request_cur(int err)
-{
- return hd_end_request(err, blk_rq_cur_bytes(hd_req));
-}
-
-static void dump_status(const char *msg, unsigned int stat)
-{
- char *name = "hd?";
- if (hd_req)
- name = hd_req->rq_disk->disk_name;
-
-#ifdef VERBOSE_ERRORS
- printk("%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
- if (stat & BUSY_STAT) printk("Busy ");
- if (stat & READY_STAT) printk("DriveReady ");
- if (stat & WRERR_STAT) printk("WriteFault ");
- if (stat & SEEK_STAT) printk("SeekComplete ");
- if (stat & DRQ_STAT) printk("DataRequest ");
- if (stat & ECC_STAT) printk("CorrectedError ");
- if (stat & INDEX_STAT) printk("Index ");
- if (stat & ERR_STAT) printk("Error ");
- printk("}\n");
- if ((stat & ERR_STAT) == 0) {
- hd_error = 0;
- } else {
- hd_error = inb(HD_ERROR);
- printk("%s: %s: error=0x%02x { ", name, msg, hd_error & 0xff);
- if (hd_error & BBD_ERR) printk("BadSector ");
- if (hd_error & ECC_ERR) printk("UncorrectableError ");
- if (hd_error & ID_ERR) printk("SectorIdNotFound ");
- if (hd_error & ABRT_ERR) printk("DriveStatusError ");
- if (hd_error & TRK0_ERR) printk("TrackZeroNotFound ");
- if (hd_error & MARK_ERR) printk("AddrMarkNotFound ");
- printk("}");
- if (hd_error & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) {
- printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL),
- inb(HD_CURRENT) & 0xf, inb(HD_SECTOR));
- if (hd_req)
- printk(", sector=%ld", blk_rq_pos(hd_req));
- }
- printk("\n");
- }
-#else
- printk("%s: %s: status=0x%02x.\n", name, msg, stat & 0xff);
- if ((stat & ERR_STAT) == 0) {
- hd_error = 0;
- } else {
- hd_error = inb(HD_ERROR);
- printk("%s: %s: error=0x%02x.\n", name, msg, hd_error & 0xff);
- }
-#endif
-}
-
-static void check_status(void)
-{
- int i = inb_p(HD_STATUS);
-
- if (!OK_STATUS(i)) {
- dump_status("check_status", i);
- bad_rw_intr();
- }
-}
-
-static int controller_busy(void)
-{
- int retries = 100000;
- unsigned char status;
-
- do {
- status = inb_p(HD_STATUS);
- } while ((status & BUSY_STAT) && --retries);
- return status;
-}
-
-static int status_ok(void)
-{
- unsigned char status = inb_p(HD_STATUS);
-
- if (status & BUSY_STAT)
- return 1; /* Ancient, but does it make sense??? */
- if (status & WRERR_STAT)
- return 0;
- if (!(status & READY_STAT))
- return 0;
- if (!(status & SEEK_STAT))
- return 0;
- return 1;
-}
-
-static int controller_ready(unsigned int drive, unsigned int head)
-{
- int retry = 100;
-
- do {
- if (controller_busy() & BUSY_STAT)
- return 0;
- outb_p(0xA0 | (drive<<4) | head, HD_CURRENT);
- if (status_ok())
- return 1;
- } while (--retry);
- return 0;
-}
-
-static void hd_out(struct hd_i_struct *disk,
- unsigned int nsect,
- unsigned int sect,
- unsigned int head,
- unsigned int cyl,
- unsigned int cmd,
- void (*intr_addr)(void))
-{
- unsigned short port;
-
-#if (HD_DELAY > 0)
- while (read_timer() - last_req < HD_DELAY)
- /* nothing */;
-#endif
- if (reset)
- return;
- if (!controller_ready(disk->unit, head)) {
- reset = 1;
- return;
- }
- SET_HANDLER(intr_addr);
- outb_p(disk->ctl, HD_CMD);
- port = HD_DATA;
- outb_p(disk->wpcom >> 2, ++port);
- outb_p(nsect, ++port);
- outb_p(sect, ++port);
- outb_p(cyl, ++port);
- outb_p(cyl >> 8, ++port);
- outb_p(0xA0 | (disk->unit << 4) | head, ++port);
- outb_p(cmd, ++port);
-}
-
-static void hd_request (void);
-
-static int drive_busy(void)
-{
- unsigned int i;
- unsigned char c;
-
- for (i = 0; i < 500000 ; i++) {
- c = inb_p(HD_STATUS);
- if ((c & (BUSY_STAT | READY_STAT | SEEK_STAT)) == STAT_OK)
- return 0;
- }
- dump_status("reset timed out", c);
- return 1;
-}
-
-static void reset_controller(void)
-{
- int i;
-
- outb_p(4, HD_CMD);
- for (i = 0; i < 1000; i++) barrier();
- outb_p(hd_info[0].ctl & 0x0f, HD_CMD);
- for (i = 0; i < 1000; i++) barrier();
- if (drive_busy())
- printk("hd: controller still busy\n");
- else if ((hd_error = inb(HD_ERROR)) != 1)
- printk("hd: controller reset failed: %02x\n", hd_error);
-}
-
-static void reset_hd(void)
-{
- static int i;
-
-repeat:
- if (reset) {
- reset = 0;
- i = -1;
- reset_controller();
- } else {
- check_status();
- if (reset)
- goto repeat;
- }
- if (++i < NR_HD) {
- struct hd_i_struct *disk = &hd_info[i];
- disk->special_op = disk->recalibrate = 1;
- hd_out(disk, disk->sect, disk->sect, disk->head-1,
- disk->cyl, ATA_CMD_INIT_DEV_PARAMS, &reset_hd);
- if (reset)
- goto repeat;
- } else
- hd_request();
-}
-
-/*
- * Ok, don't know what to do with the unexpected interrupts: on some machines
- * doing a reset and a retry seems to result in an eternal loop. Right now I
- * ignore it, and just set the timeout.
- *
- * On laptops (and "green" PCs), an unexpected interrupt occurs whenever the
- * drive enters "idle", "standby", or "sleep" mode, so if the status looks
- * "good", we just ignore the interrupt completely.
- */
-static void unexpected_hd_interrupt(void)
-{
- unsigned int stat = inb_p(HD_STATUS);
-
- if (stat & (BUSY_STAT|DRQ_STAT|ECC_STAT|ERR_STAT)) {
- dump_status("unexpected interrupt", stat);
- SET_TIMER;
- }
-}
-
-/*
- * bad_rw_intr() now tries to be a bit smarter and does things
- * according to the error returned by the controller.
- * -Mika Liljeberg ([email protected])
- */
-static void bad_rw_intr(void)
-{
- struct request *req = hd_req;
-
- if (req != NULL) {
- struct hd_i_struct *disk = req->rq_disk->private_data;
- if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) {
- hd_end_request_cur(-EIO);
- disk->special_op = disk->recalibrate = 1;
- } else if (req->errors % RESET_FREQ == 0)
- reset = 1;
- else if ((hd_error & TRK0_ERR) || req->errors % RECAL_FREQ == 0)
- disk->special_op = disk->recalibrate = 1;
- /* Otherwise just retry */
- }
-}
-
-static inline int wait_DRQ(void)
-{
- int retries;
- int stat;
-
- for (retries = 0; retries < 100000; retries++) {
- stat = inb_p(HD_STATUS);
- if (stat & DRQ_STAT)
- return 0;
- }
- dump_status("wait_DRQ", stat);
- return -1;
-}
-
-static void read_intr(void)
-{
- struct request *req;
- int i, retries = 100000;
-
- do {
- i = (unsigned) inb_p(HD_STATUS);
- if (i & BUSY_STAT)
- continue;
- if (!OK_STATUS(i))
- break;
- if (i & DRQ_STAT)
- goto ok_to_read;
- } while (--retries > 0);
- dump_status("read_intr", i);
- bad_rw_intr();
- hd_request();
- return;
-
-ok_to_read:
- req = hd_req;
- insw(HD_DATA, bio_data(req->bio), 256);
-#ifdef DEBUG
- printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
- req->rq_disk->disk_name, blk_rq_pos(req) + 1,
- blk_rq_sectors(req) - 1, bio_data(req->bio)+512);
-#endif
- if (hd_end_request(0, 512)) {
- SET_HANDLER(&read_intr);
- return;
- }
-
- (void) inb_p(HD_STATUS);
-#if (HD_DELAY > 0)
- last_req = read_timer();
-#endif
- hd_request();
-}
-
-static void write_intr(void)
-{
- struct request *req = hd_req;
- int i;
- int retries = 100000;
-
- do {
- i = (unsigned) inb_p(HD_STATUS);
- if (i & BUSY_STAT)
- continue;
- if (!OK_STATUS(i))
- break;
- if ((blk_rq_sectors(req) <= 1) || (i & DRQ_STAT))
- goto ok_to_write;
- } while (--retries > 0);
- dump_status("write_intr", i);
- bad_rw_intr();
- hd_request();
- return;
-
-ok_to_write:
- if (hd_end_request(0, 512)) {
- SET_HANDLER(&write_intr);
- outsw(HD_DATA, bio_data(req->bio), 256);
- return;
- }
-
-#if (HD_DELAY > 0)
- last_req = read_timer();
-#endif
- hd_request();
-}
-
-static void recal_intr(void)
-{
- check_status();
-#if (HD_DELAY > 0)
- last_req = read_timer();
-#endif
- hd_request();
-}
-
-/*
- * This is another of the error-routines I don't know what to do with. The
- * best idea seems to just set reset, and start all over again.
- */
-static void hd_times_out(unsigned long dummy)
-{
- char *name;
-
- do_hd = NULL;
-
- if (!hd_req)
- return;
-
- spin_lock_irq(hd_queue->queue_lock);
- reset = 1;
- name = hd_req->rq_disk->disk_name;
- printk("%s: timeout\n", name);
- if (++hd_req->errors >= MAX_ERRORS) {
-#ifdef DEBUG
- printk("%s: too many errors\n", name);
-#endif
- hd_end_request_cur(-EIO);
- }
- hd_request();
- spin_unlock_irq(hd_queue->queue_lock);
-}
-
-static int do_special_op(struct hd_i_struct *disk, struct request *req)
-{
- if (disk->recalibrate) {
- disk->recalibrate = 0;
- hd_out(disk, disk->sect, 0, 0, 0, ATA_CMD_RESTORE, &recal_intr);
- return reset;
- }
- if (disk->head > 16) {
- printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name);
- hd_end_request_cur(-EIO);
- }
- disk->special_op = 0;
- return 1;
-}
-
-/*
- * The driver enables interrupts as much as possible. In order to do this,
- * (a) the device-interrupt is disabled before entering hd_request(),
- * and (b) the timeout-interrupt is disabled before the sti().
- *
- * Interrupts are still masked (by default) whenever we are exchanging
- * data/cmds with a drive, because some drives seem to have very poor
- * tolerance for latency during I/O. The IDE driver has support to unmask
- * interrupts for non-broken hardware, so use that driver if required.
- */
-static void hd_request(void)
-{
- unsigned int block, nsect, sec, track, head, cyl;
- struct hd_i_struct *disk;
- struct request *req;
-
- if (do_hd)
- return;
-repeat:
- del_timer(&device_timer);
-
- if (!hd_req) {
- hd_req = blk_fetch_request(hd_queue);
- if (!hd_req) {
- do_hd = NULL;
- return;
- }
- }
- req = hd_req;
-
- if (reset) {
- reset_hd();
- return;
- }
- disk = req->rq_disk->private_data;
- block = blk_rq_pos(req);
- nsect = blk_rq_sectors(req);
- if (block >= get_capacity(req->rq_disk) ||
- ((block+nsect) > get_capacity(req->rq_disk))) {
- printk("%s: bad access: block=%d, count=%d\n",
- req->rq_disk->disk_name, block, nsect);
- hd_end_request_cur(-EIO);
- goto repeat;
- }
-
- if (disk->special_op) {
- if (do_special_op(disk, req))
- goto repeat;
- return;
- }
- sec = block % disk->sect + 1;
- track = block / disk->sect;
- head = track % disk->head;
- cyl = track / disk->head;
-#ifdef DEBUG
- printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
- req->rq_disk->disk_name,
- req_data_dir(req) == READ ? "read" : "writ",
- cyl, head, sec, nsect, bio_data(req->bio));
-#endif
-
- switch (req_op(req)) {
- case REQ_OP_READ:
- hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
- &read_intr);
- if (reset)
- goto repeat;
- break;
- case REQ_OP_WRITE:
- hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE,
- &write_intr);
- if (reset)
- goto repeat;
- if (wait_DRQ()) {
- bad_rw_intr();
- goto repeat;
- }
- outsw(HD_DATA, bio_data(req->bio), 256);
- break;
- default:
- printk("unknown hd-command\n");
- hd_end_request_cur(-EIO);
- break;
- }
-}
-
-static void do_hd_request(struct request_queue *q)
-{
- hd_request();
-}
-
-static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- struct hd_i_struct *disk = bdev->bd_disk->private_data;
-
- geo->heads = disk->head;
- geo->sectors = disk->sect;
- geo->cylinders = disk->cyl;
- return 0;
-}
-
-/*
- * Releasing a block device means we sync() it, so that it can safely
- * be forgotten about...
- */
-
-static irqreturn_t hd_interrupt(int irq, void *dev_id)
-{
- void (*handler)(void) = do_hd;
-
- spin_lock(hd_queue->queue_lock);
-
- do_hd = NULL;
- del_timer(&device_timer);
- if (!handler)
- handler = unexpected_hd_interrupt;
- handler();
-
- spin_unlock(hd_queue->queue_lock);
-
- return IRQ_HANDLED;
-}
-
-static const struct block_device_operations hd_fops = {
- .getgeo = hd_getgeo,
-};
-
-static int __init hd_init(void)
-{
- int drive;
-
- if (register_blkdev(HD_MAJOR, "hd"))
- return -1;
-
- hd_queue = blk_init_queue(do_hd_request, &hd_lock);
- if (!hd_queue) {
- unregister_blkdev(HD_MAJOR, "hd");
- return -ENOMEM;
- }
-
- blk_queue_max_hw_sectors(hd_queue, 255);
- init_timer(&device_timer);
- device_timer.function = hd_times_out;
- blk_queue_logical_block_size(hd_queue, 512);
-
- if (!NR_HD) {
- /*
- * We don't know anything about the drive. This means
- * that you *MUST* specify the drive parameters to the
- * kernel yourself.
- *
- * If we were on an i386, we used to read this info from
- * the BIOS or CMOS. This doesn't work all that well,
- * since this assumes that this is a primary or secondary
- * drive, and if we're using this legacy driver, it's
- * probably an auxiliary controller added to recover
- * legacy data off an ST-506 drive. Either way, it's
- * definitely safest to have the user explicitly specify
- * the information.
- */
- printk("hd: no drives specified - use hd=cyl,head,sectors"
- " on kernel command line\n");
- goto out;
- }
-
- for (drive = 0 ; drive < NR_HD ; drive++) {
- struct gendisk *disk = alloc_disk(64);
- struct hd_i_struct *p = &hd_info[drive];
- if (!disk)
- goto Enomem;
- disk->major = HD_MAJOR;
- disk->first_minor = drive << 6;
- disk->fops = &hd_fops;
- sprintf(disk->disk_name, "hd%c", 'a'+drive);
- disk->private_data = p;
- set_capacity(disk, p->head * p->sect * p->cyl);
- disk->queue = hd_queue;
- p->unit = drive;
- hd_gendisk[drive] = disk;
- printk("%s: %luMB, CHS=%d/%d/%d\n",
- disk->disk_name, (unsigned long)get_capacity(disk)/2048,
- p->cyl, p->head, p->sect);
- }
-
- if (request_irq(HD_IRQ, hd_interrupt, 0, "hd", NULL)) {
- printk("hd: unable to get IRQ%d for the hard disk driver\n",
- HD_IRQ);
- goto out1;
- }
- if (!request_region(HD_DATA, 8, "hd")) {
- printk(KERN_WARNING "hd: port 0x%x busy\n", HD_DATA);
- goto out2;
- }
- if (!request_region(HD_CMD, 1, "hd(cmd)")) {
- printk(KERN_WARNING "hd: port 0x%x busy\n", HD_CMD);
- goto out3;
- }
-
- /* Let them fly */
- for (drive = 0; drive < NR_HD; drive++)
- add_disk(hd_gendisk[drive]);
-
- return 0;
-
-out3:
- release_region(HD_DATA, 8);
-out2:
- free_irq(HD_IRQ, NULL);
-out1:
- for (drive = 0; drive < NR_HD; drive++)
- put_disk(hd_gendisk[drive]);
- NR_HD = 0;
-out:
- del_timer(&device_timer);
- unregister_blkdev(HD_MAJOR, "hd");
- blk_cleanup_queue(hd_queue);
- return -1;
-Enomem:
- while (drive--)
- put_disk(hd_gendisk[drive]);
- goto out;
-}
-
-static int __init parse_hd_setup(char *line)
-{
- int ints[6];
-
- (void) get_options(line, ARRAY_SIZE(ints), ints);
- hd_setup(NULL, ints);
-
- return 1;
-}
-__setup("hd=", parse_hd_setup);
-
-late_initcall(hd_init);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0ecb6461ed81..cc981f34e017 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1710,7 +1710,7 @@ static int loop_init_request(void *data, struct request *rq,
return 0;
}
-static struct blk_mq_ops loop_mq_ops = {
+static const struct blk_mq_ops loop_mq_ops = {
.queue_rq = loop_queue_rq,
.init_request = loop_init_request,
};
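
This is the first of a series of identical one-liners (mtip32xx, nbd, null_blk, rbd, virtio_blk, xen-blkfront, dm-rq, ubi and the NVMe transports below) marking each driver's blk_mq_ops table const; the table is never written at runtime, so it can live in read-only data, which presumably pairs with a core change accepting a const ops pointer in the tag set. The resulting shape (names illustrative):

	/* Ops table declared const so it ends up in .rodata: */
	static const struct blk_mq_ops example_mq_ops = {
		.queue_rq	= example_queue_rq,
		.init_request	= example_init_request,
	};
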
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 286f276f586e..e88e7b06c616 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -989,9 +989,7 @@ static int mg_probe(struct platform_device *plat_dev)
blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS);
blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE);
- init_timer(&host->timer);
- host->timer.function = mg_times_out;
- host->timer.data = (unsigned long)host;
+ setup_timer(&host->timer, mg_times_out, (unsigned long)host);
host->gd = alloc_disk(MG_DISK_MAX_PART);
if (!host->gd) {
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index f96ab717534c..30076e7753bc 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3889,7 +3889,7 @@ exit_handler:
return BLK_EH_RESET_TIMER;
}
-static struct blk_mq_ops mtip_mq_ops = {
+static const struct blk_mq_ops mtip_mq_ops = {
.queue_rq = mtip_queue_rq,
.init_request = mtip_init_cmd,
.exit_request = mtip_free_cmd,
@@ -4162,7 +4162,7 @@ static int mtip_block_remove(struct driver_data *dd)
dev_info(&dd->pdev->dev, "device %s surprise removal\n",
dd->disk->disk_name);
- blk_mq_freeze_queue_start(dd->queue);
+ blk_freeze_queue_start(dd->queue);
blk_mq_stop_hw_queues(dd->queue);
blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d8a23561b4cb..03ae72985c79 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1035,7 +1035,7 @@ static int nbd_init_request(void *data, struct request *rq,
return 0;
}
-static struct blk_mq_ops nbd_mq_ops = {
+static const struct blk_mq_ops nbd_mq_ops = {
.queue_rq = nbd_queue_rq,
.init_request = nbd_init_request,
.timeout = nbd_xmit_timeout,
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 6f2e565bccc5..f93906ff31e8 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -117,6 +117,10 @@ static bool use_lightnvm;
module_param(use_lightnvm, bool, S_IRUGO);
MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
+static bool blocking;
+module_param(blocking, bool, S_IRUGO);
+MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
+
static int irqmode = NULL_IRQ_SOFTIRQ;
static int null_set_irqmode(const char *str, const struct kernel_param *kp)
@@ -357,6 +361,8 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
{
struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+ might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
+
if (irqmode == NULL_IRQ_TIMER) {
hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cmd->timer.function = null_cmd_timer_expired;
@@ -392,7 +398,7 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
return 0;
}
-static struct blk_mq_ops null_mq_ops = {
+static const struct blk_mq_ops null_mq_ops = {
.queue_rq = null_queue_rq,
.init_hctx = null_init_hctx,
.complete = null_softirq_done_fn,
@@ -724,6 +730,9 @@ static int null_add_dev(void)
nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
nullb->tag_set.driver_data = nullb;
+ if (blocking)
+ nullb->tag_set.flags |= BLK_MQ_F_BLOCKING;
+
rv = blk_mq_alloc_tag_set(&nullb->tag_set);
if (rv)
goto out_cleanup_queues;
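
Besides the const ops conversion, null_blk gains a `blocking` module parameter: when set, BLK_MQ_F_BLOCKING is added to the tag set, telling blk-mq that ->queue_rq() is allowed to sleep, and the new might_sleep_if() documents and asserts exactly that. Loading the test driver in this mode is just `modprobe null_blk blocking=1`, which gives an easy way to exercise the blocking dispatch path.
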
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 939641d6e262..b1267ef34d5a 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -300,6 +300,11 @@ static void pcd_init_units(void)
struct gendisk *disk = alloc_disk(1);
if (!disk)
continue;
+ disk->queue = blk_init_queue(do_pcd_request, &pcd_lock);
+ if (!disk->queue) {
+ put_disk(disk);
+ continue;
+ }
cd->disk = disk;
cd->pi = &cd->pia;
cd->present = 0;
@@ -735,18 +740,36 @@ static int pcd_detect(void)
}
/* I/O request processing */
-static struct request_queue *pcd_queue;
+static int pcd_queue;
+
+static int set_next_request(void)
+{
+ struct pcd_unit *cd;
+ struct request_queue *q;
+ int old_pos = pcd_queue;
+
+ do {
+ cd = &pcd[pcd_queue];
+ q = cd->present ? cd->disk->queue : NULL;
+ if (++pcd_queue == PCD_UNITS)
+ pcd_queue = 0;
+ if (q) {
+ pcd_req = blk_fetch_request(q);
+ if (pcd_req)
+ break;
+ }
+ } while (pcd_queue != old_pos);
+
+ return pcd_req != NULL;
+}
-static void do_pcd_request(struct request_queue * q)
+static void pcd_request(void)
{
if (pcd_busy)
return;
while (1) {
- if (!pcd_req) {
- pcd_req = blk_fetch_request(q);
- if (!pcd_req)
- return;
- }
+ if (!pcd_req && !set_next_request())
+ return;
if (rq_data_dir(pcd_req) == READ) {
struct pcd_unit *cd = pcd_req->rq_disk->private_data;
@@ -766,6 +789,11 @@ static void do_pcd_request(struct request_queue * q)
}
}
+static void do_pcd_request(struct request_queue *q)
+{
+ pcd_request();
+}
+
static inline void next_request(int err)
{
unsigned long saved_flags;
@@ -774,7 +802,7 @@ static inline void next_request(int err)
if (!__blk_end_request_cur(pcd_req, err))
pcd_req = NULL;
pcd_busy = 0;
- do_pcd_request(pcd_queue);
+ pcd_request();
spin_unlock_irqrestore(&pcd_lock, saved_flags);
}
@@ -849,7 +877,7 @@ static void do_pcd_read_drq(void)
do_pcd_read();
spin_lock_irqsave(&pcd_lock, saved_flags);
- do_pcd_request(pcd_queue);
+ pcd_request();
spin_unlock_irqrestore(&pcd_lock, saved_flags);
}
@@ -957,19 +985,10 @@ static int __init pcd_init(void)
return -EBUSY;
}
- pcd_queue = blk_init_queue(do_pcd_request, &pcd_lock);
- if (!pcd_queue) {
- unregister_blkdev(major, name);
- for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++)
- put_disk(cd->disk);
- return -ENOMEM;
- }
-
for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
if (cd->present) {
register_cdrom(&cd->info);
cd->disk->private_data = cd;
- cd->disk->queue = pcd_queue;
add_disk(cd->disk);
}
}
@@ -988,9 +1007,9 @@ static void __exit pcd_exit(void)
pi_release(cd->pi);
unregister_cdrom(&cd->info);
}
+ blk_cleanup_queue(cd->disk->queue);
put_disk(cd->disk);
}
- blk_cleanup_queue(pcd_queue);
unregister_blkdev(major, name);
pi_unregister_driver(par_drv);
}
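
The paride CD driver stops funnelling all units through one shared request_queue: pcd_init_units() now allocates a queue per gendisk, and set_next_request() walks the per-unit queues round-robin from the last serviced position so the units are polled in turn; the same per-unit-queue conversion is applied to pd.c, pf.c and swim.c below. Reduced to its core, the round-robin fetch looks like:

	/* Walk the per-unit queues once, starting after the last position,
	 * and stop at the first one that yields a request. */
	do {
		q = pcd[pcd_queue].present ? pcd[pcd_queue].disk->queue : NULL;
		if (++pcd_queue == PCD_UNITS)
			pcd_queue = 0;			/* wrap around */
		if (q && (pcd_req = blk_fetch_request(q)))
			break;
	} while (pcd_queue != old_pos);
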
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 9cfd2e06a649..b05e151c9b38 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -381,12 +381,33 @@ static enum action do_pd_write_start(void);
static enum action do_pd_read_drq(void);
static enum action do_pd_write_done(void);
-static struct request_queue *pd_queue;
+static int pd_queue;
static int pd_claimed;
static struct pd_unit *pd_current; /* current request's drive */
static PIA *pi_current; /* current request's PIA */
+static int set_next_request(void)
+{
+ struct gendisk *disk;
+ struct request_queue *q;
+ int old_pos = pd_queue;
+
+ do {
+ disk = pd[pd_queue].gd;
+ q = disk ? disk->queue : NULL;
+ if (++pd_queue == PD_UNITS)
+ pd_queue = 0;
+ if (q) {
+ pd_req = blk_fetch_request(q);
+ if (pd_req)
+ break;
+ }
+ } while (pd_queue != old_pos);
+
+ return pd_req != NULL;
+}
+
static void run_fsm(void)
{
while (1) {
@@ -418,8 +439,7 @@ static void run_fsm(void)
spin_lock_irqsave(&pd_lock, saved_flags);
if (!__blk_end_request_cur(pd_req,
res == Ok ? 0 : -EIO)) {
- pd_req = blk_fetch_request(pd_queue);
- if (!pd_req)
+ if (!set_next_request())
stop = 1;
}
spin_unlock_irqrestore(&pd_lock, saved_flags);
@@ -839,7 +859,13 @@ static void pd_probe_drive(struct pd_unit *disk)
p->first_minor = (disk - pd) << PD_BITS;
disk->gd = p;
p->private_data = disk;
- p->queue = pd_queue;
+ p->queue = blk_init_queue(do_pd_request, &pd_lock);
+ if (!p->queue) {
+ disk->gd = NULL;
+ put_disk(p);
+ return;
+ }
+ blk_queue_max_hw_sectors(p->queue, cluster);
if (disk->drive == -1) {
for (disk->drive = 0; disk->drive <= 1; disk->drive++)
@@ -919,26 +945,18 @@ static int __init pd_init(void)
if (disable)
goto out1;
- pd_queue = blk_init_queue(do_pd_request, &pd_lock);
- if (!pd_queue)
- goto out1;
-
- blk_queue_max_hw_sectors(pd_queue, cluster);
-
if (register_blkdev(major, name))
- goto out2;
+ goto out1;
printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
name, name, PD_VERSION, major, cluster, nice);
if (!pd_detect())
- goto out3;
+ goto out2;
return 0;
-out3:
- unregister_blkdev(major, name);
out2:
- blk_cleanup_queue(pd_queue);
+ unregister_blkdev(major, name);
out1:
return -ENODEV;
}
@@ -953,11 +971,11 @@ static void __exit pd_exit(void)
if (p) {
disk->gd = NULL;
del_gendisk(p);
+ blk_cleanup_queue(p->queue);
put_disk(p);
pi_release(disk->pi);
}
}
- blk_cleanup_queue(pd_queue);
}
MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 14c5d32f5d8b..f24ca7315ddc 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -287,6 +287,12 @@ static void __init pf_init_units(void)
struct gendisk *disk = alloc_disk(1);
if (!disk)
continue;
+ disk->queue = blk_init_queue(do_pf_request, &pf_spin_lock);
+ if (!disk->queue) {
+ put_disk(disk);
+ return;
+ }
+ blk_queue_max_segments(disk->queue, cluster);
pf->disk = disk;
pf->pi = &pf->pia;
pf->media_status = PF_NM;
@@ -772,7 +778,28 @@ static int pf_ready(void)
return (((status_reg(pf_current) & (STAT_BUSY | pf_mask)) == pf_mask));
}
-static struct request_queue *pf_queue;
+static int pf_queue;
+
+static int set_next_request(void)
+{
+ struct pf_unit *pf;
+ struct request_queue *q;
+ int old_pos = pf_queue;
+
+ do {
+ pf = &units[pf_queue];
+ q = pf->present ? pf->disk->queue : NULL;
+ if (++pf_queue == PF_UNITS)
+ pf_queue = 0;
+ if (q) {
+ pf_req = blk_fetch_request(q);
+ if (pf_req)
+ break;
+ }
+ } while (pf_queue != old_pos);
+
+ return pf_req != NULL;
+}
static void pf_end_request(int err)
{
@@ -780,16 +807,13 @@ static void pf_end_request(int err)
pf_req = NULL;
}
-static void do_pf_request(struct request_queue * q)
+static void pf_request(void)
{
if (pf_busy)
return;
repeat:
- if (!pf_req) {
- pf_req = blk_fetch_request(q);
- if (!pf_req)
- return;
- }
+ if (!pf_req && !set_next_request())
+ return;
pf_current = pf_req->rq_disk->private_data;
pf_block = blk_rq_pos(pf_req);
@@ -817,6 +841,11 @@ repeat:
}
}
+static void do_pf_request(struct request_queue *q)
+{
+ pf_request();
+}
+
static int pf_next_buf(void)
{
unsigned long saved_flags;
@@ -846,7 +875,7 @@ static inline void next_request(int err)
spin_lock_irqsave(&pf_spin_lock, saved_flags);
pf_end_request(err);
pf_busy = 0;
- do_pf_request(pf_queue);
+ pf_request();
spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
}
@@ -972,15 +1001,6 @@ static int __init pf_init(void)
put_disk(pf->disk);
return -EBUSY;
}
- pf_queue = blk_init_queue(do_pf_request, &pf_spin_lock);
- if (!pf_queue) {
- unregister_blkdev(major, name);
- for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++)
- put_disk(pf->disk);
- return -ENOMEM;
- }
-
- blk_queue_max_segments(pf_queue, cluster);
for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
struct gendisk *disk = pf->disk;
@@ -988,7 +1008,6 @@ static int __init pf_init(void)
if (!pf->present)
continue;
disk->private_data = pf;
- disk->queue = pf_queue;
add_disk(disk);
}
return 0;
@@ -1003,10 +1022,10 @@ static void __exit pf_exit(void)
if (!pf->present)
continue;
del_gendisk(pf->disk);
+ blk_cleanup_queue(pf->disk->queue);
put_disk(pf->disk);
pi_release(pf->pi);
}
- blk_cleanup_queue(pf_queue);
}
MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 517838b65964..f24ade333e0c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4317,7 +4317,7 @@ static int rbd_init_request(void *data, struct request *rq,
return 0;
}
-static struct blk_mq_ops rbd_mq_ops = {
+static const struct blk_mq_ops rbd_mq_ops = {
.queue_rq = rbd_queue_rq,
.init_request = rbd_init_request,
};
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index b5afd495d482..3064be6cf375 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -211,7 +211,7 @@ enum head {
struct swim_priv {
struct swim __iomem *base;
spinlock_t lock;
- struct request_queue *queue;
+ int fdc_queue;
int floppy_count;
struct floppy_state unit[FD_MAX_UNIT];
};
@@ -525,12 +525,33 @@ static int floppy_read_sectors(struct floppy_state *fs,
return 0;
}
-static void redo_fd_request(struct request_queue *q)
+static struct request *swim_next_request(struct swim_priv *swd)
{
+ struct request_queue *q;
+ struct request *rq;
+ int old_pos = swd->fdc_queue;
+
+ do {
+ q = swd->unit[swd->fdc_queue].disk->queue;
+ if (++swd->fdc_queue == swd->floppy_count)
+ swd->fdc_queue = 0;
+ if (q) {
+ rq = blk_fetch_request(q);
+ if (rq)
+ return rq;
+ }
+ } while (swd->fdc_queue != old_pos);
+
+ return NULL;
+}
+
+static void do_fd_request(struct request_queue *q)
+{
+ struct swim_priv *swd = q->queuedata;
struct request *req;
struct floppy_state *fs;
- req = blk_fetch_request(q);
+ req = swim_next_request(swd);
while (req) {
int err = -EIO;
@@ -554,15 +575,10 @@ static void redo_fd_request(struct request_queue *q)
}
done:
if (!__blk_end_request_cur(req, err))
- req = blk_fetch_request(q);
+ req = swim_next_request(swd);
}
}
-static void do_fd_request(struct request_queue *q)
-{
- redo_fd_request(q);
-}
-
static struct floppy_struct floppy_type[4] = {
{ 0, 0, 0, 0, 0, 0x00, 0x00, 0x00, 0x00, NULL }, /* no testing */
{ 720, 9, 1, 80, 0, 0x2A, 0x02, 0xDF, 0x50, NULL }, /* 360KB SS 3.5"*/
@@ -833,22 +849,25 @@ static int swim_floppy_init(struct swim_priv *swd)
return -EBUSY;
}
+ spin_lock_init(&swd->lock);
+
for (drive = 0; drive < swd->floppy_count; drive++) {
swd->unit[drive].disk = alloc_disk(1);
if (swd->unit[drive].disk == NULL) {
err = -ENOMEM;
goto exit_put_disks;
}
+ swd->unit[drive].disk->queue = blk_init_queue(do_fd_request,
+ &swd->lock);
+ if (!swd->unit[drive].disk->queue) {
+ err = -ENOMEM;
+ put_disk(swd->unit[drive].disk);
+ goto exit_put_disks;
+ }
+ swd->unit[drive].disk->queue->queuedata = swd;
swd->unit[drive].swd = swd;
}
- spin_lock_init(&swd->lock);
- swd->queue = blk_init_queue(do_fd_request, &swd->lock);
- if (!swd->queue) {
- err = -ENOMEM;
- goto exit_put_disks;
- }
-
for (drive = 0; drive < swd->floppy_count; drive++) {
swd->unit[drive].disk->flags = GENHD_FL_REMOVABLE;
swd->unit[drive].disk->major = FLOPPY_MAJOR;
@@ -856,7 +875,6 @@ static int swim_floppy_init(struct swim_priv *swd)
sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive);
swd->unit[drive].disk->fops = &floppy_fops;
swd->unit[drive].disk->private_data = &swd->unit[drive];
- swd->unit[drive].disk->queue = swd->queue;
set_capacity(swd->unit[drive].disk, 2880);
add_disk(swd->unit[drive].disk);
}
@@ -943,13 +961,12 @@ static int swim_remove(struct platform_device *dev)
for (drive = 0; drive < swd->floppy_count; drive++) {
del_gendisk(swd->unit[drive].disk);
+ blk_cleanup_queue(swd->unit[drive].disk->queue);
put_disk(swd->unit[drive].disk);
}
unregister_blkdev(FLOPPY_MAJOR, "fd");
- blk_cleanup_queue(swd->queue);
-
/* eject floppies */
for (drive = 0; drive < swd->floppy_count; drive++)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1d4c9f8bc1e1..2d8290169271 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -597,7 +597,7 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set)
return blk_mq_virtio_map_queues(set, vblk->vdev, 0);
}
-static struct blk_mq_ops virtio_mq_ops = {
+static const struct blk_mq_ops virtio_mq_ops = {
.queue_rq = virtio_queue_rq,
.complete = virtblk_request_done,
.init_request = virtblk_init_request,
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 5067a0a952cb..d137ef8a72be 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -907,7 +907,7 @@ out_busy:
return BLK_MQ_RQ_QUEUE_BUSY;
}
-static struct blk_mq_ops blkfront_mq_ops = {
+static const struct blk_mq_ops blkfront_mq_ops = {
.queue_rq = blkif_queue_rq,
};
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 0b081d170087..d19af1d21f4c 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -762,7 +762,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_MQ_RQ_QUEUE_OK;
}
-static struct blk_mq_ops dm_mq_ops = {
+static const struct blk_mq_ops dm_mq_ops = {
.queue_rq = dm_mq_queue_rq,
.complete = dm_softirq_done,
.init_request = dm_mq_init_request,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfb75979e455..cd93a3b9ceca 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -810,7 +810,6 @@ static void dec_pending(struct dm_io *io, int error)
queue_io(md, bio);
} else {
/* done with normal IO or empty flush */
- trace_block_bio_complete(md->queue, bio, io_error);
bio->bi_error = io_error;
bio_endio(bio);
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed5cd705b985..7aeb9691c2e1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5031,8 +5031,6 @@ static void raid5_align_endio(struct bio *bi)
rdev_dec_pending(rdev, conf->mddev);
if (!error) {
- trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
- raid_bi, 0);
bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index c80869e60909..51f2be8889b5 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -347,7 +347,7 @@ static int ubiblock_init_request(void *data, struct request *req,
return 0;
}
-static struct blk_mq_ops ubiblock_mq_ops = {
+static const struct blk_mq_ops ubiblock_mq_ops = {
.queue_rq = ubiblock_queue_rq,
.init_request = ubiblock_init_request,
};
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9583a5f58a1d..14fbe9d80067 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -49,10 +49,9 @@ unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
-unsigned int nvme_max_retries = 5;
-module_param_named(max_retries, nvme_max_retries, uint, 0644);
+static u8 nvme_max_retries = 5;
+module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
-EXPORT_SYMBOL_GPL(nvme_max_retries);
static int nvme_char_major;
module_param(nvme_char_major, int, 0);
@@ -67,6 +66,41 @@ static DEFINE_SPINLOCK(dev_list_lock);
static struct class *nvme_class;
+static inline bool nvme_req_needs_retry(struct request *req)
+{
+ if (blk_noretry_request(req))
+ return false;
+ if (req->errors & NVME_SC_DNR)
+ return false;
+ if (jiffies - req->start_time >= req->timeout)
+ return false;
+ if (nvme_req(req)->retries >= nvme_max_retries)
+ return false;
+ return true;
+}
+
+void nvme_complete_rq(struct request *req)
+{
+ int error = 0;
+
+ if (unlikely(req->errors)) {
+ if (nvme_req_needs_retry(req)) {
+ nvme_req(req)->retries++;
+ blk_mq_requeue_request(req,
+ !blk_mq_queue_stopped(req->q));
+ return;
+ }
+
+ if (blk_rq_is_passthrough(req))
+ error = req->errors;
+ else
+ error = nvme_error_status(req->errors);
+ }
+
+ blk_mq_end_request(req, error);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_rq);
+
void nvme_cancel_request(struct request *req, void *data, bool reserved)
{
int status;
@@ -205,12 +239,6 @@ fail:
return NULL;
}
-void nvme_requeue_req(struct request *req)
-{
- blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
-}
-EXPORT_SYMBOL_GPL(nvme_requeue_req);
-
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags, int qid)
{
@@ -327,6 +355,11 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
{
int ret = BLK_MQ_RQ_QUEUE_OK;
+ if (!(req->rq_flags & RQF_DONTPREP)) {
+ nvme_req(req)->retries = 0;
+ req->rq_flags |= RQF_DONTPREP;
+ }
+
switch (req_op(req)) {
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
@@ -2386,7 +2419,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list)
- blk_mq_freeze_queue_start(ns->queue);
+ blk_freeze_queue_start(ns->queue);
mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
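
The retry/requeue/complete logic that each transport previously open-coded in its ->complete handler is centralized here: nvme_complete_rq() decides between requeueing the request (bounded by the new per-request retries counter in struct nvme_request, reset in nvme_setup_cmd() on first prepare) and ending it with a translated status. After this series a transport completion handler shrinks to transport-specific teardown plus the common helper, roughly as below (nvme_xxx_* names are placeholders for the per-transport variants shown in the PCIe/FC/RDMA hunks later in this diff):

	static void nvme_xxx_complete_rq(struct request *rq)
	{
		nvme_xxx_unmap_data(rq);	/* transport-specific cleanup */
		nvme_complete_rq(rq);		/* common retry / requeue / end-request path */
	}
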
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 5b7386f69f4d..990e6fb32a63 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -471,6 +471,16 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
}
EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->opts->max_reconnects == -1 ||
+ ctrl->opts->nr_reconnects < ctrl->opts->max_reconnects)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
+
/**
* nvmf_register_transport() - NVMe Fabrics Library registration function.
* @ops: Transport ops instance to be registered to the
@@ -533,6 +543,7 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_QUEUE_SIZE, "queue_size=%d" },
{ NVMF_OPT_NR_IO_QUEUES, "nr_io_queues=%d" },
{ NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" },
+ { NVMF_OPT_CTRL_LOSS_TMO, "ctrl_loss_tmo=%d" },
{ NVMF_OPT_KATO, "keep_alive_tmo=%d" },
{ NVMF_OPT_HOSTNQN, "hostnqn=%s" },
{ NVMF_OPT_HOST_TRADDR, "host_traddr=%s" },
@@ -546,6 +557,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
char *options, *o, *p;
int token, ret = 0;
size_t nqnlen = 0;
+ int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO;
/* Set defaults */
opts->queue_size = NVMF_DEF_QUEUE_SIZE;
@@ -655,6 +667,16 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->kato = token;
break;
+ case NVMF_OPT_CTRL_LOSS_TMO:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (token < 0)
+ pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n");
+ ctrl_loss_tmo = token;
+ break;
case NVMF_OPT_HOSTNQN:
if (opts->host) {
pr_err("hostnqn already user-assigned: %s\n",
@@ -710,6 +732,12 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
}
+ if (ctrl_loss_tmo < 0)
+ opts->max_reconnects = -1;
+ else
+ opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
+ opts->reconnect_delay);
+
if (!opts->host) {
kref_get(&nvmf_default_host->ref);
opts->host = nvmf_default_host;
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 156018182ce4..f5a9c1fb186f 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -21,6 +21,8 @@
#define NVMF_MAX_QUEUE_SIZE 1024
#define NVMF_DEF_QUEUE_SIZE 128
#define NVMF_DEF_RECONNECT_DELAY 10
+/* default to 600 seconds of reconnect attempts before giving up */
+#define NVMF_DEF_CTRL_LOSS_TMO 600
/*
* Define a host as seen by the target. We allocate one at boot, but also
@@ -53,6 +55,7 @@ enum {
NVMF_OPT_HOSTNQN = 1 << 8,
NVMF_OPT_RECONNECT_DELAY = 1 << 9,
NVMF_OPT_HOST_TRADDR = 1 << 10,
+ NVMF_OPT_CTRL_LOSS_TMO = 1 << 11,
};
/**
@@ -77,6 +80,10 @@ enum {
* @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN.
* @kato: Keep-alive timeout.
* @host: Virtual NVMe host, contains the NQN and Host ID.
+ * @nr_reconnects: number of reconnect attempts since the last ctrl failure
+ * @max_reconnects: maximum number of allowed reconnect attempts before removing
+ * the controller; (-1) means reconnect forever, zero means remove
+ * immediately.
*/
struct nvmf_ctrl_options {
unsigned mask;
@@ -91,6 +98,8 @@ struct nvmf_ctrl_options {
bool discovery_nqn;
unsigned int kato;
struct nvmf_host *host;
+ int nr_reconnects;
+ int max_reconnects;
};
/*
@@ -133,5 +142,6 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
void nvmf_free_options(struct nvmf_ctrl_options *opts);
const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl);
int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
#endif /* _NVME_FABRICS_H */
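
Taken together with the fabrics.c hunk above, the reconnect budget works out as max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, reconnect_delay): with the defaults defined here (600 second loss timeout, 10 second reconnect delay) that is DIV_ROUND_UP(600, 10) = 60 attempts; a negative ctrl_loss_tmo maps to max_reconnects = -1, i.e. reconnect forever; and nvmf_should_reconnect() lets a transport keep rescheduling its reconnect work until nr_reconnects reaches that limit.
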
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 9690beb15e69..fc42172c796a 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -848,11 +848,12 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
/* validate the ACC response */
if (assoc_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
fcret = VERR_LSACC;
- if (assoc_acc->hdr.desc_list_len !=
+ else if (assoc_acc->hdr.desc_list_len !=
fcnvme_lsdesc_len(
sizeof(struct fcnvme_ls_cr_assoc_acc)))
fcret = VERR_CR_ASSOC_ACC_LEN;
- if (assoc_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+ else if (assoc_acc->hdr.rqst.desc_tag !=
+ cpu_to_be32(FCNVME_LSDESC_RQST))
fcret = VERR_LSDESC_RQST;
else if (assoc_acc->hdr.rqst.desc_len !=
fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)))
@@ -955,10 +956,10 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
/* validate the ACC response */
if (conn_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
fcret = VERR_LSACC;
- if (conn_acc->hdr.desc_list_len !=
+ else if (conn_acc->hdr.desc_list_len !=
fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc)))
fcret = VERR_CR_CONN_ACC_LEN;
- if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+ else if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
fcret = VERR_LSDESC_RQST;
else if (conn_acc->hdr.rqst.desc_len !=
fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)))
@@ -1146,7 +1147,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
struct nvme_fc_ctrl *ctrl = op->ctrl;
struct nvme_fc_queue *queue = op->queue;
struct nvme_completion *cqe = &op->rsp_iu.cqe;
- u16 status;
+ u16 status = NVME_SC_SUCCESS;
/*
* WARNING:
@@ -1182,8 +1183,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
if (atomic_read(&op->state) == FCPOP_STATE_ABORTED)
status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
- else
- status = freq->status;
+ else if (freq->status)
+ status = NVME_SC_FC_TRANSPORT_ERROR;
/*
* For the linux implementation, if we have an unsuccesful
@@ -1211,7 +1212,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
*/
if (freq->transferred_length !=
be32_to_cpu(op->cmd_iu.data_len)) {
- status = -EIO;
+ status = NVME_SC_FC_TRANSPORT_ERROR;
goto done;
}
op->nreq.result.u64 = 0;
@@ -1226,8 +1227,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
(freq->rcv_rsplen / 4) ||
be32_to_cpu(op->rsp_iu.xfrd_len) !=
freq->transferred_length ||
+ op->rsp_iu.status_code ||
op->rqno != le16_to_cpu(cqe->command_id))) {
- status = -EIO;
+ status = NVME_SC_FC_TRANSPORT_ERROR;
goto done;
}
op->nreq.result = cqe->result;
@@ -1235,7 +1237,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
break;
default:
- status = -EIO;
+ status = NVME_SC_FC_TRANSPORT_ERROR;
goto done;
}
@@ -1761,7 +1763,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
op->fcp_req.io_dir = io_dir;
op->fcp_req.transferred_length = 0;
op->fcp_req.rcv_rsplen = 0;
- op->fcp_req.status = 0;
+ op->fcp_req.status = NVME_SC_SUCCESS;
op->fcp_req.sqid = cpu_to_le16(queue->qnum);
/*
@@ -1923,32 +1925,18 @@ nvme_fc_complete_rq(struct request *rq)
{
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
struct nvme_fc_ctrl *ctrl = op->ctrl;
- int error = 0, state;
+ int state;
state = atomic_xchg(&op->state, FCPOP_STATE_IDLE);
nvme_cleanup_cmd(rq);
-
nvme_fc_unmap_data(ctrl, rq, op);
-
- if (unlikely(rq->errors)) {
- if (nvme_req_needs_retry(rq, rq->errors)) {
- nvme_requeue_req(rq);
- return;
- }
-
- if (blk_rq_is_passthrough(rq))
- error = rq->errors;
- else
- error = nvme_error_status(rq->errors);
- }
-
+ nvme_complete_rq(rq);
nvme_fc_ctrl_put(ctrl);
- blk_mq_end_request(rq, error);
}
-static struct blk_mq_ops nvme_fc_mq_ops = {
+static const struct blk_mq_ops nvme_fc_mq_ops = {
.queue_rq = nvme_fc_queue_rq,
.complete = nvme_fc_complete_rq,
.init_request = nvme_fc_init_request,
@@ -1959,7 +1947,7 @@ static struct blk_mq_ops nvme_fc_mq_ops = {
.timeout = nvme_fc_timeout,
};
-static struct blk_mq_ops nvme_fc_admin_mq_ops = {
+static const struct blk_mq_ops nvme_fc_admin_mq_ops = {
.queue_rq = nvme_fc_queue_rq,
.complete = nvme_fc_complete_rq,
.init_request = nvme_fc_init_admin_request,
@@ -2546,11 +2534,20 @@ static struct nvmf_transport_ops nvme_fc_transport = {
static int __init nvme_fc_init_module(void)
{
+ int ret;
+
nvme_fc_wq = create_workqueue("nvme_fc_wq");
if (!nvme_fc_wq)
return -ENOMEM;
- return nvmf_register_transport(&nvme_fc_transport);
+ ret = nvmf_register_transport(&nvme_fc_transport);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ destroy_workqueue(nvme_fc_wq);
+ return ret;
}
static void __exit nvme_fc_exit_module(void)
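
Three separate fixes in the fc.c hunks above: the LS ACC validation chains become if/else-if so a later check can no longer overwrite the error code recorded by an earlier one; FCP completion errors are now reported as NVMe status values (NVME_SC_FC_TRANSPORT_ERROR) rather than negative errnos, so they flow through the shared nvme_complete_rq() path; and module init destroys the freshly created workqueue if transport registration fails instead of leaking it. The shape of the chained validation (acc and expected_len stand in for the assoc_acc/conn_acc fields above):

	if (acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
		fcret = VERR_LSACC;
	else if (acc->hdr.desc_list_len != expected_len)
		fcret = VERR_CR_ASSOC_ACC_LEN;	/* only reached if the first check passed */
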
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2aa20e3e5675..9eecb67177df 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -43,8 +43,6 @@ extern unsigned char shutdown_timeout;
#define NVME_DEFAULT_KATO 5
#define NVME_KATO_GRACE 10
-extern unsigned int nvme_max_retries;
-
enum {
NVME_NS_LBA = 0,
NVME_NS_LIGHTNVM = 1,
@@ -92,6 +90,7 @@ enum nvme_quirks {
struct nvme_request {
struct nvme_command *cmd;
union nvme_result result;
+ u8 retries;
};
static inline struct nvme_request *nvme_req(struct request *req)
@@ -261,13 +260,7 @@ static inline int nvme_error_status(u16 status)
}
}
-static inline bool nvme_req_needs_retry(struct request *req, u16 status)
-{
- return !(status & NVME_SC_DNR || blk_noretry_request(req)) &&
- (jiffies - req->start_time) < req->timeout &&
- req->retries < nvme_max_retries;
-}
-
+void nvme_complete_rq(struct request *req);
void nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
@@ -302,7 +295,6 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl);
#define NVME_QID_ANY -1
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags, int qid);
-void nvme_requeue_req(struct request *req);
int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd);
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 26a5fd05fe88..3818ab15a26e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -326,10 +326,6 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
iod->nents = 0;
iod->length = size;
- if (!(rq->rq_flags & RQF_DONTPREP)) {
- rq->retries = 0;
- rq->rq_flags |= RQF_DONTPREP;
- }
return BLK_MQ_RQ_QUEUE_OK;
}
@@ -628,34 +624,12 @@ out_free_cmd:
return ret;
}
-static void nvme_complete_rq(struct request *req)
+static void nvme_pci_complete_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_dev *dev = iod->nvmeq->dev;
- int error = 0;
-
- nvme_unmap_data(dev, req);
-
- if (unlikely(req->errors)) {
- if (nvme_req_needs_retry(req, req->errors)) {
- req->retries++;
- nvme_requeue_req(req);
- return;
- }
-
- if (blk_rq_is_passthrough(req))
- error = req->errors;
- else
- error = nvme_error_status(req->errors);
- }
-
- if (unlikely(iod->aborted)) {
- dev_warn(dev->ctrl.device,
- "completing aborted command with status: %04x\n",
- req->errors);
- }
- blk_mq_end_request(req, error);
+ nvme_unmap_data(iod->nvmeq->dev, req);
+ nvme_complete_rq(req);
}
/* We read the CQE phase first to check if the rest of the entry is valid */
@@ -1129,18 +1103,18 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
return result;
}
-static struct blk_mq_ops nvme_mq_admin_ops = {
+static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
- .complete = nvme_complete_rq,
+ .complete = nvme_pci_complete_rq,
.init_hctx = nvme_admin_init_hctx,
.exit_hctx = nvme_admin_exit_hctx,
.init_request = nvme_admin_init_request,
.timeout = nvme_timeout,
};
-static struct blk_mq_ops nvme_mq_ops = {
+static const struct blk_mq_ops nvme_mq_ops = {
.queue_rq = nvme_queue_rq,
- .complete = nvme_complete_rq,
+ .complete = nvme_pci_complete_rq,
.init_hctx = nvme_init_hctx,
.init_request = nvme_init_request,
.map_queues = nvme_pci_map_queues,
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 47a479f26e5d..4aae363943e3 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -34,7 +34,7 @@
#include "fabrics.h"
-#define NVME_RDMA_CONNECT_TIMEOUT_MS 1000 /* 1 second */
+#define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 seconds */
#define NVME_RDMA_MAX_SEGMENT_SIZE 0xffffff /* 24-bit SGL field */
@@ -118,7 +118,6 @@ struct nvme_rdma_ctrl {
struct nvme_rdma_qe async_event_sqe;
- int reconnect_delay;
struct delayed_work reconnect_work;
struct list_head list;
@@ -129,14 +128,8 @@ struct nvme_rdma_ctrl {
u64 cap;
u32 max_fr_pages;
- union {
- struct sockaddr addr;
- struct sockaddr_in addr_in;
- };
- union {
- struct sockaddr src_addr;
- struct sockaddr_in src_addr_in;
- };
+ struct sockaddr_storage addr;
+ struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
};
@@ -569,11 +562,12 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
return PTR_ERR(queue->cm_id);
}
- queue->cm_error = -ETIMEDOUT;
if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
- src_addr = &ctrl->src_addr;
+ src_addr = (struct sockaddr *)&ctrl->src_addr;
- ret = rdma_resolve_addr(queue->cm_id, src_addr, &ctrl->addr,
+ queue->cm_error = -ETIMEDOUT;
+ ret = rdma_resolve_addr(queue->cm_id, src_addr,
+ (struct sockaddr *)&ctrl->addr,
NVME_RDMA_CONNECT_TIMEOUT_MS);
if (ret) {
dev_info(ctrl->ctrl.device,
@@ -712,6 +706,26 @@ free_ctrl:
kfree(ctrl);
}
+static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
+{
+ /* If we are resetting/deleting then do nothing */
+ if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) {
+ WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
+ ctrl->ctrl.state == NVME_CTRL_LIVE);
+ return;
+ }
+
+ if (nvmf_should_reconnect(&ctrl->ctrl)) {
+ dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n",
+ ctrl->ctrl.opts->reconnect_delay);
+ queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
+ ctrl->ctrl.opts->reconnect_delay * HZ);
+ } else {
+ dev_info(ctrl->ctrl.device, "Removing controller...\n");
+ queue_work(nvme_rdma_wq, &ctrl->delete_work);
+ }
+}
+
static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
{
struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
@@ -719,6 +733,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
bool changed;
int ret;
+ ++ctrl->ctrl.opts->nr_reconnects;
+
if (ctrl->queue_count > 1) {
nvme_rdma_free_io_queues(ctrl);
@@ -763,6 +779,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
+ ctrl->ctrl.opts->nr_reconnects = 0;
if (ctrl->queue_count > 1) {
nvme_start_queues(&ctrl->ctrl);
@@ -777,13 +794,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
stop_admin_q:
blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
requeue:
- /* Make sure we are not resetting/deleting */
- if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
- dev_info(ctrl->ctrl.device,
- "Failed reconnect attempt, requeueing...\n");
- queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
- ctrl->reconnect_delay * HZ);
- }
+ dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
+ ctrl->ctrl.opts->nr_reconnects);
+ nvme_rdma_reconnect_or_remove(ctrl);
}
static void nvme_rdma_error_recovery_work(struct work_struct *work)
@@ -810,11 +823,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl);
- dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n",
- ctrl->reconnect_delay);
-
- queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
- ctrl->reconnect_delay * HZ);
+ nvme_rdma_reconnect_or_remove(ctrl);
}
static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
@@ -1509,27 +1518,12 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
static void nvme_rdma_complete_rq(struct request *rq)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
- struct nvme_rdma_queue *queue = req->queue;
- int error = 0;
-
- nvme_rdma_unmap_data(queue, rq);
- if (unlikely(rq->errors)) {
- if (nvme_req_needs_retry(rq, rq->errors)) {
- nvme_requeue_req(rq);
- return;
- }
-
- if (blk_rq_is_passthrough(rq))
- error = rq->errors;
- else
- error = nvme_error_status(rq->errors);
- }
-
- blk_mq_end_request(rq, error);
+ nvme_rdma_unmap_data(req->queue, rq);
+ nvme_complete_rq(rq);
}
-static struct blk_mq_ops nvme_rdma_mq_ops = {
+static const struct blk_mq_ops nvme_rdma_mq_ops = {
.queue_rq = nvme_rdma_queue_rq,
.complete = nvme_rdma_complete_rq,
.init_request = nvme_rdma_init_request,
@@ -1540,7 +1534,7 @@ static struct blk_mq_ops nvme_rdma_mq_ops = {
.timeout = nvme_rdma_timeout,
};
-static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
+static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
.queue_rq = nvme_rdma_queue_rq,
.complete = nvme_rdma_complete_rq,
.init_request = nvme_rdma_init_admin_request,
@@ -1857,27 +1851,13 @@ out_free_io_queues:
return ret;
}
-static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p)
-{
- u8 *addr = (u8 *)&in_addr->sin_addr.s_addr;
- size_t buflen = strlen(p);
-
- /* XXX: handle IPv6 addresses */
-
- if (buflen > INET_ADDRSTRLEN)
- return -EINVAL;
- if (in4_pton(p, buflen, addr, '\0', NULL) == 0)
- return -EINVAL;
- in_addr->sin_family = AF_INET;
- return 0;
-}
-
static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
struct nvme_rdma_ctrl *ctrl;
int ret;
bool changed;
+ char *port;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
@@ -1885,40 +1865,33 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);
- ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr);
+ if (opts->mask & NVMF_OPT_TRSVCID)
+ port = opts->trsvcid;
+ else
+ port = __stringify(NVME_RDMA_IP_PORT);
+
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+ opts->traddr, port, &ctrl->addr);
if (ret) {
- pr_err("malformed IP address passed: %s\n", opts->traddr);
+ pr_err("malformed address passed: %s:%s\n", opts->traddr, port);
goto out_free_ctrl;
}
if (opts->mask & NVMF_OPT_HOST_TRADDR) {
- ret = nvme_rdma_parse_ipaddr(&ctrl->src_addr_in,
- opts->host_traddr);
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+ opts->host_traddr, NULL, &ctrl->src_addr);
if (ret) {
- pr_err("malformed src IP address passed: %s\n",
+ pr_err("malformed src address passed: %s\n",
opts->host_traddr);
goto out_free_ctrl;
}
}
- if (opts->mask & NVMF_OPT_TRSVCID) {
- u16 port;
-
- ret = kstrtou16(opts->trsvcid, 0, &port);
- if (ret)
- goto out_free_ctrl;
-
- ctrl->addr_in.sin_port = cpu_to_be16(port);
- } else {
- ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT);
- }
-
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
0 /* no quirks, we're perfect! */);
if (ret)
goto out_free_ctrl;
- ctrl->reconnect_delay = opts->reconnect_delay;
INIT_DELAYED_WORK(&ctrl->reconnect_work,
nvme_rdma_reconnect_ctrl_work);
INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
@@ -1977,7 +1950,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
- dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
+ dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
kref_get(&ctrl->ctrl.kref);
@@ -2013,7 +1986,7 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
.name = "rdma",
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
- NVMF_OPT_HOST_TRADDR,
+ NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
.create_ctrl = nvme_rdma_create_ctrl,
};
@@ -2055,12 +2028,20 @@ static int __init nvme_rdma_init_module(void)
return -ENOMEM;
ret = ib_register_client(&nvme_rdma_ib_client);
- if (ret) {
- destroy_workqueue(nvme_rdma_wq);
- return ret;
- }
+ if (ret)
+ goto err_destroy_wq;
+
+ ret = nvmf_register_transport(&nvme_rdma_transport);
+ if (ret)
+ goto err_unreg_client;
+
+ return 0;
- return nvmf_register_transport(&nvme_rdma_transport);
+err_unreg_client:
+ ib_unregister_client(&nvme_rdma_ib_client);
+err_destroy_wq:
+ destroy_workqueue(nvme_rdma_wq);
+ return ret;
}
static void __exit nvme_rdma_cleanup_module(void)
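
Editor's sketch: the create_ctrl hunk above replaces the removed IPv4-only parser with the generic inet_pton_with_scope() helper added later in this series (net/core/utils.c), defaulting the port when no trsvcid was supplied. A minimal, hedged illustration of that usage follows; MY_DEFAULT_PORT is only a placeholder standing in for the driver's own NVME_RDMA_IP_PORT constant.

    #include <linux/inet.h>         /* inet_pton_with_scope(); after this series
                                     * it also pulls in socket/net-namespace bits */
    #include <linux/stringify.h>

    /* Placeholder for a transport default such as NVME_RDMA_IP_PORT. */
    #define MY_DEFAULT_PORT 4420

    static int resolve_traddr(const char *traddr, const char *trsvcid,
                              struct sockaddr_storage *addr)
    {
            /* Fall back to the transport default when no service id is set. */
            const char *port = trsvcid ? trsvcid : __stringify(MY_DEFAULT_PORT);

            /* AF_UNSPEC accepts either an IPv4 or an IPv6 address string. */
            return inet_pton_with_scope(&init_net, AF_UNSPEC, traddr, port, addr);
    }

On failure the caller simply logs the malformed address string, as the hunk above does.
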
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 76450b0c55f1..ff1f97006322 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -121,7 +121,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
}
switch (req->cmd->get_log_page.lid) {
- case 0x01:
+ case NVME_LOG_ERROR:
/*
* We currently never set the More bit in the status field,
* so all error log entries are invalid and can be zeroed out.
@@ -129,7 +129,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
* mandatory log page.
*/
break;
- case 0x02:
+ case NVME_LOG_SMART:
/*
* XXX: fill out actual smart log
*
@@ -149,7 +149,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
goto err;
}
break;
- case 0x03:
+ case NVME_LOG_FW_SLOT:
/*
* We only support a single firmware slot which always is
* active, so we can zero out the whole firmware slot log and
@@ -480,31 +480,25 @@ static void nvmet_execute_keep_alive(struct nvmet_req *req)
nvmet_req_complete(req, 0);
}
-int nvmet_parse_admin_cmd(struct nvmet_req *req)
+u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
+ u16 ret;
req->ns = NULL;
- if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
- pr_err("nvmet: got admin cmd %d while CC.EN == 0\n",
- cmd->common.opcode);
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
- }
- if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
- pr_err("nvmet: got admin cmd %d while CSTS.RDY == 0\n",
- cmd->common.opcode);
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
- }
+ ret = nvmet_check_ctrl_status(req, cmd);
+ if (unlikely(ret))
+ return ret;
switch (cmd->common.opcode) {
case nvme_admin_get_log_page:
req->data_len = nvmet_get_log_page_len(cmd);
switch (cmd->get_log_page.lid) {
- case 0x01:
- case 0x02:
- case 0x03:
+ case NVME_LOG_ERROR:
+ case NVME_LOG_SMART:
+ case NVME_LOG_FW_SLOT:
req->execute = nvmet_execute_get_log_page;
return 0;
}
@@ -545,6 +539,7 @@ int nvmet_parse_admin_cmd(struct nvmet_req *req)
return 0;
}
- pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+ pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
+ req->sq->qid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 798653b329b2..cf90713043da 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -273,8 +273,8 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE,
NULL);
if (IS_ERR(ns->bdev)) {
- pr_err("nvmet: failed to open block device %s: (%ld)\n",
- ns->device_path, PTR_ERR(ns->bdev));
+ pr_err("failed to open block device %s: (%ld)\n",
+ ns->device_path, PTR_ERR(ns->bdev));
ret = PTR_ERR(ns->bdev);
ns->bdev = NULL;
goto out_unlock;
@@ -661,6 +661,23 @@ out:
return status;
}
+u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
+{
+ if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
+ pr_err("got io cmd %d while CC.EN == 0 on qid = %d\n",
+ cmd->common.opcode, req->sq->qid);
+ return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+ }
+
+ if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
+ pr_err("got io cmd %d while CSTS.RDY == 0 on qid = %d\n",
+ cmd->common.opcode, req->sq->qid);
+ req->ns = NULL;
+ return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+ }
+ return 0;
+}
+
static bool __nvmet_host_allowed(struct nvmet_subsys *subsys,
const char *hostnqn)
{
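
Editor's sketch: nvmet_check_ctrl_status() centralizes the CC.EN/CSTS.RDY checks that the admin and I/O parsers used to open-code. The converted call sites in this diff (admin-cmd.c above, io-cmd.c below) follow roughly the shape sketched here; this is a condensation assuming the nvmet headers, not the full parser. Note, as a small nit, that the helper's log text says "io cmd" even when it is reached from the admin parser.

    static u16 example_parse_cmd(struct nvmet_req *req)
    {
            struct nvme_command *cmd = req->cmd;
            u16 status;

            req->ns = NULL;

            /* Fail fast while the controller is disabled or not yet ready. */
            status = nvmet_check_ctrl_status(req, cmd);
            if (unlikely(status))
                    return status;

            /* ... opcode-specific dispatch, as in nvmet_parse_io_cmd() ... */
            return 0;
    }
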
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index af8aabf05335..1aaf597e81fc 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -159,15 +159,15 @@ out:
nvmet_req_complete(req, status);
}
-int nvmet_parse_discovery_cmd(struct nvmet_req *req)
+u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
req->ns = NULL;
if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
- pr_err("nvmet: got cmd %d while not ready\n",
- cmd->common.opcode);
+ pr_err("got cmd %d while not ready\n",
+ cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
@@ -180,8 +180,8 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req)
req->execute = nvmet_execute_get_disc_log_page;
return 0;
default:
- pr_err("nvmet: unsupported get_log_page lid %d\n",
- cmd->get_log_page.lid);
+ pr_err("unsupported get_log_page lid %d\n",
+ cmd->get_log_page.lid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
case nvme_admin_identify:
@@ -192,17 +192,16 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req)
nvmet_execute_identify_disc_ctrl;
return 0;
default:
- pr_err("nvmet: unsupported identify cns %d\n",
- cmd->identify.cns);
+ pr_err("unsupported identify cns %d\n",
+ cmd->identify.cns);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
default:
- pr_err("nvmet: unsupported cmd %d\n",
- cmd->common.opcode);
+ pr_err("unsupported cmd %d\n", cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
- pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+ pr_err("unhandled cmd %d\n", cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
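
Editor's note: these hunks drop the hand-written "nvmet: " prefix from the log messages, presumably relying on the usual pr_fmt() prefixing (the loop.c hunk further down shows that idiom in this subsystem). For reference, the idiom looks like this; it is not something this patch adds:

    /* Must be defined before the first printk-related include in the file. */
    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

    #include <linux/printk.h>

    /* pr_err("unsupported cmd %d\n", op) now prints
     * "<modname>: unsupported cmd <op>" without an explicit literal prefix. */
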
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 8bd022af3df6..2a3c15b57f6e 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -73,7 +73,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
nvmet_req_complete(req, status);
}
-int nvmet_parse_fabrics_cmd(struct nvmet_req *req)
+u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
@@ -214,7 +214,7 @@ out_ctrl_put:
goto out;
}
-int nvmet_parse_connect_cmd(struct nvmet_req *req)
+u16 nvmet_parse_connect_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 8f483ee7868c..2c0709f0a7d3 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1189,8 +1189,8 @@ nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport,
validation_errors[ret]);
iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
- ELS_RJT_LOGIC,
- ELS_EXPL_NONE, 0);
+ FCNVME_RJT_RC_LOGIC,
+ FCNVME_RJT_EXP_NONE, 0);
return;
}
@@ -1281,8 +1281,9 @@ nvmet_fc_ls_create_connection(struct nvmet_fc_tgtport *tgtport,
iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
(ret == VERR_NO_ASSOC) ?
- ELS_RJT_PROT : ELS_RJT_LOGIC,
- ELS_EXPL_NONE, 0);
+ FCNVME_RJT_RC_INV_ASSOC :
+ FCNVME_RJT_RC_LOGIC,
+ FCNVME_RJT_EXP_NONE, 0);
return;
}
@@ -1369,8 +1370,12 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport,
validation_errors[ret]);
iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
- (ret == 8) ? ELS_RJT_PROT : ELS_RJT_LOGIC,
- ELS_EXPL_NONE, 0);
+ (ret == VERR_NO_ASSOC) ?
+ FCNVME_RJT_RC_INV_ASSOC :
+ (ret == VERR_NO_CONN) ?
+ FCNVME_RJT_RC_INV_CONN :
+ FCNVME_RJT_RC_LOGIC,
+ FCNVME_RJT_EXP_NONE, 0);
return;
}
@@ -1479,7 +1484,7 @@ nvmet_fc_handle_ls_rqst(struct nvmet_fc_tgtport *tgtport,
default:
iod->lsreq->rsplen = nvmet_fc_format_rjt(iod->rspbuf,
NVME_FC_MAX_LS_BUFFER_SIZE, w0->ls_cmd,
- ELS_RJT_INVAL, ELS_EXPL_NONE, 0);
+ FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0);
}
nvmet_fc_xmt_ls_rsp(tgtport, iod);
@@ -1619,6 +1624,8 @@ nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count)
__free_page(sg_page(sg));
kfree(fod->data_sg);
+ fod->data_sg = NULL;
+ fod->data_sg_cnt = 0;
}
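
Editor's sketch: nvmet_fc_free_tgt_pgs() now clears data_sg and data_sg_cnt after freeing, the standard way to make a teardown helper safe against being reached twice (for example once from an error path and again on normal completion). A generic illustration of the idiom, with the structure and field names invented for this sketch:

    #include <linux/scatterlist.h>
    #include <linux/slab.h>

    struct my_iod {
            struct scatterlist *sg;
            int sg_cnt;
    };

    static void my_free_data_pages(struct my_iod *iod)
    {
            struct scatterlist *sg;
            int i;

            if (!iod->sg_cnt)
                    return;                 /* nothing allocated, or already freed */

            for_each_sg(iod->sg, sg, iod->sg_cnt, i)
                    __free_page(sg_page(sg));
            kfree(iod->sg);

            /* Reset so a repeated call is a harmless no-op rather than a
             * double free. */
            iod->sg = NULL;
            iod->sg_cnt = 0;
    }
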
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 6b0baa9caab9..a4b455156bb5 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -196,26 +196,19 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
}
}
-int nvmet_parse_io_cmd(struct nvmet_req *req)
+u16 nvmet_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
+ u16 ret;
- if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
- pr_err("nvmet: got io cmd %d while CC.EN == 0\n",
- cmd->common.opcode);
+ ret = nvmet_check_ctrl_status(req, cmd);
+ if (unlikely(ret)) {
req->ns = NULL;
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
- }
-
- if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
- pr_err("nvmet: got io cmd %d while CSTS.RDY == 0\n",
- cmd->common.opcode);
- req->ns = NULL;
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+ return ret;
}
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
- if (!req->ns)
+ if (unlikely(!req->ns))
return NVME_SC_INVALID_NS | NVME_SC_DNR;
switch (cmd->common.opcode) {
@@ -237,7 +230,8 @@ int nvmet_parse_io_cmd(struct nvmet_req *req)
req->execute = nvmet_execute_write_zeroes;
return 0;
default:
- pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+ pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
+ req->sq->qid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
}
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 22f7bc6bac7f..33b431e4eec3 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -13,12 +13,10 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/scatterlist.h>
-#include <linux/delay.h>
#include <linux/blk-mq.h>
#include <linux/nvme.h>
#include <linux/module.h>
#include <linux/parser.h>
-#include <linux/t10-pi.h>
#include "nvmet.h"
#include "../host/nvme.h"
#include "../host/fabrics.h"
@@ -93,31 +91,26 @@ static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue)
static void nvme_loop_complete_rq(struct request *req)
{
struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
- int error = 0;
nvme_cleanup_cmd(req);
sg_free_table_chained(&iod->sg_table, true);
+ nvme_complete_rq(req);
+}
- if (unlikely(req->errors)) {
- if (nvme_req_needs_retry(req, req->errors)) {
- nvme_requeue_req(req);
- return;
- }
-
- if (blk_rq_is_passthrough(req))
- error = req->errors;
- else
- error = nvme_error_status(req->errors);
- }
+static struct blk_mq_tags *nvme_loop_tagset(struct nvme_loop_queue *queue)
+{
+ u32 queue_idx = nvme_loop_queue_idx(queue);
- blk_mq_end_request(req, error);
+ if (queue_idx == 0)
+ return queue->ctrl->admin_tag_set.tags[queue_idx];
+ return queue->ctrl->tag_set.tags[queue_idx - 1];
}
static void nvme_loop_queue_response(struct nvmet_req *req)
{
- struct nvme_loop_iod *iod =
- container_of(req, struct nvme_loop_iod, req);
- struct nvme_completion *cqe = &iod->rsp;
+ struct nvme_loop_queue *queue =
+ container_of(req->sq, struct nvme_loop_queue, nvme_sq);
+ struct nvme_completion *cqe = req->rsp;
/*
* AEN requests are special as they don't time out and can
@@ -125,13 +118,23 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
- if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 &&
+ if (unlikely(nvme_loop_queue_idx(queue) == 0 &&
cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) {
- nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe->status,
+ nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
} else {
- struct request *rq = blk_mq_rq_from_pdu(iod);
+ struct request *rq;
+ struct nvme_loop_iod *iod;
+
+ rq = blk_mq_tag_to_rq(nvme_loop_tagset(queue), cqe->command_id);
+ if (!rq) {
+ dev_err(queue->ctrl->ctrl.device,
+ "tag 0x%x on queue %d not found\n",
+ cqe->command_id, nvme_loop_queue_idx(queue));
+ return;
+ }
+ iod = blk_mq_rq_to_pdu(rq);
iod->nvme_req.result = cqe->result;
blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1);
}
@@ -268,7 +271,7 @@ static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
return 0;
}
-static struct blk_mq_ops nvme_loop_mq_ops = {
+static const struct blk_mq_ops nvme_loop_mq_ops = {
.queue_rq = nvme_loop_queue_rq,
.complete = nvme_loop_complete_rq,
.init_request = nvme_loop_init_request,
@@ -276,7 +279,7 @@ static struct blk_mq_ops nvme_loop_mq_ops = {
.timeout = nvme_loop_timeout,
};
-static struct blk_mq_ops nvme_loop_admin_mq_ops = {
+static const struct blk_mq_ops nvme_loop_admin_mq_ops = {
.queue_rq = nvme_loop_queue_rq,
.complete = nvme_loop_complete_rq,
.init_request = nvme_loop_init_admin_request,
@@ -349,6 +352,19 @@ out_destroy_queues:
return ret;
}
+static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl)
+{
+ int i, ret;
+
+ for (i = 1; i < ctrl->queue_count; i++) {
+ ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
{
int error;
@@ -490,7 +506,7 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
struct nvme_loop_ctrl *ctrl = container_of(work,
struct nvme_loop_ctrl, reset_work);
bool changed;
- int i, ret;
+ int ret;
nvme_loop_shutdown_ctrl(ctrl);
@@ -502,11 +518,9 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
if (ret)
goto out_destroy_admin;
- for (i = 1; i < ctrl->queue_count; i++) {
- ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
- if (ret)
- goto out_destroy_io;
- }
+ ret = nvme_loop_connect_io_queues(ctrl);
+ if (ret)
+ goto out_destroy_io;
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
@@ -559,7 +573,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
{
- int ret, i;
+ int ret;
ret = nvme_loop_init_io_queues(ctrl);
if (ret)
@@ -588,11 +602,9 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
goto out_free_tagset;
}
- for (i = 1; i < ctrl->queue_count; i++) {
- ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
- if (ret)
- goto out_cleanup_connect_q;
- }
+ ret = nvme_loop_connect_io_queues(ctrl);
+ if (ret)
+ goto out_cleanup_connect_q;
return 0;
@@ -736,7 +748,12 @@ static int __init nvme_loop_init_module(void)
ret = nvmet_register_transport(&nvme_loop_ops);
if (ret)
return ret;
- return nvmf_register_transport(&nvme_loop_transport);
+
+ ret = nvmf_register_transport(&nvme_loop_transport);
+ if (ret)
+ nvmet_unregister_transport(&nvme_loop_ops);
+
+ return ret;
}
static void __exit nvme_loop_cleanup_module(void)
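
Editor's sketch: both module-init fixes in this series (nvme_rdma_init_module earlier and nvme_loop_init_module here) apply the same rule: every registration that succeeded before a later failure must be undone, in reverse order, before returning the error. The skeleton of that pattern, with placeholder hooks standing in for the real register/unregister calls:

    #include <linux/module.h>

    /* Placeholders standing in for e.g. nvmet_register_transport() /
     * nvmf_register_transport(); they are not real kernel APIs. */
    static int register_a(void) { return 0; }
    static void unregister_a(void) { }
    static int register_b(void) { return 0; }

    static int __init example_init(void)
    {
            int ret;

            ret = register_a();
            if (ret)
                    return ret;

            ret = register_b();
            if (ret)
                    goto err_unreg_a;       /* undo the earlier registration */

            return 0;

    err_unreg_a:
            unregister_a();
            return ret;
    }
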
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index f7ff15f17ca9..7cb77ba5993b 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -253,11 +253,11 @@ struct nvmet_async_event {
u8 log_page;
};
-int nvmet_parse_connect_cmd(struct nvmet_req *req);
-int nvmet_parse_io_cmd(struct nvmet_req *req);
-int nvmet_parse_admin_cmd(struct nvmet_req *req);
-int nvmet_parse_discovery_cmd(struct nvmet_req *req);
-int nvmet_parse_fabrics_cmd(struct nvmet_req *req);
+u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
+u16 nvmet_parse_io_cmd(struct nvmet_req *req);
+u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
+u16 nvmet_parse_discovery_cmd(struct nvmet_req *req);
+u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops);
@@ -278,6 +278,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
struct nvmet_req *req, struct nvmet_ctrl **ret);
void nvmet_ctrl_put(struct nvmet_ctrl *ctrl);
+u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd);
struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
enum nvme_subsys_type type);
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index ecc4fe862561..99c69018a35f 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1199,6 +1199,11 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
}
queue->port = cm_id->context;
+ if (queue->host_qid == 0) {
+ /* Let inflight controller teardown complete */
+ flush_scheduled_work();
+ }
+
ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
if (ret)
goto release_queue;
@@ -1427,12 +1432,16 @@ restart:
static int nvmet_rdma_add_port(struct nvmet_port *port)
{
struct rdma_cm_id *cm_id;
- struct sockaddr_in addr_in;
- u16 port_in;
+ struct sockaddr_storage addr = { };
+ __kernel_sa_family_t af;
int ret;
switch (port->disc_addr.adrfam) {
case NVMF_ADDR_FAMILY_IP4:
+ af = AF_INET;
+ break;
+ case NVMF_ADDR_FAMILY_IP6:
+ af = AF_INET6;
break;
default:
pr_err("address family %d not supported\n",
@@ -1440,13 +1449,13 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
return -EINVAL;
}
- ret = kstrtou16(port->disc_addr.trsvcid, 0, &port_in);
- if (ret)
+ ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
+ port->disc_addr.trsvcid, &addr);
+ if (ret) {
+ pr_err("malformed ip/port passed: %s:%s\n",
+ port->disc_addr.traddr, port->disc_addr.trsvcid);
return ret;
-
- addr_in.sin_family = AF_INET;
- addr_in.sin_addr.s_addr = in_aton(port->disc_addr.traddr);
- addr_in.sin_port = htons(port_in);
+ }
cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
RDMA_PS_TCP, IB_QPT_RC);
@@ -1455,20 +1464,32 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
return PTR_ERR(cm_id);
}
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr_in);
+ /*
+ * Allow both IPv4 and IPv6 sockets to bind a single port
+ * at the same time.
+ */
+ ret = rdma_set_afonly(cm_id, 1);
+ if (ret) {
+ pr_err("rdma_set_afonly failed (%d)\n", ret);
+ goto out_destroy_id;
+ }
+
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
if (ret) {
- pr_err("binding CM ID to %pISpc failed (%d)\n", &addr_in, ret);
+ pr_err("binding CM ID to %pISpcs failed (%d)\n",
+ (struct sockaddr *)&addr, ret);
goto out_destroy_id;
}
ret = rdma_listen(cm_id, 128);
if (ret) {
- pr_err("listening to %pISpc failed (%d)\n", &addr_in, ret);
+ pr_err("listening to %pISpcs failed (%d)\n",
+ (struct sockaddr *)&addr, ret);
goto out_destroy_id;
}
- pr_info("enabling port %d (%pISpc)\n",
- le16_to_cpu(port->disc_addr.portid), &addr_in);
+ pr_info("enabling port %d (%pISpcs)\n",
+ le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
port->priv = cm_id;
return 0;
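
Editor's sketch: nvmet_rdma_add_port() now resolves the configured traddr/trsvcid into a sockaddr_storage and marks the CM id as address-family-only, so an IPv4 and an IPv6 listener can share a port number. A trimmed illustration of the listener setup, using only the RDMA CM calls that appear in the hunk above plus rdma_destroy_id() for cleanup; nvmet_rdma_cm_handler is the driver's existing event handler:

    #include <rdma/rdma_cm.h>

    static struct rdma_cm_id *example_rdma_listen(struct sockaddr_storage *addr,
                                                  void *context)
    {
            struct rdma_cm_id *cm_id;
            int ret;

            cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, context,
                                   RDMA_PS_TCP, IB_QPT_RC);
            if (IS_ERR(cm_id))
                    return cm_id;

            /* Restrict the listener to its own address family so IPv4 and
             * IPv6 listeners may coexist on the same port. */
            ret = rdma_set_afonly(cm_id, 1);
            if (ret)
                    goto out_destroy;

            ret = rdma_bind_addr(cm_id, (struct sockaddr *)addr);
            if (ret)
                    goto out_destroy;

            ret = rdma_listen(cm_id, 128);
            if (ret)
                    goto out_destroy;

            return cm_id;

    out_destroy:
            rdma_destroy_id(cm_id);
            return ERR_PTR(ret);
    }
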
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index 6ff61dad5e21..62fed9dc893e 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -183,11 +183,33 @@ static void jsfd_read(char *buf, unsigned long p, size_t togo) {
}
}
-static void jsfd_do_request(struct request_queue *q)
+static int jsfd_queue;
+
+static struct request *jsfd_next_request(void)
+{
+ struct request_queue *q;
+ struct request *rq;
+ int old_pos = jsfd_queue;
+
+ do {
+ q = jsfd_disk[jsfd_queue]->queue;
+ if (++jsfd_queue == JSF_MAX)
+ jsfd_queue = 0;
+ if (q) {
+ rq = blk_fetch_request(q);
+ if (rq)
+ return rq;
+ }
+ } while (jsfd_queue != old_pos);
+
+ return NULL;
+}
+
+static void jsfd_request(void)
{
struct request *req;
- req = blk_fetch_request(q);
+ req = jsfd_next_request();
while (req) {
struct jsfd_part *jdp = req->rq_disk->private_data;
unsigned long offset = blk_rq_pos(req) << 9;
@@ -211,10 +233,15 @@ static void jsfd_do_request(struct request_queue *q)
err = 0;
end:
if (!__blk_end_request_cur(req, err))
- req = blk_fetch_request(q);
+ req = jsfd_next_request();
}
}
+static void jsfd_do_request(struct request_queue *q)
+{
+ jsfd_request();
+}
+
/*
* The memory devices use the full 32/64 bits of the offset, and so we cannot
* check against negative addresses: they are ok. The return value is weird,
@@ -544,8 +571,6 @@ static int jsflash_init(void)
return 0;
}
-static struct request_queue *jsf_queue;
-
static int jsfd_init(void)
{
static DEFINE_SPINLOCK(lock);
@@ -562,6 +587,11 @@ static int jsfd_init(void)
struct gendisk *disk = alloc_disk(1);
if (!disk)
goto out;
+ disk->queue = blk_init_queue(jsfd_do_request, &lock);
+ if (!disk->queue) {
+ put_disk(disk);
+ goto out;
+ }
jsfd_disk[i] = disk;
}
@@ -570,13 +600,6 @@ static int jsfd_init(void)
goto out;
}
- jsf_queue = blk_init_queue(jsfd_do_request, &lock);
- if (!jsf_queue) {
- err = -ENOMEM;
- unregister_blkdev(JSFD_MAJOR, "jsfd");
- goto out;
- }
-
for (i = 0; i < JSF_MAX; i++) {
struct gendisk *disk = jsfd_disk[i];
if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
@@ -589,7 +612,6 @@ static int jsfd_init(void)
disk->fops = &jsfd_fops;
set_capacity(disk, jdp->dsize >> 9);
disk->private_data = jdp;
- disk->queue = jsf_queue;
add_disk(disk);
set_disk_ro(disk, 1);
}
@@ -619,6 +641,7 @@ static void __exit jsflash_cleanup_module(void)
for (i = 0; i < JSF_MAX; i++) {
if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
del_gendisk(jsfd_disk[i]);
+ blk_cleanup_queue(jsfd_disk[i]->queue);
put_disk(jsfd_disk[i]);
}
if (jsf0.busy)
@@ -628,7 +651,6 @@ static void __exit jsflash_cleanup_module(void)
misc_deregister(&jsf_dev);
unregister_blkdev(JSFD_MAJOR, "jsfd");
- blk_cleanup_queue(jsf_queue);
}
module_init(jsflash_init_module);
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 6903f03c88af..9d0727b2bdec 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1602,7 +1602,7 @@ static int _init_blk_request(struct osd_request *or,
req->rq_flags |= RQF_QUIET;
req->timeout = or->timeout;
- req->retries = or->retries;
+ scsi_req(req)->retries = or->retries;
if (has_out) {
or->out.req = req;
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index c47f4b349bac..41bc1d64bf86 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -414,7 +414,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
memcpy(rq->cmd, cmd, rq->cmd_len);
req->timeout = timeout;
- req->retries = retries;
+ rq->retries = retries;
req->end_io_data = SRpnt;
blk_execute_rq_nowait(req->q, NULL, req, 1, osst_end_async);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index f2cafae150bc..2db412dd4b44 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1988,7 +1988,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
req->rq_flags |= RQF_QUIET;
req->timeout = 10 * HZ;
- req->retries = 5;
+ rq->retries = 5;
blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done);
}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e5a2d590a104..7bc4513bf4e4 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -256,7 +256,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
rq->cmd_len = COMMAND_SIZE(cmd[0]);
memcpy(rq->cmd, cmd, rq->cmd_len);
- req->retries = retries;
+ rq->retries = retries;
req->timeout = timeout;
req->cmd_flags |= flags;
req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT;
@@ -1177,7 +1177,7 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
cmd->cmd_len = scsi_req(req)->cmd_len;
cmd->cmnd = scsi_req(req)->cmd;
cmd->transfersize = blk_rq_bytes(req);
- cmd->allowed = req->retries;
+ cmd->allowed = scsi_req(req)->retries;
return BLKPREP_OK;
}
@@ -2154,7 +2154,7 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
return q;
}
-static struct blk_mq_ops scsi_mq_ops = {
+static const struct blk_mq_ops scsi_mq_ops = {
.queue_rq = scsi_queue_rq,
.complete = scsi_softirq_done,
.timeout = scsi_timeout,
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 29b86505f796..b61cc3c512d3 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1716,7 +1716,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
srp->rq = rq;
rq->end_io_data = srp;
- rq->retries = SG_DEFAULT_RETRIES;
+ req->retries = SG_DEFAULT_RETRIES;
if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index e5ef78a6848e..5408643431bb 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -579,7 +579,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
memset(rq->cmd, 0, BLK_MAX_CDB);
memcpy(rq->cmd, cmd, rq->cmd_len);
req->timeout = timeout;
- req->retries = retries;
+ rq->retries = retries;
req->end_io_data = SRpnt;
blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end);
diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c
index bf40f03755dd..f30c27b83c5e 100644
--- a/drivers/target/iscsi/iscsi_target_configfs.c
+++ b/drivers/target/iscsi/iscsi_target_configfs.c
@@ -167,10 +167,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
struct iscsi_portal_group *tpg;
struct iscsi_tpg_np *tpg_np;
char *str, *str2, *ip_str, *port_str;
- struct sockaddr_storage sockaddr;
- struct sockaddr_in *sock_in;
- struct sockaddr_in6 *sock_in6;
- unsigned long port;
+ struct sockaddr_storage sockaddr = { };
int ret;
char buf[MAX_PORTAL_LEN + 1];
@@ -182,21 +179,19 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
memset(buf, 0, MAX_PORTAL_LEN + 1);
snprintf(buf, MAX_PORTAL_LEN + 1, "%s", name);
- memset(&sockaddr, 0, sizeof(struct sockaddr_storage));
-
str = strstr(buf, "[");
if (str) {
- const char *end;
-
str2 = strstr(str, "]");
if (!str2) {
pr_err("Unable to locate trailing \"]\""
" in IPv6 iSCSI network portal address\n");
return ERR_PTR(-EINVAL);
}
- str++; /* Skip over leading "[" */
+
+ ip_str = str + 1; /* Skip over leading "[" */
*str2 = '\0'; /* Terminate the unbracketed IPv6 address */
str2++; /* Skip over the \0 */
+
port_str = strstr(str2, ":");
if (!port_str) {
pr_err("Unable to locate \":port\""
@@ -205,23 +200,8 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
}
*port_str = '\0'; /* Terminate string for IP */
port_str++; /* Skip over ":" */
-
- ret = kstrtoul(port_str, 0, &port);
- if (ret < 0) {
- pr_err("kstrtoul() failed for port_str: %d\n", ret);
- return ERR_PTR(ret);
- }
- sock_in6 = (struct sockaddr_in6 *)&sockaddr;
- sock_in6->sin6_family = AF_INET6;
- sock_in6->sin6_port = htons((unsigned short)port);
- ret = in6_pton(str, -1,
- (void *)&sock_in6->sin6_addr.in6_u, -1, &end);
- if (ret <= 0) {
- pr_err("in6_pton returned: %d\n", ret);
- return ERR_PTR(-EINVAL);
- }
} else {
- str = ip_str = &buf[0];
+ ip_str = &buf[0];
port_str = strstr(ip_str, ":");
if (!port_str) {
pr_err("Unable to locate \":port\""
@@ -230,17 +210,15 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
}
*port_str = '\0'; /* Terminate string for IP */
port_str++; /* Skip over ":" */
+ }
- ret = kstrtoul(port_str, 0, &port);
- if (ret < 0) {
- pr_err("kstrtoul() failed for port_str: %d\n", ret);
- return ERR_PTR(ret);
- }
- sock_in = (struct sockaddr_in *)&sockaddr;
- sock_in->sin_family = AF_INET;
- sock_in->sin_port = htons((unsigned short)port);
- sock_in->sin_addr.s_addr = in_aton(ip_str);
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC, ip_str,
+ port_str, &sockaddr);
+ if (ret) {
+ pr_err("malformed ip/port passed: %s\n", name);
+ return ERR_PTR(ret);
}
+
tpg = container_of(se_tpg, struct iscsi_portal_group, tpg_se_tpg);
ret = iscsit_get_tpg(tpg);
if (ret < 0)
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 94cda7991e80..c7fa372c527a 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1008,7 +1008,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
req->timeout = PS_TIMEOUT_DISK;
else
req->timeout = PS_TIMEOUT_OTHER;
- req->retries = PS_RETRY;
+ scsi_req(req)->retries = PS_RETRY;
blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req,
(cmd->sam_task_attr == TCM_HEAD_TAG),
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2eca00ec4370..f2d59f143ef4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -885,6 +885,8 @@ static void bdev_evict_inode(struct inode *inode)
spin_lock(&bdev_lock);
list_del_init(&bdev->bd_list);
spin_unlock(&bdev_lock);
+ /* Detach inode from wb early as bdi_put() may free bdi->wb */
+ inode_detach_wb(inode);
if (bdev->bd_bdi != &noop_backing_dev_info) {
bdi_put(bdev->bd_bdi);
bdev->bd_bdi = &noop_backing_dev_info;
@@ -1556,8 +1558,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- if (bdev->bd_bdi == &noop_backing_dev_info)
- bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
if (!partno) {
ret = -ENXIO;
@@ -1622,6 +1622,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
}
+
+ if (bdev->bd_bdi == &noop_backing_dev_info)
+ bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
} else {
if (bdev->bd_contains == bdev) {
ret = 0;
@@ -1653,8 +1656,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
bdev->bd_queue = NULL;
- bdi_put(bdev->bd_bdi);
- bdev->bd_bdi = &noop_backing_dev_info;
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
@@ -1876,12 +1877,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
kill_bdev(bdev);
bdev_write_inode(bdev);
- /*
- * Detaching bdev inode from its wb in __destroy_inode()
- * is too late: the queue which embeds its bdi (along with
- * root wb) can be gone as soon as we put_disk() below.
- */
- inode_detach_wb(bdev->bd_inode);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ad955817916d..e66d4722db8e 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -21,6 +21,7 @@ struct dentry;
*/
enum wb_state {
WB_registered, /* bdi_register() was done */
+ WB_shutting_down, /* wb_shutdown() in progress */
WB_writeback_running, /* Writeback is in progress */
WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */
};
@@ -54,7 +55,9 @@ struct bdi_writeback_congested {
atomic_t refcnt; /* nr of attached wb's and blkg */
#ifdef CONFIG_CGROUP_WRITEBACK
- struct backing_dev_info *bdi; /* the associated bdi */
+ struct backing_dev_info *__bdi; /* the associated bdi, set to NULL
+ * on bdi unregistration. For memcg-wb
+ * internal use only! */
int blkcg_id; /* ID of the associated blkcg */
struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */
#endif
@@ -161,7 +164,6 @@ struct backing_dev_info {
#ifdef CONFIG_CGROUP_WRITEBACK
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
struct rb_root cgwb_congested_tree; /* their congested states */
- atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
#else
struct bdi_writeback_congested *wb_congested;
#endif
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e521194f6fc..4931756d86d9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -383,7 +383,7 @@ extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
extern void bioset_free(struct bio_set *);
extern mempool_t *biovec_create_pool(int pool_entries);
-extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
+extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
extern void bio_put(struct bio *);
extern void __bio_clone_fast(struct bio *, struct bio *);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9382c5da7a2e..b90c3d5766cd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -82,7 +82,6 @@ struct blk_mq_tag_set {
struct blk_mq_queue_data {
struct request *rq;
- struct list_head *list;
bool last;
};
@@ -153,7 +152,6 @@ enum {
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
BLK_MQ_F_TAG_SHARED = 1 << 1,
BLK_MQ_F_SG_MERGE = 1 << 2,
- BLK_MQ_F_DEFER_ISSUE = 1 << 4,
BLK_MQ_F_BLOCKING = 1 << 5,
BLK_MQ_F_NO_SCHED = 1 << 6,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
@@ -246,7 +244,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
-void blk_mq_freeze_queue_start(struct request_queue *q);
+void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
unsigned long timeout);
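
Editor's sketch: the freeze-start helper is renamed from blk_mq_freeze_queue_start() to blk_freeze_queue_start(), but the split between starting a freeze and waiting for it to complete is unchanged. A small illustration of how a caller might combine the declarations above for a bounded wait; this wrapper is illustrative, not an API the patch adds:

    /* Begin freezing @q, then wait up to @timeout jiffies for in-flight
     * requests to drain. Returns what blk_mq_freeze_queue_wait_timeout()
     * returns (zero if the wait timed out). The caller remains responsible
     * for calling blk_mq_unfreeze_queue() later. */
    static int freeze_with_timeout(struct request_queue *q,
                                   unsigned long timeout)
    {
            blk_freeze_queue_start(q);
            return blk_mq_freeze_queue_wait_timeout(q, timeout);
    }
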
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d703acb55d0f..72aa9519167e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -17,6 +17,10 @@ struct io_context;
struct cgroup_subsys_state;
typedef void (bio_end_io_t) (struct bio *);
+struct blk_issue_stat {
+ u64 stat;
+};
+
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
@@ -29,7 +33,7 @@ struct bio {
* top bits REQ_OP. Use
* accessors.
*/
- unsigned short bi_flags; /* status, command, etc */
+ unsigned short bi_flags; /* status, etc and bvec pool number */
unsigned short bi_ioprio;
struct bvec_iter bi_iter;
@@ -58,6 +62,10 @@ struct bio {
*/
struct io_context *bi_ioc;
struct cgroup_subsys_state *bi_css;
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ void *bi_cg_private;
+ struct blk_issue_stat bi_issue_stat;
+#endif
#endif
union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -102,12 +110,9 @@ struct bio {
#define BIO_REFFED 8 /* bio has elevated ->bi_cnt */
#define BIO_THROTTLED 9 /* This bio has already been subjected to
* throttling rules. Don't do it again. */
-
-/*
- * Flags starting here get preserved by bio_reset() - this includes
- * BVEC_POOL_IDX()
- */
-#define BIO_RESET_BITS 10
+#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion
+ * of this bio. */
+/* See BVEC_POOL_OFFSET below before adding new flags */
/*
* We support 6 different bvec pools, the last one is magic in that it
@@ -117,13 +122,22 @@ struct bio {
#define BVEC_POOL_MAX (BVEC_POOL_NR - 1)
/*
- * Top 4 bits of bio flags indicate the pool the bvecs came from. We add
+ * Top 3 bits of bio flags indicate the pool the bvecs came from. We add
* 1 to the actual index so that 0 indicates that there are no bvecs to be
* freed.
*/
-#define BVEC_POOL_BITS (4)
+#define BVEC_POOL_BITS (3)
#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS)
#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET)
+#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
+# error "BVEC_POOL_BITS is too small"
+#endif
+
+/*
+ * Flags starting here get preserved by bio_reset() - this includes
+ * only BVEC_POOL_IDX()
+ */
+#define BIO_RESET_BITS BVEC_POOL_OFFSET
/*
* Operations and flags common to the bio and request structures.
@@ -283,12 +297,6 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
return (cookie & BLK_QC_T_INTERNAL) != 0;
}
-struct blk_issue_stat {
- u64 time;
-};
-
-#define BLK_RQ_STAT_BATCH 64
-
struct blk_rq_stat {
s64 mean;
u64 min;
@@ -296,7 +304,6 @@ struct blk_rq_stat {
s32 nr_samples;
s32 nr_batch;
u64 batch;
- s64 time;
};
#endif /* __LINUX_BLK_TYPES_H */
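
Editor's sketch: with BVEC_POOL_BITS shrunk from 4 to 3 and BIO_RESET_BITS redefined as BVEC_POOL_OFFSET, one flag bit is freed (used here for BIO_TRACE_COMPLETION) and the new #error keeps the flag and pool-index budgets from silently colliding. A small illustration of how the 16-bit bi_flags word is shared under this layout; pack_bi_flags() is a made-up helper, not part of the patch:

    /* bi_flags layout after this patch:
     *   bits 0 .. BVEC_POOL_OFFSET-1 : BIO_* status flags (cleared by
     *                                  bio_reset(), see BIO_RESET_BITS)
     *   bits BVEC_POOL_OFFSET .. 15  : bvec pool index + 1, where 0 means
     *                                  there are no bvecs to free
     */
    static inline unsigned short pack_bi_flags(unsigned short flags,
                                               unsigned int pool_idx_plus_one)
    {
            return (flags & ((1U << BVEC_POOL_OFFSET) - 1)) |
                   (pool_idx_plus_one << BVEC_POOL_OFFSET);
    }
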
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7548f332121a..d76bebbc632e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -40,6 +40,8 @@ struct blkcg_gq;
struct blk_flush_queue;
struct pr_ops;
struct rq_wb;
+struct blk_queue_stats;
+struct blk_stat_callback;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_MAX_RQ 128 /* Default maximum */
@@ -213,6 +215,8 @@ struct request {
unsigned short ioprio;
+ unsigned int timeout;
+
void *special; /* opaque pointer available for LLD use */
int errors;
@@ -221,8 +225,6 @@ struct request {
unsigned long deadline;
struct list_head timeout_list;
- unsigned int timeout;
- int retries;
/*
* completion callback.
@@ -388,6 +390,7 @@ struct request_queue {
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
+ struct blk_queue_stats *stats;
struct rq_wb *rq_wb;
/*
@@ -505,8 +508,6 @@ struct request_queue {
unsigned int nr_sorted;
unsigned int in_flight[2];
- struct blk_rq_stat rq_stats[2];
-
/*
* Number of active block driver functions for which blk_drain_queue()
* must wait. Must be incremented around functions that unlock the
@@ -516,6 +517,10 @@ struct request_queue {
unsigned int rq_timeout;
int poll_nsec;
+
+ struct blk_stat_callback *poll_cb;
+ struct blk_rq_stat poll_stat[2];
+
struct timer_list timeout;
struct work_struct timeout_work;
struct list_head timeout_list;
@@ -610,6 +615,8 @@ struct request_queue {
#define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */
#define QUEUE_FLAG_DAX 26 /* device supports DAX */
#define QUEUE_FLAG_STATS 27 /* track rq completion times */
+#define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */
+#define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */
#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_STACKABLE) | \
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 76f39754e7b0..9e11082c7f9b 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -159,11 +159,11 @@ struct badblocks;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct blk_integrity {
- struct blk_integrity_profile *profile;
- unsigned char flags;
- unsigned char tuple_size;
- unsigned char interval_exp;
- unsigned char tag_size;
+ const struct blk_integrity_profile *profile;
+ unsigned char flags;
+ unsigned char tuple_size;
+ unsigned char interval_exp;
+ unsigned char tag_size;
};
#endif /* CONFIG_BLK_DEV_INTEGRITY */
diff --git a/include/linux/inet.h b/include/linux/inet.h
index 4cca05c9678e..636ebe87e6f8 100644
--- a/include/linux/inet.h
+++ b/include/linux/inet.h
@@ -43,6 +43,8 @@
#define _LINUX_INET_H
#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <linux/socket.h>
/*
* These mimic similar macros defined in user-space for inet_ntop(3).
@@ -54,4 +56,8 @@
extern __be32 in_aton(const char *str);
extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end);
extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end);
+
+extern int inet_pton_with_scope(struct net *net, unsigned short af,
+ const char *src, const char *port, struct sockaddr_storage *addr);
+
#endif /* _LINUX_INET_H */
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index e6284591599e..ca85cb80e99a 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -108,6 +108,8 @@ extern int __must_check kobject_rename(struct kobject *, const char *new_name);
extern int __must_check kobject_move(struct kobject *, struct kobject *);
extern struct kobject *kobject_get(struct kobject *kobj);
+extern struct kobject * __must_check kobject_get_unless_zero(
+ struct kobject *kobj);
extern void kobject_put(struct kobject *kobj);
extern const void *kobject_namespace(struct kobject *kobj);
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index f21471f7ee40..16eb264980c2 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -137,9 +137,9 @@ enum nvmefc_fcp_datadir {
* transferred. Should equal payload_length on success.
* @rcv_rsplen: length, in bytes, of the FCP RSP IU received.
* @status: Completion status of the FCP operation. must be 0 upon success,
- * NVME_SC_FC_xxx value upon failure. Note: this is NOT a
- * reflection of the NVME CQE completion status. Only the status
- * of the FCP operation at the NVME-FC level.
+ * negative errno value upon failure (ex: -EIO). Note: this is
+ * NOT a reflection of the NVME CQE completion status. Only the
+ * status of the FCP operation at the NVME-FC level.
*/
struct nvmefc_fcp_req {
void *cmdaddr;
diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h
index 4b45226bd604..e997c4a49a88 100644
--- a/include/linux/nvme-fc.h
+++ b/include/linux/nvme-fc.h
@@ -16,8 +16,7 @@
*/
/*
- * This file contains definitions relative to FC-NVME r1.11 and a few
- * newer items
+ * This file contains definitions relative to FC-NVME r1.14 (16-020vB).
*/
#ifndef _NVME_FC_H
@@ -47,8 +46,15 @@ struct nvme_fc_cmd_iu {
#define NVME_FC_SIZEOF_ZEROS_RSP 12
+enum {
+ FCNVME_SC_SUCCESS = 0,
+ FCNVME_SC_INVALID_FIELD = 1,
+ FCNVME_SC_INVALID_CONNID = 2,
+};
+
struct nvme_fc_ersp_iu {
- __u8 rsvd0[2];
+ __u8 status_code;
+ __u8 rsvd1;
__be16 iu_len;
__be32 rsn;
__be32 xfrd_len;
@@ -58,7 +64,7 @@ struct nvme_fc_ersp_iu {
};
-/* FC-NVME r1.03/16-119v0 NVME Link Services */
+/* FC-NVME Link Services */
enum {
FCNVME_LS_RSVD = 0,
FCNVME_LS_RJT = 1,
@@ -68,7 +74,7 @@ enum {
FCNVME_LS_DISCONNECT = 5,
};
-/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */
+/* FC-NVME Link Service Descriptors */
enum {
FCNVME_LSDESC_RSVD = 0x0,
FCNVME_LSDESC_RQST = 0x1,
@@ -92,7 +98,6 @@ static inline __be32 fcnvme_lsdesc_len(size_t sz)
return cpu_to_be32(sz - (2 * sizeof(u32)));
}
-
struct fcnvme_ls_rqst_w0 {
u8 ls_cmd; /* FCNVME_LS_xxx */
u8 zeros[3];
@@ -106,8 +111,53 @@ struct fcnvme_lsdesc_rqst {
__be32 rsvd12;
};
+/* FC-NVME LS RJT reason_code values */
+enum fcnvme_ls_rjt_reason {
+ FCNVME_RJT_RC_NONE = 0,
+ /* no reason - not to be sent */
+
+ FCNVME_RJT_RC_INVAL = 0x01,
+ /* invalid NVMe_LS command code */
+
+ FCNVME_RJT_RC_LOGIC = 0x03,
+ /* logical error */
+
+ FCNVME_RJT_RC_UNAB = 0x09,
+ /* unable to perform command request */
+
+ FCNVME_RJT_RC_UNSUP = 0x0b,
+ /* command not supported */
+
+ FCNVME_RJT_RC_INPROG = 0x0e,
+ /* command already in progress */
+ FCNVME_RJT_RC_INV_ASSOC = 0x40,
+ /* Invalid Association ID*/
+ FCNVME_RJT_RC_INV_CONN = 0x41,
+ /* Invalid Connection ID*/
+
+ FCNVME_RJT_RC_VENDOR = 0xff,
+ /* vendor specific error */
+};
+
+/* FC-NVME LS RJT reason_explanation values */
+enum fcnvme_ls_rjt_explan {
+ FCNVME_RJT_EXP_NONE = 0x00,
+ /* No additional explanation */
+
+ FCNVME_RJT_EXP_OXID_RXID = 0x17,
+ /* invalid OX_ID-RX_ID combination */
+
+ FCNVME_RJT_EXP_INSUF_RES = 0x29,
+ /* insufficient resources */
+
+ FCNVME_RJT_EXP_UNAB_DATA = 0x2a,
+ /* unable to supply requested data */
+
+ FCNVME_RJT_EXP_INV_LEN = 0x2d,
+ /* Invalid payload length */
+};
/* FCNVME_LSDESC_RJT */
struct fcnvme_lsdesc_rjt {
@@ -119,15 +169,15 @@ struct fcnvme_lsdesc_rjt {
* Reject reason and explanaction codes are generic
* to ELs's from LS-3.
*/
- u8 reason_code;
- u8 reason_explanation;
+ u8 reason_code; /* fcnvme_ls_rjt_reason */
+ u8 reason_explanation; /* fcnvme_ls_rjt_explan */
u8 vendor;
__be32 rsvd12;
};
-#define FCNVME_ASSOC_HOSTID_LEN 64
+#define FCNVME_ASSOC_HOSTID_LEN 16
#define FCNVME_ASSOC_HOSTNQN_LEN 256
#define FCNVME_ASSOC_SUBNQN_LEN 256
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index 9fba9dd33544..9375d23a24e7 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -34,9 +34,9 @@ struct t10_pi_tuple {
};
-extern struct blk_integrity_profile t10_pi_type1_crc;
-extern struct blk_integrity_profile t10_pi_type1_ip;
-extern struct blk_integrity_profile t10_pi_type3_crc;
-extern struct blk_integrity_profile t10_pi_type3_ip;
+extern const struct blk_integrity_profile t10_pi_type1_crc;
+extern const struct blk_integrity_profile t10_pi_type1_ip;
+extern const struct blk_integrity_profile t10_pi_type3_crc;
+extern const struct blk_integrity_profile t10_pi_type3_ip;
#endif
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a3c0cbd7c888..d5815794416c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -237,6 +237,7 @@ static inline void inode_attach_wb(struct inode *inode, struct page *page)
static inline void inode_detach_wb(struct inode *inode)
{
if (inode->i_wb) {
+ WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
wb_put(inode->i_wb);
inode->i_wb = NULL;
}
diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h
index ba0aeb980f7e..7c583a0f363a 100644
--- a/include/scsi/scsi_request.h
+++ b/include/scsi/scsi_request.h
@@ -11,6 +11,7 @@ struct scsi_request {
unsigned short cmd_len;
unsigned int sense_len;
unsigned int resid_len; /* residual count */
+ int retries;
void *sense;
};
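
Editor's sketch: with retries moved from struct request into struct scsi_request, the SCSI-path hunks above switch from req->retries to scsi_req(req)->retries (or to a local scsi_request pointer). A minimal illustration of a pass-through submission touching the moved field, assuming only the existing scsi_req() accessor; nothing here is added by the patch:

    #include <linux/blkdev.h>
    #include <linux/string.h>
    #include <scsi/scsi_request.h>

    static void submit_cdb(struct request_queue *q, struct request *req,
                           const unsigned char *cdb, unsigned int cdb_len,
                           rq_end_io_fn *done)
    {
            struct scsi_request *rq = scsi_req(req);

            memcpy(rq->cmd, cdb, cdb_len);
            rq->cmd_len = cdb_len;
            rq->retries = 3;                /* moved: was req->retries */
            req->timeout = 30 * HZ;         /* timeout stays on struct request */

            blk_execute_rq_nowait(q, NULL, req, 1, done);
    }
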
diff --git a/lib/kobject.c b/lib/kobject.c
index 445dcaeb0f56..763d70a18941 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -601,12 +601,15 @@ struct kobject *kobject_get(struct kobject *kobj)
}
EXPORT_SYMBOL(kobject_get);
-static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
+struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
{
+ if (!kobj)
+ return NULL;
if (!kref_get_unless_zero(&kobj->kref))
kobj = NULL;
return kobj;
}
+EXPORT_SYMBOL(kobject_get_unless_zero);
/*
* kobject_cleanup - free kobject resources.
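
Editor's sketch: kobject_get_unless_zero() is exported and made NULL-tolerant, so lookup paths outside kobject.c can take a reference only if the object has not already started dying. The typical shape of such a lookup under the container's lock; my_table, my_table_find() and the lock are invented for illustration:

    #include <linux/kobject.h>
    #include <linux/spinlock.h>

    struct my_table {
            spinlock_t lock;
            /* ... some container of kobjects ... */
    };

    /* Placeholder lookup; stands in for e.g. an idr or list search that may
     * return an object whose last reference is already being dropped. */
    static struct kobject *my_table_find(struct my_table *tbl, int id);

    static struct kobject *lookup_and_get(struct my_table *tbl, int id)
    {
            struct kobject *kobj;

            spin_lock(&tbl->lock);
            kobj = my_table_find(tbl, id);
            /* Succeeds only if the refcount is still non-zero; also copes
             * with my_table_find() returning NULL. */
            kobj = kobject_get_unless_zero(kobj);
            spin_unlock(&tbl->lock);

            return kobj;    /* NULL if absent or already on its way out */
    }
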
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c6f2a37028c2..3ea3bbd921d6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -294,6 +294,8 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
memset(wb, 0, sizeof(*wb));
+ if (wb != &bdi->wb)
+ bdi_get(bdi);
wb->bdi = bdi;
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
@@ -314,8 +316,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
wb->dirty_sleep = jiffies;
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
- if (!wb->congested)
- return -ENOMEM;
+ if (!wb->congested) {
+ err = -ENOMEM;
+ goto out_put_bdi;
+ }
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -335,9 +339,14 @@ out_destroy_stat:
fprop_local_destroy_percpu(&wb->completions);
out_put_cong:
wb_congested_put(wb->congested);
+out_put_bdi:
+ if (wb != &bdi->wb)
+ bdi_put(bdi);
return err;
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
+
/*
* Remove bdi from the global list and shutdown any threads we have running
*/
@@ -347,10 +356,18 @@ static void wb_shutdown(struct bdi_writeback *wb)
spin_lock_bh(&wb->work_lock);
if (!test_and_clear_bit(WB_registered, &wb->state)) {
spin_unlock_bh(&wb->work_lock);
+ /*
+ * Wait for wb shutdown to finish if someone else is just
+ * running wb_shutdown(). Otherwise we could proceed to wb /
+ * bdi destruction before wb_shutdown() is finished.
+ */
+ wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
return;
}
+ set_bit(WB_shutting_down, &wb->state);
spin_unlock_bh(&wb->work_lock);
+ cgwb_remove_from_bdi_list(wb);
/*
* Drain work list and shutdown the delayed_work. !WB_registered
* tells wb_workfn() that @wb is dying and its work_list needs to
@@ -359,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb)
mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list));
+ /*
+ * Make sure bit gets cleared after shutdown is finished. Matches with
+ * the barrier provided by test_and_clear_bit() above.
+ */
+ smp_wmb();
+ clear_bit(WB_shutting_down, &wb->state);
}
static void wb_exit(struct bdi_writeback *wb)
@@ -372,6 +395,8 @@ static void wb_exit(struct bdi_writeback *wb)
fprop_local_destroy_percpu(&wb->completions);
wb_congested_put(wb->congested);
+ if (wb != &wb->bdi->wb)
+ bdi_put(wb->bdi);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -381,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb)
/*
* cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
* blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
- * protected. cgwb_release_wait is used to wait for the completion of cgwb
- * releases from bdi destruction path.
+ * protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
-static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
/**
* wb_congested_get_create - get or create a wb_congested
@@ -438,7 +461,7 @@ retry:
return NULL;
atomic_set(&new_congested->refcnt, 0);
- new_congested->bdi = bdi;
+ new_congested->__bdi = bdi;
new_congested->blkcg_id = blkcg_id;
goto retry;
@@ -466,10 +489,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
}
/* bdi might already have been destroyed leaving @congested unlinked */
- if (congested->bdi) {
+ if (congested->__bdi) {
rb_erase(&congested->rb_node,
- &congested->bdi->cgwb_congested_tree);
- congested->bdi = NULL;
+ &congested->__bdi->cgwb_congested_tree);
+ congested->__bdi = NULL;
}
spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -480,11 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
- struct backing_dev_info *bdi = wb->bdi;
-
- spin_lock_irq(&cgwb_lock);
- list_del_rcu(&wb->bdi_node);
- spin_unlock_irq(&cgwb_lock);
wb_shutdown(wb);
@@ -495,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work)
percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
kfree_rcu(wb, rcu);
-
- if (atomic_dec_and_test(&bdi->usage_cnt))
- wake_up_all(&cgwb_release_wait);
}
static void cgwb_release(struct percpu_ref *refcnt)
@@ -517,6 +532,13 @@ static void cgwb_kill(struct bdi_writeback *wb)
percpu_ref_kill(&wb->refcnt);
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_del_rcu(&wb->bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+}
+
static int cgwb_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
@@ -580,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
/* we might have raced another instance of this function */
ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
if (!ret) {
- atomic_inc(&bdi->usage_cnt);
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
@@ -670,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
- atomic_set(&bdi->usage_cnt, 1);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
@@ -680,29 +700,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return ret;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
struct radix_tree_iter iter;
void **slot;
+ struct bdi_writeback *wb;
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
spin_lock_irq(&cgwb_lock);
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
- spin_unlock_irq(&cgwb_lock);
- /*
- * All cgwb's must be shutdown and released before returning. Drain
- * the usage counter to wait for all cgwb's ever created on @bdi.
- */
- atomic_dec(&bdi->usage_cnt);
- wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
- /*
- * Grab back our reference so that we hold it when @bdi gets
- * re-registered.
- */
- atomic_inc(&bdi->usage_cnt);
+ while (!list_empty(&bdi->wb_list)) {
+ wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
+ bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+ wb_shutdown(wb);
+ spin_lock_irq(&cgwb_lock);
+ }
+ spin_unlock_irq(&cgwb_lock);
}
/**
@@ -752,11 +769,18 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi)
rb_entry(rbn, struct bdi_writeback_congested, rb_node);
rb_erase(rbn, &bdi->cgwb_congested_tree);
- congested->bdi = NULL; /* mark @congested unlinked */
+ congested->__bdi = NULL; /* mark @congested unlinked */
}
spin_unlock_irq(&cgwb_lock);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+ spin_unlock_irq(&cgwb_lock);
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
@@ -777,13 +801,23 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return 0;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
static void cgwb_bdi_exit(struct backing_dev_info *bdi)
{
wb_congested_put(bdi->wb_congested);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+}
+
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ list_del_rcu(&wb->bdi_node);
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
int bdi_init(struct backing_dev_info *bdi)
@@ -802,8 +836,6 @@ int bdi_init(struct backing_dev_info *bdi)
ret = cgwb_bdi_init(bdi);
- list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
-
return ret;
}
EXPORT_SYMBOL(bdi_init);
@@ -839,6 +871,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
if (IS_ERR(dev))
return PTR_ERR(dev);
+ cgwb_bdi_register(bdi);
bdi->dev = dev;
bdi_debug_register(bdi, dev_name(dev));
@@ -892,7 +925,7 @@ void bdi_unregister(struct backing_dev_info *bdi)
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
- cgwb_bdi_destroy(bdi);
+ cgwb_bdi_unregister(bdi);
if (bdi->dev) {
bdi_debug_unregister(bdi);
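
Editor's sketch: cgwb_bdi_unregister() above replaces the old usage-counter/waitqueue scheme with a direct walk of bdi->wb_list, dropping cgwb_lock around each blocking wb_shutdown() and re-checking the list after re-acquiring it. The skeleton of that drop-the-lock-and-restart idiom, with names invented for illustration; the per-item teardown must unlink the item (wb_shutdown() does so via the new cgwb_remove_from_bdi_list()), or the loop would never terminate:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct my_item {
            struct list_head node;
    };

    static DEFINE_SPINLOCK(my_lock);
    static LIST_HEAD(my_list);

    static void teardown_item(struct my_item *item);  /* may sleep; unlinks item */

    static void drain_list(void)
    {
            spin_lock_irq(&my_lock);
            while (!list_empty(&my_list)) {
                    struct my_item *item =
                            list_first_entry(&my_list, struct my_item, node);

                    spin_unlock_irq(&my_lock);
                    teardown_item(item);            /* blocking work, lock dropped */
                    spin_lock_irq(&my_lock);        /* re-check the list head */
            }
            spin_unlock_irq(&my_lock);
    }
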
diff --git a/net/core/utils.c b/net/core/utils.c
index 6592d7bbed39..f96cf527bb8f 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -26,9 +26,11 @@
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/ratelimit.h>
+#include <linux/socket.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
+#include <net/ipv6.h>
#include <asm/byteorder.h>
#include <linux/uaccess.h>
@@ -300,6 +302,107 @@ out:
}
EXPORT_SYMBOL(in6_pton);
+static int inet4_pton(const char *src, u16 port_num,
+ struct sockaddr_storage *addr)
+{
+ struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+ int srclen = strlen(src);
+
+ if (srclen > INET_ADDRSTRLEN)
+ return -EINVAL;
+
+ if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr,
+ '\n', NULL) == 0)
+ return -EINVAL;
+
+ addr4->sin_family = AF_INET;
+ addr4->sin_port = htons(port_num);
+
+ return 0;
+}
+
+static int inet6_pton(struct net *net, const char *src, u16 port_num,
+ struct sockaddr_storage *addr)
+{
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+ const char *scope_delim;
+ int srclen = strlen(src);
+
+ if (srclen > INET6_ADDRSTRLEN)
+ return -EINVAL;
+
+ if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr,
+ '%', &scope_delim) == 0)
+ return -EINVAL;
+
+ if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL &&
+ src + srclen != scope_delim && *scope_delim == '%') {
+ struct net_device *dev;
+ char scope_id[16];
+ size_t scope_len = min_t(size_t, sizeof(scope_id),
+ src + srclen - scope_delim - 1);
+
+ memcpy(scope_id, scope_delim + 1, scope_len);
+ scope_id[scope_len] = '\0';
+
+ dev = dev_get_by_name(net, scope_id);
+ if (dev) {
+ addr6->sin6_scope_id = dev->ifindex;
+ dev_put(dev);
+ } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) {
+ return -EINVAL;
+ }
+ }
+
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = htons(port_num);
+
+ return 0;
+}
+
+/**
+ * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address
+ * @net: net namespace (used for scope handling)
+ * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either
+ * @src: the start of the address string
+ * @port: the start of the port string (or NULL for none)
+ * @addr: output socket address
+ *
+ * Return zero on success, return errno when any error occurs.
+ */
+int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
+ const char *src, const char *port, struct sockaddr_storage *addr)
+{
+ u16 port_num;
+ int ret = -EINVAL;
+
+ if (port) {
+ if (kstrtou16(port, 0, &port_num))
+ return -EINVAL;
+ } else {
+ port_num = 0;
+ }
+
+ switch (af) {
+ case AF_INET:
+ ret = inet4_pton(src, port_num, addr);
+ break;
+ case AF_INET6:
+ ret = inet6_pton(net, src, port_num, addr);
+ break;
+ case AF_UNSPEC:
+ ret = inet4_pton(src, port_num, addr);
+ if (ret)
+ ret = inet6_pton(net, src, port_num, addr);
+ break;
+ default:
+ pr_err("unexpected address family %d\n", af);
+ };
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_pton_with_scope);
+
void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
__be32 from, __be32 to, bool pseudohdr)
{
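
Editor's usage note for the new helper: besides plain IPv4/IPv6 strings, inet_pton_with_scope() understands a "%<scope>" suffix on link-local IPv6 addresses, resolving an interface name in the given namespace (or accepting a numeric scope id) into sin6_scope_id. The address and port strings below are only examples:

    #include <linux/inet.h>

    static int example_parse(struct sockaddr_storage *ss)
    {
            /* Link-local IPv6 with an interface scope; "eth0" is looked up
             * in &init_net and its ifindex stored in sin6_scope_id. A NULL
             * port string would leave the port as 0. */
            return inet_pton_with_scope(&init_net, AF_INET6,
                                        "fe80::1%eth0", "4420", ss);
    }
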