aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block/bfq-iosched.c158
-rw-r--r--block/bfq-iosched.h17
-rw-r--r--block/bfq-wf2q.c3
-rw-r--r--block/bio-integrity.c1
-rw-r--r--block/blk-core.c59
-rw-r--r--block/blk-exec.c2
-rw-r--r--block/blk-lib.c12
-rw-r--r--block/blk-map.c4
-rw-r--r--block/blk-mq-sched.c2
-rw-r--r--block/blk-mq-sched.h2
-rw-r--r--block/blk-mq-sysfs.c9
-rw-r--r--block/blk-mq.c158
-rw-r--r--block/blk-mq.h3
-rw-r--r--block/blk-sysfs.c47
-rw-r--r--block/blk-throttle.c144
-rw-r--r--block/blk.h3
-rw-r--r--block/elevator.c10
-rw-r--r--block/genhd.c23
-rw-r--r--drivers/block/aoe/aoe.h3
-rw-r--r--drivers/block/aoe/aoecmd.c48
-rw-r--r--drivers/md/dm-rq.c28
-rw-r--r--drivers/md/dm.c11
-rw-r--r--include/linux/blk-cgroup.h8
-rw-r--r--include/linux/elevator.h2
-rw-r--r--include/linux/genhd.h5
25 files changed, 559 insertions, 203 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index f352b1677143..47e6ec7427c4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -209,15 +209,17 @@ static struct kmem_cache *bfq_pool;
* interactive applications automatically, using the following formula:
* duration = (R / r) * T, where r is the peak rate of the device, and
* R and T are two reference parameters.
- * In particular, R is the peak rate of the reference device (see below),
- * and T is a reference time: given the systems that are likely to be
- * installed on the reference device according to its speed class, T is
- * about the maximum time needed, under BFQ and while reading two files in
- * parallel, to load typical large applications on these systems.
- * In practice, the slower/faster the device at hand is, the more/less it
- * takes to load applications with respect to the reference device.
- * Accordingly, the longer/shorter BFQ grants weight raising to interactive
- * applications.
+ * In particular, R is the peak rate of the reference device (see
+ * below), and T is a reference time: given the systems that are
+ * likely to be installed on the reference device according to its
+ * speed class, T is about the maximum time needed, under BFQ and
+ * while reading two files in parallel, to load typical large
+ * applications on these systems (see the comments on
+ * max_service_from_wr below, for more details on how T is obtained).
+ * In practice, the slower/faster the device at hand is, the more/less
+ * it takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to
+ * interactive applications.
*
* BFQ uses four different reference pairs (R, T), depending on:
* . whether the device is rotational or non-rotational;
@@ -254,6 +256,60 @@ static int T_slow[2];
static int T_fast[2];
static int device_speed_thresh[2];
+/*
+ * BFQ uses the above-detailed, time-based weight-raising mechanism to
+ * privilege interactive tasks. This mechanism is vulnerable to the
+ * following false positives: I/O-bound applications that will go on
+ * doing I/O for much longer than the duration of weight
+ * raising. These applications have basically no benefit from being
+ * weight-raised at the beginning of their I/O. On the opposite end,
+ * while being weight-raised, these applications
+ * a) unjustly steal throughput to applications that may actually need
+ * low latency;
+ * b) make BFQ uselessly perform device idling; device idling results
+ * in loss of device throughput with most flash-based storage, and may
+ * increase latencies when used purposelessly.
+ *
+ * BFQ tries to reduce these problems, by adopting the following
+ * countermeasure. To introduce this countermeasure, we need first to
+ * finish explaining how the duration of weight-raising for
+ * interactive tasks is computed.
+ *
+ * For a bfq_queue deemed as interactive, the duration of weight
+ * raising is dynamically adjusted, as a function of the estimated
+ * peak rate of the device, so as to be equal to the time needed to
+ * execute the 'largest' interactive task we benchmarked so far. By
+ * largest task, we mean the task for which each involved process has
+ * to do more I/O than for any of the other tasks we benchmarked. This
+ * reference interactive task is the start-up of LibreOffice Writer,
+ * and in this task each process/bfq_queue needs to have at most ~110K
+ * sectors transferred.
+ *
+ * This last piece of information enables BFQ to reduce the actual
+ * duration of weight-raising for at least one class of I/O-bound
+ * applications: those doing sequential or quasi-sequential I/O. An
+ * example is file copy. In fact, once started, the main I/O-bound
+ * processes of these applications usually consume the above 110K
+ * sectors in much less time than the processes of an application that
+ * is starting, because these I/O-bound processes will greedily devote
+ * almost all their CPU cycles only to their target,
+ * throughput-friendly I/O operations. This is even more true if BFQ
+ * happens to be underestimating the device peak rate, and thus
+ * overestimating the duration of weight raising. But, according to
+ * our measurements, once transferred 110K sectors, these processes
+ * have no right to be weight-raised any longer.
+ *
+ * Basing on the last consideration, BFQ ends weight-raising for a
+ * bfq_queue if the latter happens to have received an amount of
+ * service at least equal to the following constant. The constant is
+ * set to slightly more than 110K, to have a minimum safety margin.
+ *
+ * This early ending of weight-raising reduces the amount of time
+ * during which interactive false positives cause the two problems
+ * described at the beginning of these comments.
+ */
+static const unsigned long max_service_from_wr = 120000;
+
#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0])
#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
@@ -417,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
}
}
+/*
+ * See the comments on bfq_limit_depth for the purpose of
+ * the depths set in the function.
+ */
+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
+{
+ bfqd->sb_shift = bt->sb.shift;
+
+ /*
+ * In-word depths if no bfq_queue is being weight-raised:
+ * leaving 25% of tags only for sync reads.
+ *
+ * In next formulas, right-shift the value
+ * (1U<<bfqd->sb_shift), instead of computing directly
+ * (1U<<(bfqd->sb_shift - something)), to be robust against
+ * any possible value of bfqd->sb_shift, without having to
+ * limit 'something'.
+ */
+ /* no more than 50% of tags for async I/O */
+ bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
+ /*
+ * no more than 75% of tags for sync writes (25% extra tags
+ * w.r.t. async I/O, to prevent async I/O from starving sync
+ * writes)
+ */
+ bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
+
+ /*
+ * In-word depths in case some bfq_queue is being weight-
+ * raised: leaving ~63% of tags for sync reads. This is the
+ * highest percentage for which, in our tests, application
+ * start-up times didn't suffer from any regression due to tag
+ * shortage.
+ */
+ /* no more than ~18% of tags for async I/O */
+ bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
+ /* no more than ~37% of tags for sync writes (~20% extra tags) */
+ bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
+}
+
+/*
+ * Async I/O can easily starve sync I/O (both sync reads and sync
+ * writes), by consuming all tags. Similarly, storms of sync writes,
+ * such as those that sync(2) may trigger, can starve sync reads.
+ * Limit depths of async I/O and sync writes so as to counter both
+ * problems.
+ */
+static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+{
+ struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+ struct bfq_data *bfqd = data->q->elevator->elevator_data;
+ struct sbitmap_queue *bt;
+
+ if (op_is_sync(op) && !op_is_write(op))
+ return;
+
+ if (data->flags & BLK_MQ_REQ_RESERVED) {
+ if (unlikely(!tags->nr_reserved_tags)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ bt = &tags->breserved_tags;
+ } else
+ bt = &tags->bitmap_tags;
+
+ if (unlikely(bfqd->sb_shift != bt->sb.shift))
+ bfq_update_depths(bfqd, bt);
+
+ data->shallow_depth =
+ bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+
+ bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
+ __func__, bfqd->wr_busy_queues, op_is_sync(op),
+ data->shallow_depth);
+}
+
static struct bfq_queue *
bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
sector_t sector, struct rb_node **ret_parent,
@@ -1276,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
if (old_wr_coeff == 1 && wr_or_deserves_wr) {
/* start a weight-raising period */
if (interactive) {
+ bfqq->service_from_wr = 0;
bfqq->wr_coeff = bfqd->bfq_wr_coeff;
bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
} else {
@@ -3589,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfqq->entity.prio_changed = 1;
}
}
+ if (bfqq->wr_coeff > 1 &&
+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time &&
+ bfqq->service_from_wr > max_service_from_wr) {
+ /* see comments on max_service_from_wr */
+ bfq_bfqq_end_wr(bfqq);
+ }
}
/*
* To improve latency (for this or other queues), immediately
@@ -5285,6 +5424,7 @@ static struct elv_fs_entry bfq_attrs[] = {
static struct elevator_type iosched_bfq_mq = {
.ops.mq = {
+ .limit_depth = bfq_limit_depth,
.prepare_request = bfq_prepare_request,
.finish_request = bfq_finish_request,
.exit_icq = bfq_exit_icq,
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 5d47b58d5fc8..350c39ae2896 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -337,6 +337,11 @@ struct bfq_queue {
* last transition from idle to backlogged.
*/
unsigned long service_from_backlogged;
+ /*
+ * Cumulative service received from the @bfq_queue since its
+ * last transition to weight-raised state.
+ */
+ unsigned long service_from_wr;
/*
* Value of wr start time when switching to soft rt
@@ -629,6 +634,18 @@ struct bfq_data {
struct bfq_io_cq *bio_bic;
/* bfqq associated with the task issuing current bio for merging */
struct bfq_queue *bio_bfqq;
+
+ /*
+ * Cached sbitmap shift, used to compute depth limits in
+ * bfq_update_depths.
+ */
+ unsigned int sb_shift;
+
+ /*
+ * Depth limits used in bfq_limit_depth (see comments on the
+ * function)
+ */
+ unsigned int word_depths[2][2];
};
enum bfqq_state_flags {
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 4456eda34e48..4498c43245e2 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -838,6 +838,9 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
if (!bfqq->service_from_backlogged)
bfqq->first_IO_time = jiffies;
+ if (bfqq->wr_coeff > 1)
+ bfqq->service_from_wr += served;
+
bfqq->service_from_backlogged += served;
for_each_entity(entity) {
st = bfq_entity_service_tree(entity);
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 23b42e8aa03e..9cfdd6c83b5b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work)
/**
* __bio_integrity_endio - Integrity I/O completion function
* @bio: Protected bio
- * @error: Pointer to errno
*
* Description: Completion for integrity I/O
*
diff --git a/block/blk-core.c b/block/blk-core.c
index 7ba607527487..c21a16e9fdf9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2062,6 +2062,21 @@ static inline bool should_fail_request(struct hd_struct *part,
#endif /* CONFIG_FAIL_MAKE_REQUEST */
+static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
+{
+ if (part->policy && op_is_write(bio_op(bio))) {
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_ERR
+ "generic_make_request: Trying to write "
+ "to read-only block-device %s (partno %d)\n",
+ bio_devname(bio, b), part->partno);
+ return true;
+ }
+
+ return false;
+}
+
/*
* Remap block n of partition p to block n+start(p) of the disk.
*/
@@ -2070,27 +2085,28 @@ static inline int blk_partition_remap(struct bio *bio)
struct hd_struct *p;
int ret = 0;
+ rcu_read_lock();
+ p = __disk_get_part(bio->bi_disk, bio->bi_partno);
+ if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
+ bio_check_ro(bio, p))) {
+ ret = -EIO;
+ goto out;
+ }
+
/*
* Zone reset does not include bi_size so bio_sectors() is always 0.
* Include a test for the reset op code and perform the remap if needed.
*/
- if (!bio->bi_partno ||
- (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET))
- return 0;
+ if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
+ goto out;
- rcu_read_lock();
- p = __disk_get_part(bio->bi_disk, bio->bi_partno);
- if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) {
- bio->bi_iter.bi_sector += p->start_sect;
- bio->bi_partno = 0;
- trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
- bio->bi_iter.bi_sector - p->start_sect);
- } else {
- printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
- ret = -EIO;
- }
- rcu_read_unlock();
+ bio->bi_iter.bi_sector += p->start_sect;
+ bio->bi_partno = 0;
+ trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
+ bio->bi_iter.bi_sector - p->start_sect);
+out:
+ rcu_read_unlock();
return ret;
}
@@ -2149,15 +2165,19 @@ generic_make_request_checks(struct bio *bio)
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue is not a request based queue.
*/
-
if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
goto not_supported;
if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
goto end_io;
- if (blk_partition_remap(bio))
- goto end_io;
+ if (!bio->bi_partno) {
+ if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+ goto end_io;
+ } else {
+ if (blk_partition_remap(bio))
+ goto end_io;
+ }
if (bio_check_eod(bio, nr_sectors))
goto end_io;
@@ -2500,8 +2520,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
* bypass a potential scheduler on the bottom device for
* insert.
*/
- blk_mq_request_bypass_insert(rq, true);
- return BLK_STS_OK;
+ return blk_mq_request_direct_issue(rq);
}
spin_lock_irqsave(q->queue_lock, flags);
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 5c0f3dc446dc..f7b292f12449 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
* be reused after dying flag is set
*/
if (q->mq_ops) {
- blk_mq_sched_insert_request(rq, at_head, true, false, false);
+ blk_mq_sched_insert_request(rq, at_head, true, false);
return;
}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2bc544ce3d2e..a676084d4740 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
if (!q)
return -ENXIO;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
if (flags & BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secure_erase(q))
return -EOPNOTSUPP;
@@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
if (!q)
return -ENXIO;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
@@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
if (!q)
return -ENXIO;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
@@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
if (!q)
return -ENXIO;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
while (nr_sects != 0) {
bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
gfp_mask);
diff --git a/block/blk-map.c b/block/blk-map.c
index b21f8e86f120..209eb3b45c54 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -114,7 +114,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
struct bio *bio = NULL;
struct iov_iter i;
- int ret;
+ int ret = -EINVAL;
if (!iter_is_iovec(iter))
goto fail;
@@ -143,7 +143,7 @@ unmap_rq:
__blk_rq_unmap_user(bio);
fail:
rq->bio = NULL;
- return -EINVAL;
+ return ret;
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 2ff7cf0cbf73..55c0a745b427 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -427,7 +427,7 @@ done:
}
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
- bool run_queue, bool async, bool can_block)
+ bool run_queue, bool async)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index ba1d1418a96d..1e9c9018ace1 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
- bool run_queue, bool async, bool can_block);
+ bool run_queue, bool async);
void blk_mq_sched_insert_requests(struct request_queue *q,
struct blk_mq_ctx *ctx,
struct list_head *list, bool run_queue_async);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 79969c3c234f..a54b4b070f1c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
return ret;
}
-static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
+void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
@@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
q->mq_sysfs_init_done = false;
}
-void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
-{
- mutex_lock(&q->sysfs_lock);
- __blk_mq_unregister_dev(dev, q);
- mutex_unlock(&q->sysfs_lock);
-}
-
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
{
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c8f62e6be6b6..74a4f237ba91 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -745,13 +745,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
- blk_mq_sched_insert_request(rq, true, false, false, true);
+ blk_mq_sched_insert_request(rq, true, false, false);
}
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
- blk_mq_sched_insert_request(rq, false, false, false, true);
+ blk_mq_sched_insert_request(rq, false, false, false);
}
blk_mq_run_hw_queues(q, false);
@@ -1294,9 +1294,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
/*
* We should be running this queue from one of the CPUs that
* are mapped to it.
+ *
+ * There are at least two related races now between setting
+ * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
+ * __blk_mq_run_hw_queue():
+ *
+ * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
+ * but later it becomes online, then this warning is harmless
+ * at all
+ *
+ * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
+ * but later it becomes offline, then the warning can't be
+ * triggered, and we depend on blk-mq timeout handler to
+ * handle dispatched requests to this hctx
*/
- WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
- cpu_online(hctx->next_cpu));
+ if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+ cpu_online(hctx->next_cpu)) {
+ printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
+ raw_smp_processor_id(),
+ cpumask_empty(hctx->cpumask) ? "inactive": "active");
+ dump_stack();
+ }
/*
* We can't run the queue inline with ints disabled. Ensure that
@@ -1319,21 +1337,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
*/
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
+ bool tried = false;
+
if (hctx->queue->nr_hw_queues == 1)
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
int next_cpu;
-
+select_cpu:
next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
- hctx->next_cpu = next_cpu;
+ /*
+ * No online CPU is found, so have to make sure hctx->next_cpu
+ * is set correctly for not breaking workqueue.
+ */
+ if (next_cpu >= nr_cpu_ids)
+ hctx->next_cpu = cpumask_first(hctx->cpumask);
+ else
+ hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
+ /*
+ * Do unbound schedule if we can't find a online CPU for this hctx,
+ * and it should only happen in the path of handling CPU DEAD.
+ */
+ if (!cpu_online(hctx->next_cpu)) {
+ if (!tried) {
+ tried = true;
+ goto select_cpu;
+ }
+
+ /*
+ * Make sure to re-select CPU next time once after CPUs
+ * in hctx->cpumask become online again.
+ */
+ hctx->next_cpu_batch = 1;
+ return WORK_CPU_UNBOUND;
+ }
return hctx->next_cpu;
}
@@ -1694,9 +1738,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
}
-static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
- struct request *rq,
- blk_qc_t *cookie)
+static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq,
+ blk_qc_t *cookie)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
@@ -1705,15 +1749,52 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
};
blk_qc_t new_cookie;
blk_status_t ret;
+
+ new_cookie = request_to_qc_t(hctx, rq);
+
+ /*
+ * For OK queue, we are done. For error, caller may kill it.
+ * Any other error (busy), just add it to our list as we
+ * previously would have done.
+ */
+ ret = q->mq_ops->queue_rq(hctx, &bd);
+ switch (ret) {
+ case BLK_STS_OK:
+ *cookie = new_cookie;
+ break;
+ case BLK_STS_RESOURCE:
+ __blk_mq_requeue_request(rq);
+ break;
+ default:
+ *cookie = BLK_QC_T_NONE;
+ break;
+ }
+
+ return ret;
+}
+
+static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq,
+ blk_qc_t *cookie,
+ bool bypass_insert)
+{
+ struct request_queue *q = rq->q;
bool run_queue = true;
- /* RCU or SRCU read lock is needed before checking quiesced flag */
+ /*
+ * RCU or SRCU read lock is needed before checking quiesced flag.
+ *
+ * When queue is stopped or quiesced, ignore 'bypass_insert' from
+ * blk_mq_request_direct_issue(), and return BLK_STS_OK to caller,
+ * and avoid driver to try to dispatch again.
+ */
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
run_queue = false;
+ bypass_insert = false;
goto insert;
}
- if (q->elevator)
+ if (q->elevator && !bypass_insert)
goto insert;
if (!blk_mq_get_driver_tag(rq, NULL, false))
@@ -1724,42 +1805,47 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
goto insert;
}
- new_cookie = request_to_qc_t(hctx, rq);
-
- /*
- * For OK queue, we are done. For error, kill it. Any other
- * error (busy), just add it to our list as we previously
- * would have done
- */
- ret = q->mq_ops->queue_rq(hctx, &bd);
- switch (ret) {
- case BLK_STS_OK:
- *cookie = new_cookie;
- return;
- case BLK_STS_RESOURCE:
- __blk_mq_requeue_request(rq);
- goto insert;
- default:
- *cookie = BLK_QC_T_NONE;
- blk_mq_end_request(rq, ret);
- return;
- }
-
+ return __blk_mq_issue_directly(hctx, rq, cookie);
insert:
- blk_mq_sched_insert_request(rq, false, run_queue, false,
- hctx->flags & BLK_MQ_F_BLOCKING);
+ if (bypass_insert)
+ return BLK_STS_RESOURCE;
+
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
+ return BLK_STS_OK;
}
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq, blk_qc_t *cookie)
{
+ blk_status_t ret;
int srcu_idx;
might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
hctx_lock(hctx, &srcu_idx);
- __blk_mq_try_issue_directly(hctx, rq, cookie);
+
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
+ if (ret == BLK_STS_RESOURCE)
+ blk_mq_sched_insert_request(rq, false, true, false);
+ else if (ret != BLK_STS_OK)
+ blk_mq_end_request(rq, ret);
+
+ hctx_unlock(hctx, srcu_idx);
+}
+
+blk_status_t blk_mq_request_direct_issue(struct request *rq)
+{
+ blk_status_t ret;
+ int srcu_idx;
+ blk_qc_t unused_cookie;
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+
+ hctx_lock(hctx, &srcu_idx);
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
hctx_unlock(hctx, srcu_idx);
+
+ return ret;
}
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
@@ -1870,7 +1956,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
} else if (q->elevator) {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
- blk_mq_sched_insert_request(rq, false, true, true, true);
+ blk_mq_sched_insert_request(rq, false, true, true);
} else {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 8591a54d989b..e3ebc93646ca 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -74,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list);
+/* Used by blk_insert_cloned_request() to issue request directly */
+blk_status_t blk_mq_request_direct_issue(struct request *rq);
+
/*
* CPU -> queue mappings
*/
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 870484eaed1f..cbea895a5547 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = {
.release = blk_release_queue,
};
+/**
+ * blk_register_queue - register a block layer queue with sysfs
+ * @disk: Disk of which the request queue should be registered with sysfs.
+ */
int blk_register_queue(struct gendisk *disk)
{
int ret;
@@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk)
if (q->request_fn || (q->mq_ops && q->elevator)) {
ret = elv_register_queue(q);
if (ret) {
+ mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
- goto unlock;
+ return ret;
}
}
ret = 0;
@@ -921,7 +926,15 @@ unlock:
mutex_unlock(&q->sysfs_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(blk_register_queue);
+/**
+ * blk_unregister_queue - counterpart of blk_register_queue()
+ * @disk: Disk of which the request queue should be unregistered from sysfs.
+ *
+ * Note: the caller is responsible for guaranteeing that this function is called
+ * after blk_register_queue() has finished.
+ */
void blk_unregister_queue(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
@@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk)
if (WARN_ON(!q))
return;
- mutex_lock(&q->sysfs_lock);
- queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
- mutex_unlock(&q->sysfs_lock);
+ /* Return early if disk->queue was never registered. */
+ if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ return;
- wbt_exit(q);
+ /*
+ * Since sysfs_remove_dir() prevents adding new directory entries
+ * before removal of existing entries starts, protect against
+ * concurrent elv_iosched_store() calls.
+ */
+ mutex_lock(&q->sysfs_lock);
+ spin_lock_irq(q->queue_lock);
+ queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
+ spin_unlock_irq(q->queue_lock);
+ /*
+ * Remove the sysfs attributes before unregistering the queue data
+ * structures that can be modified through sysfs.
+ */
if (q->mq_ops)
blk_mq_unregister_dev(disk_to_dev(disk), q);
-
- if (q->request_fn || (q->mq_ops && q->elevator))
- elv_unregister_queue(q);
+ mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
+
+ wbt_exit(q);
+
+ mutex_lock(&q->sysfs_lock);
+ if (q->request_fn || (q->mq_ops && q->elevator))
+ elv_unregister_queue(q);
+ mutex_unlock(&q->sysfs_lock);
+
kobject_put(&disk_to_dev(disk)->kobj);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 825bc29767e6..e136f5ef9577 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -216,9 +216,9 @@ struct throtl_data
unsigned int scale;
- struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
- struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
- struct latency_bucket __percpu *latency_buckets;
+ struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
+ struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
+ struct latency_bucket __percpu *latency_buckets[2];
unsigned long last_calculate_time;
unsigned long filtered_latency;
@@ -1511,10 +1511,20 @@ static struct cftype throtl_legacy_files[] = {
.seq_show = blkg_print_stat_bytes,
},
{
+ .name = "throttle.io_service_bytes_recursive",
+ .private = (unsigned long)&blkcg_policy_throtl,
+ .seq_show = blkg_print_stat_bytes_recursive,
+ },
+ {
.name = "throttle.io_serviced",
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_ios,
},
+ {
+ .name = "throttle.io_serviced_recursive",
+ .private = (unsigned long)&blkcg_policy_throtl,
+ .seq_show = blkg_print_stat_ios_recursive,
+ },
{ } /* terminate */
};
@@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_update_latency_buckets(struct throtl_data *td)
{
- struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
- int i, cpu;
- unsigned long last_latency = 0;
- unsigned long latency;
+ struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
+ int i, cpu, rw;
+ unsigned long last_latency[2] = { 0 };
+ unsigned long latency[2];
if (!blk_queue_nonrot(td->queue))
return;
@@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
td->last_calculate_time = jiffies;
memset(avg_latency, 0, sizeof(avg_latency));
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- struct latency_bucket *tmp = &td->tmp_buckets[i];
-
- for_each_possible_cpu(cpu) {
- struct latency_bucket *bucket;
-
- /* this isn't race free, but ok in practice */
- bucket = per_cpu_ptr(td->latency_buckets, cpu);
- tmp->total_latency += bucket[i].total_latency;
- tmp->samples += bucket[i].samples;
- bucket[i].total_latency = 0;
- bucket[i].samples = 0;
- }
+ for (rw = READ; rw <= WRITE; rw++) {
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
+
+ for_each_possible_cpu(cpu) {
+ struct latency_bucket *bucket;
+
+ /* this isn't race free, but ok in practice */
+ bucket = per_cpu_ptr(td->latency_buckets[rw],
+ cpu);
+ tmp->total_latency += bucket[i].total_latency;
+ tmp->samples += bucket[i].samples;
+ bucket[i].total_latency = 0;
+ bucket[i].samples = 0;
+ }
- if (tmp->samples >= 32) {
- int samples = tmp->samples;
+ if (tmp->samples >= 32) {
+ int samples = tmp->samples;
- latency = tmp->total_latency;
+ latency[rw] = tmp->total_latency;
- tmp->total_latency = 0;
- tmp->samples = 0;
- latency /= samples;
- if (latency == 0)
- continue;
- avg_latency[i].latency = latency;
+ tmp->total_latency = 0;
+ tmp->samples = 0;
+ latency[rw] /= samples;
+ if (latency[rw] == 0)
+ continue;
+ avg_latency[rw][i].latency = latency[rw];
+ }
}
}
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- if (!avg_latency[i].latency) {
- if (td->avg_buckets[i].latency < last_latency)
- td->avg_buckets[i].latency = last_latency;
- continue;
- }
+ for (rw = READ; rw <= WRITE; rw++) {
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ if (!avg_latency[rw][i].latency) {
+ if (td->avg_buckets[rw][i].latency < last_latency[rw])
+ td->avg_buckets[rw][i].latency =
+ last_latency[rw];
+ continue;
+ }
- if (!td->avg_buckets[i].valid)
- latency = avg_latency[i].latency;
- else
- latency = (td->avg_buckets[i].latency * 7 +
- avg_latency[i].latency) >> 3;
+ if (!td->avg_buckets[rw][i].valid)
+ latency[rw] = avg_latency[rw][i].latency;
+ else
+ latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
+ avg_latency[rw][i].latency) >> 3;
- td->avg_buckets[i].latency = max(latency, last_latency);
- td->avg_buckets[i].valid = true;
- last_latency = td->avg_buckets[i].latency;
+ td->avg_buckets[rw][i].latency = max(latency[rw],
+ last_latency[rw]);
+ td->avg_buckets[rw][i].valid = true;
+ last_latency[rw] = td->avg_buckets[rw][i].latency;
+ }
}
for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
throtl_log(&td->service_queue,
- "Latency bucket %d: latency=%ld, valid=%d", i,
- td->avg_buckets[i].latency, td->avg_buckets[i].valid);
+ "Latency bucket %d: read latency=%ld, read valid=%d, "
+ "write latency=%ld, write valid=%d", i,
+ td->avg_buckets[READ][i].latency,
+ td->avg_buckets[READ][i].valid,
+ td->avg_buckets[WRITE][i].latency,
+ td->avg_buckets[WRITE][i].valid);
}
#else
static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ -2248,16 +2269,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
struct latency_bucket *latency;
int index;
- if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+ if (!td || td->limit_index != LIMIT_LOW ||
+ !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
!blk_queue_nonrot(td->queue))
return;
index = request_bucket_index(size);
- latency = get_cpu_ptr(td->latency_buckets);
+ latency = get_cpu_ptr(td->latency_buckets[op]);
latency[index].total_latency += time;
latency[index].samples++;
- put_cpu_ptr(td->latency_buckets);
+ put_cpu_ptr(td->latency_buckets[op]);
}
void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@@ -2276,6 +2298,7 @@ void blk_throtl_bio_endio(struct bio *bio)
unsigned long finish_time;
unsigned long start_time;
unsigned long lat;
+ int rw = bio_data_dir(bio);
tg = bio->bi_cg_private;
if (!tg)
@@ -2304,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio)
bucket = request_bucket_index(
blk_stat_size(&bio->bi_issue_stat));
- threshold = tg->td->avg_buckets[bucket].latency +
+ threshold = tg->td->avg_buckets[rw][bucket].latency +
tg->latency_target;
if (lat > threshold)
tg->bad_bio_cnt++;
@@ -2397,9 +2420,16 @@ int blk_throtl_init(struct request_queue *q)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
if (!td)
return -ENOMEM;
- td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+ td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
LATENCY_BUCKET_SIZE, __alignof__(u64));
- if (!td->latency_buckets) {
+ if (!td->latency_buckets[READ]) {
+ kfree(td);
+ return -ENOMEM;
+ }
+ td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
+ LATENCY_BUCKET_SIZE, __alignof__(u64));
+ if (!td->latency_buckets[WRITE]) {
+ free_percpu(td->latency_buckets[READ]);
kfree(td);
return -ENOMEM;
}
@@ -2418,7 +2448,8 @@ int blk_throtl_init(struct request_queue *q)
/* activate policy */
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
if (ret) {
- free_percpu(td->latency_buckets);
+ free_percpu(td->latency_buckets[READ]);
+ free_percpu(td->latency_buckets[WRITE]);
kfree(td);
}
return ret;
@@ -2429,7 +2460,8 @@ void blk_throtl_exit(struct request_queue *q)
BUG_ON(!q->td);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
- free_percpu(q->td->latency_buckets);
+ free_percpu(q->td->latency_buckets[READ]);
+ free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td);
}
@@ -2447,8 +2479,10 @@ void blk_throtl_register_queue(struct request_queue *q)
} else {
td->throtl_slice = DFL_THROTL_SLICE_HD;
td->filtered_latency = LATENCY_FILTERED_HD;
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
- td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
+ td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
+ }
}
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
/* if no low limit, use previous default */
diff --git a/block/blk.h b/block/blk.h
index c84ae0e21ebd..b1771851ed92 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -162,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
}
+int elv_register_queue(struct request_queue *q);
+void elv_unregister_queue(struct request_queue *q);
+
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
#ifdef CONFIG_FAIL_IO_TIMEOUT
diff --git a/block/elevator.c b/block/elevator.c
index 138faeb08a7c..e87e9b43aba0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q)
struct elevator_queue *e = q->elevator;
int error;
+ lockdep_assert_held(&q->sysfs_lock);
+
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
if (!error) {
struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q)
}
return error;
}
-EXPORT_SYMBOL(elv_register_queue);
void elv_unregister_queue(struct request_queue *q)
{
+ lockdep_assert_held(&q->sysfs_lock);
+
if (q) {
struct elevator_queue *e = q->elevator;
@@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q)
wbt_enable_default(q);
}
}
-EXPORT_SYMBOL(elv_unregister_queue);
int elv_register(struct elevator_type *e)
{
@@ -967,6 +969,8 @@ static int elevator_switch_mq(struct request_queue *q,
{
int ret;
+ lockdep_assert_held(&q->sysfs_lock);
+
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
@@ -1012,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
bool old_registered = false;
int err;
+ lockdep_assert_held(&q->sysfs_lock);
+
if (q->mq_ops)
return elevator_switch_mq(q, new_e);
diff --git a/block/genhd.c b/block/genhd.c
index 96a66f671720..88a53c188cb7 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -629,16 +629,18 @@ exit:
}
/**
- * device_add_disk - add partitioning information to kernel list
+ * __device_add_disk - add disk information to kernel list
* @parent: parent device for the disk
* @disk: per-device partitioning information
+ * @register_queue: register the queue if set to true
*
* This function registers the partitioning information in @disk
* with the kernel.
*
* FIXME: error handling
*/
-void device_add_disk(struct device *parent, struct gendisk *disk)
+static void __device_add_disk(struct device *parent, struct gendisk *disk,
+ bool register_queue)
{
dev_t devt;
int retval;
@@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
exact_match, exact_lock, disk);
}
register_disk(parent, disk);
- blk_register_queue(disk);
+ if (register_queue)
+ blk_register_queue(disk);
/*
* Take an extra ref on queue which will be put on disk_release()
@@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
disk_add_events(disk);
blk_integrity_add(disk);
}
+
+void device_add_disk(struct device *parent, struct gendisk *disk)
+{
+ __device_add_disk(parent, disk, true);
+}
EXPORT_SYMBOL(device_add_disk);
+void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
+{
+ __device_add_disk(parent, disk, false);
+}
+EXPORT_SYMBOL(device_add_disk_no_queue_reg);
+
void del_gendisk(struct gendisk *disk)
{
struct disk_part_iter piter;
@@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk)
* Unregister bdi before releasing device numbers (as they can
* get reused and we'd get clashes in sysfs).
*/
- bdi_unregister(disk->queue->backing_dev_info);
+ if (!(disk->flags & GENHD_FL_HIDDEN))
+ bdi_unregister(disk->queue->backing_dev_info);
blk_unregister_queue(disk);
} else {
WARN_ON(1);
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 9220f8e833d0..c0ebda1283cc 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -112,8 +112,7 @@ enum frame_flags {
struct frame {
struct list_head head;
u32 tag;
- struct timeval sent; /* high-res time packet was sent */
- u32 sent_jiffs; /* low-res jiffies-based sent time */
+ ktime_t sent; /* high-res time packet was sent */
ulong waited;
ulong waited_total;
struct aoetgt *t; /* parent target I belong to */
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 812fed069708..540bb60cd071 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -398,8 +398,7 @@ aoecmd_ata_rw(struct aoedev *d)
skb = skb_clone(f->skb, GFP_ATOMIC);
if (skb) {
- do_gettimeofday(&f->sent);
- f->sent_jiffs = (u32) jiffies;
+ f->sent = ktime_get();
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
@@ -489,8 +488,7 @@ resend(struct aoedev *d, struct frame *f)
skb = skb_clone(skb, GFP_ATOMIC);
if (skb == NULL)
return;
- do_gettimeofday(&f->sent);
- f->sent_jiffs = (u32) jiffies;
+ f->sent = ktime_get();
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
@@ -499,33 +497,17 @@ resend(struct aoedev *d, struct frame *f)
static int
tsince_hr(struct frame *f)
{
- struct timeval now;
- int n;
+ u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent));
- do_gettimeofday(&now);
- n = now.tv_usec - f->sent.tv_usec;
- n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
+ /* delta is normally under 4.2 seconds, avoid 64-bit division */
+ if (likely(delta <= UINT_MAX))
+ return (u32)delta / NSEC_PER_USEC;
- if (n < 0)
- n = -n;
+ /* avoid overflow after 71 minutes */
+ if (delta > ((u64)INT_MAX * NSEC_PER_USEC))
+ return INT_MAX;
- /* For relatively long periods, use jiffies to avoid
- * discrepancies caused by updates to the system time.
- *
- * On system with HZ of 1000, 32-bits is over 49 days
- * worth of jiffies, or over 71 minutes worth of usecs.
- *
- * Jiffies overflow is handled by subtraction of unsigned ints:
- * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
- * $3 = 4
- * (gdb)
- */
- if (n > USEC_PER_SEC / 4) {
- n = ((u32) jiffies) - f->sent_jiffs;
- n *= USEC_PER_SEC / HZ;
- }
-
- return n;
+ return div_u64(delta, NSEC_PER_USEC);
}
static int
@@ -589,7 +571,6 @@ reassign_frame(struct frame *f)
nf->waited = 0;
nf->waited_total = f->waited_total;
nf->sent = f->sent;
- nf->sent_jiffs = f->sent_jiffs;
f->skb = skb;
return nf;
@@ -633,8 +614,7 @@ probe(struct aoetgt *t)
skb = skb_clone(f->skb, GFP_ATOMIC);
if (skb) {
- do_gettimeofday(&f->sent);
- f->sent_jiffs = (u32) jiffies;
+ f->sent = ktime_get();
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
@@ -1432,10 +1412,8 @@ aoecmd_ata_id(struct aoedev *d)
d->timer.function = rexmit_timer;
skb = skb_clone(skb, GFP_ATOMIC);
- if (skb) {
- do_gettimeofday(&f->sent);
- f->sent_jiffs = (u32) jiffies;
- }
+ if (skb)
+ f->sent = ktime_get();
return skb;
}
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 9d32f25489c2..b7d175e94a02 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error)
dm_complete_request(tio->orig, error);
}
-static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
+static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq)
{
blk_status_t r;
@@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
clone->start_time = jiffies;
r = blk_insert_cloned_request(clone->q, clone);
- if (r)
+ if (r != BLK_STS_OK && r != BLK_STS_RESOURCE)
/* must complete clone in terms of original request */
dm_complete_request(rq, r);
+ return r;
}
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
@@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio)
struct mapped_device *md = tio->md;
struct request *rq = tio->orig;
struct request *clone = NULL;
+ blk_status_t ret;
r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+check_again:
switch (r) {
case DM_MAPIO_SUBMITTED:
/* The target has taken the I/O to submit by itself later */
@@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio)
/* The target has remapped the I/O so dispatch it */
trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
blk_rq_pos(rq));
- dm_dispatch_clone_request(clone, rq);
+ ret = dm_dispatch_clone_request(clone, rq);
+ if (ret == BLK_STS_RESOURCE) {
+ blk_rq_unprep_clone(clone);
+ tio->ti->type->release_clone_rq(clone);
+ tio->clone = NULL;
+ if (!rq->q->mq_ops)
+ r = DM_MAPIO_DELAY_REQUEUE;
+ else
+ r = DM_MAPIO_REQUEUE;
+ goto check_again;
+ }
break;
case DM_MAPIO_REQUEUE:
/* The target wants to requeue the I/O */
@@ -713,8 +726,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
return error;
}
- elv_register_queue(md->queue);
-
return 0;
}
@@ -812,15 +823,8 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
}
dm_init_md_queue(md);
- /* backfill 'mq' sysfs registration normally done in blk_register_queue */
- err = blk_mq_register_dev(disk_to_dev(md->disk), q);
- if (err)
- goto out_cleanup_queue;
-
return 0;
-out_cleanup_queue:
- blk_cleanup_queue(q);
out_tag_set:
blk_mq_free_tag_set(md->tag_set);
out_kfree_tag_set:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7475739fee49..8c26bfc35335 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1761,7 +1761,7 @@ static struct mapped_device *alloc_dev(int minor)
goto bad;
md->dax_dev = dax_dev;
- add_disk(md->disk);
+ add_disk_no_queue_reg(md->disk);
format_dev_t(md->name, MKDEV(_major, minor));
md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
@@ -2021,6 +2021,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits);
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
int r;
+ struct queue_limits limits;
enum dm_queue_mode type = dm_get_md_type(md);
switch (type) {
@@ -2057,6 +2058,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
break;
}
+ r = dm_calculate_queue_limits(t, &limits);
+ if (r) {
+ DMERR("Cannot calculate initial queue limits");
+ return r;
+ }
+ dm_table_set_restrictions(t, md->queue, &limits);
+ blk_register_queue(md->disk);
+
return 0;
}
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index e9825ff57b15..69bea82ebeb1 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -660,12 +660,14 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
struct blkg_rwstat *from)
{
- struct blkg_rwstat v = blkg_rwstat_read(from);
+ u64 sum[BLKG_RWSTAT_NR];
int i;
for (i = 0; i < BLKG_RWSTAT_NR; i++)
- atomic64_add(atomic64_read(&v.aux_cnt[i]) +
- atomic64_read(&from->aux_cnt[i]),
+ sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
&to->aux_cnt[i]);
}
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 3d794b3dc532..6d9e230dffd2 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -198,8 +198,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
extern void elv_requeue_request(struct request_queue *, struct request *);
extern struct request *elv_former_request(struct request_queue *, struct request *);
extern struct request *elv_latter_request(struct request_queue *, struct request *);
-extern int elv_register_queue(struct request_queue *q);
-extern void elv_unregister_queue(struct request_queue *q);
extern int elv_may_queue(struct request_queue *, unsigned int);
extern void elv_completed_request(struct request_queue *, struct request *);
extern int elv_set_request(struct request_queue *q, struct request *rq,
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5144ebe046c9..5e3531027b51 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -395,6 +395,11 @@ static inline void add_disk(struct gendisk *disk)
{
device_add_disk(NULL, disk);
}
+extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk);
+static inline void add_disk_no_queue_reg(struct gendisk *disk)
+{
+ device_add_disk_no_queue_reg(NULL, disk);
+}
extern void del_gendisk(struct gendisk *gp);
extern struct gendisk *get_gendisk(dev_t dev, int *partno);