25 files changed, 559 insertions, 203 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index f352b1677143..47e6ec7427c4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -209,15 +209,17 @@ static struct kmem_cache *bfq_pool;
  * interactive applications automatically, using the following formula:
  * duration = (R / r) * T, where r is the peak rate of the device, and
  * R and T are two reference parameters.
- * In particular, R is the peak rate of the reference device (see below),
- * and T is a reference time: given the systems that are likely to be
- * installed on the reference device according to its speed class, T is
- * about the maximum time needed, under BFQ and while reading two files in
- * parallel, to load typical large applications on these systems.
- * In practice, the slower/faster the device at hand is, the more/less it
- * takes to load applications with respect to the reference device.
- * Accordingly, the longer/shorter BFQ grants weight raising to interactive
- * applications.
+ * In particular, R is the peak rate of the reference device (see
+ * below), and T is a reference time: given the systems that are
+ * likely to be installed on the reference device according to its
+ * speed class, T is about the maximum time needed, under BFQ and
+ * while reading two files in parallel, to load typical large
+ * applications on these systems (see the comments on
+ * max_service_from_wr below, for more details on how T is obtained).
+ * In practice, the slower/faster the device at hand is, the more/less
+ * it takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to
+ * interactive applications.
  *
  * BFQ uses four different reference pairs (R, T), depending on:
  * . whether the device is rotational or non-rotational;
@@ -254,6 +256,60 @@ static int T_slow[2];
 static int T_fast[2];
 static int device_speed_thresh[2];
 
+/*
+ * BFQ uses the above-detailed, time-based weight-raising mechanism to
+ * privilege interactive tasks. This mechanism is vulnerable to the
+ * following false positives: I/O-bound applications that will go on
+ * doing I/O for much longer than the duration of weight
+ * raising. These applications have basically no benefit from being
+ * weight-raised at the beginning of their I/O. On the opposite end,
+ * while being weight-raised, these applications
+ * a) unjustly steal throughput to applications that may actually need
+ * low latency;
+ * b) make BFQ uselessly perform device idling; device idling results
+ * in loss of device throughput with most flash-based storage, and may
+ * increase latencies when used purposelessly.
+ *
+ * BFQ tries to reduce these problems, by adopting the following
+ * countermeasure. To introduce this countermeasure, we need first to
+ * finish explaining how the duration of weight-raising for
+ * interactive tasks is computed.
+ *
+ * For a bfq_queue deemed as interactive, the duration of weight
+ * raising is dynamically adjusted, as a function of the estimated
+ * peak rate of the device, so as to be equal to the time needed to
+ * execute the 'largest' interactive task we benchmarked so far. By
+ * largest task, we mean the task for which each involved process has
+ * to do more I/O than for any of the other tasks we benchmarked. This
+ * reference interactive task is the start-up of LibreOffice Writer,
+ * and in this task each process/bfq_queue needs to have at most ~110K
+ * sectors transferred.
+ *
+ * This last piece of information enables BFQ to reduce the actual
+ * duration of weight-raising for at least one class of I/O-bound
+ * applications: those doing sequential or quasi-sequential I/O. An
+ * example is file copy. In fact, once started, the main I/O-bound
+ * processes of these applications usually consume the above 110K
+ * sectors in much less time than the processes of an application that
+ * is starting, because these I/O-bound processes will greedily devote
+ * almost all their CPU cycles only to their target,
+ * throughput-friendly I/O operations. This is even more true if BFQ
+ * happens to be underestimating the device peak rate, and thus
+ * overestimating the duration of weight raising. But, according to
+ * our measurements, once transferred 110K sectors, these processes
+ * have no right to be weight-raised any longer.
+ *
+ * Basing on the last consideration, BFQ ends weight-raising for a
+ * bfq_queue if the latter happens to have received an amount of
+ * service at least equal to the following constant. The constant is
+ * set to slightly more than 110K, to have a minimum safety margin.
+ *
+ * This early ending of weight-raising reduces the amount of time
+ * during which interactive false positives cause the two problems
+ * described at the beginning of these comments.
+ */
+static const unsigned long max_service_from_wr = 120000;
+
 #define RQ_BIC(rq)		icq_to_bic((rq)->elv.priv[0])
 #define RQ_BFQQ(rq)		((rq)->elv.priv[1])
 
@@ -417,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
 	}
 }
 
+/*
+ * See the comments on bfq_limit_depth for the purpose of
+ * the depths set in the function.
+ */
+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
+{
+	bfqd->sb_shift = bt->sb.shift;
+
+	/*
+	 * In-word depths if no bfq_queue is being weight-raised:
+	 * leaving 25% of tags only for sync reads.
+	 *
+	 * In next formulas, right-shift the value
+	 * (1U<<bfqd->sb_shift), instead of computing directly
+	 * (1U<<(bfqd->sb_shift - something)), to be robust against
+	 * any possible value of bfqd->sb_shift, without having to
+	 * limit 'something'.
+	 */
+	/* no more than 50% of tags for async I/O */
+	bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
+	/*
+	 * no more than 75% of tags for sync writes (25% extra tags
+	 * w.r.t. async I/O, to prevent async I/O from starving sync
+	 * writes)
+	 */
+	bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
+
+	/*
+	 * In-word depths in case some bfq_queue is being weight-
+	 * raised: leaving ~63% of tags for sync reads. This is the
+	 * highest percentage for which, in our tests, application
+	 * start-up times didn't suffer from any regression due to tag
+	 * shortage.
+	 */
+	/* no more than ~18% of tags for async I/O */
+	bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
+	/* no more than ~37% of tags for sync writes (~20% extra tags) */
+	bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
+}
+
+/*
+ * Async I/O can easily starve sync I/O (both sync reads and sync
+ * writes), by consuming all tags. Similarly, storms of sync writes,
+ * such as those that sync(2) may trigger, can starve sync reads.
+ * Limit depths of async I/O and sync writes so as to counter both
+ * problems.
+ */
+static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+{
+	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+	struct bfq_data *bfqd = data->q->elevator->elevator_data;
+	struct sbitmap_queue *bt;
+
+	if (op_is_sync(op) && !op_is_write(op))
+		return;
+
+	if (data->flags & BLK_MQ_REQ_RESERVED) {
+		if (unlikely(!tags->nr_reserved_tags)) {
+			WARN_ON_ONCE(1);
+			return;
+		}
+		bt = &tags->breserved_tags;
+	} else
+		bt = &tags->bitmap_tags;
+
+	if (unlikely(bfqd->sb_shift != bt->sb.shift))
+		bfq_update_depths(bfqd, bt);
+
+	data->shallow_depth =
+		bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+
+	bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
+			__func__, bfqd->wr_busy_queues, op_is_sync(op),
+			data->shallow_depth);
+}
+
 static struct bfq_queue *
 bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
 		     sector_t sector, struct rb_node **ret_parent,
@@ -1276,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
 	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
 		/* start a weight-raising period */
 		if (interactive) {
+			bfqq->service_from_wr = 0;
 			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
 			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
 		} else {
@@ -3589,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 				bfqq->entity.prio_changed = 1;
 			}
 		}
+		if (bfqq->wr_coeff > 1 &&
+		    bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time &&
+		    bfqq->service_from_wr > max_service_from_wr) {
+			/* see comments on max_service_from_wr */
+			bfq_bfqq_end_wr(bfqq);
+		}
 	}
 	/*
 	 * To improve latency (for this or other queues), immediately
@@ -5285,6 +5424,7 @@ static struct elv_fs_entry bfq_attrs[] = {
 
 static struct elevator_type iosched_bfq_mq = {
 	.ops.mq = {
+		.limit_depth		= bfq_limit_depth,
 		.prepare_request	= bfq_prepare_request,
 		.finish_request		= bfq_finish_request,
 		.exit_icq		= bfq_exit_icq,
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 5d47b58d5fc8..350c39ae2896 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -337,6 +337,11 @@ struct bfq_queue {
 	 * last transition from idle to backlogged.
 	 */
 	unsigned long service_from_backlogged;
+	/*
+	 * Cumulative service received from the @bfq_queue since its
+	 * last transition to weight-raised state.
+	 */
+	unsigned long service_from_wr;
 
 	/*
 	 * Value of wr start time when switching to soft rt
@@ -629,6 +634,18 @@ struct bfq_data {
 	struct bfq_io_cq *bio_bic;
 	/* bfqq associated with the task issuing current bio for merging */
 	struct bfq_queue *bio_bfqq;
+
+	/*
+	 * Cached sbitmap shift, used to compute depth limits in
+	 * bfq_update_depths.
+	 */
+	unsigned int sb_shift;
+
+	/*
+	 * Depth limits used in bfq_limit_depth (see comments on the
+	 * function)
+	 */
+	unsigned int word_depths[2][2];
 };
 
 enum bfqq_state_flags {
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 4456eda34e48..4498c43245e2 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -838,6 +838,9 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
 	if (!bfqq->service_from_backlogged)
 		bfqq->first_IO_time = jiffies;
 
+	if (bfqq->wr_coeff > 1)
+		bfqq->service_from_wr += served;
+
 	bfqq->service_from_backlogged += served;
 	for_each_entity(entity) {
 		st = bfq_entity_service_tree(entity);
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 23b42e8aa03e..9cfdd6c83b5b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 /**
  * __bio_integrity_endio - Integrity I/O completion function
  * @bio:	Protected bio
- * @error:	Pointer to errno
  *
  * Description: Completion for integrity I/O
  *
diff --git a/block/blk-core.c b/block/blk-core.c
index 7ba607527487..c21a16e9fdf9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2062,6 +2062,21 @@ static inline bool should_fail_request(struct hd_struct *part,
 
 #endif /* CONFIG_FAIL_MAKE_REQUEST */
 
+static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
+{
+	if (part->policy && op_is_write(bio_op(bio))) {
+		char b[BDEVNAME_SIZE];
+
+		printk(KERN_ERR
+		       "generic_make_request: Trying to write "
+			"to read-only block-device %s (partno %d)\n",
+			bio_devname(bio, b), part->partno);
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -2070,27 +2085,28 @@ static inline int blk_partition_remap(struct bio *bio)
 	struct hd_struct *p;
 	int ret = 0;
 
+	rcu_read_lock();
+	p = __disk_get_part(bio->bi_disk, bio->bi_partno);
+	if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
+		     bio_check_ro(bio, p))) {
+		ret = -EIO;
+		goto out;
+	}
+
 	/*
 	 * Zone reset does not include bi_size so bio_sectors() is always 0.
 	 * Include a test for the reset op code and perform the remap if needed.
 	 */
-	if (!bio->bi_partno ||
-	    (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET))
-		return 0;
+	if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
+		goto out;
 
-	rcu_read_lock();
-	p = __disk_get_part(bio->bi_disk, bio->bi_partno);
-	if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) {
-		bio->bi_iter.bi_sector += p->start_sect;
-		bio->bi_partno = 0;
-		trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
-				bio->bi_iter.bi_sector - p->start_sect);
-	} else {
-		printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
-		ret = -EIO;
-	}
-	rcu_read_unlock();
+	bio->bi_iter.bi_sector += p->start_sect;
+	bio->bi_partno = 0;
+	trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
+			      bio->bi_iter.bi_sector - p->start_sect);
 
+out:
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -2149,15 +2165,19 @@ generic_make_request_checks(struct bio *bio)
 	 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
 	 * if queue is not a request based queue.
 	 */
-
 	if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
 		goto not_supported;
 
 	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
 		goto end_io;
 
-	if (blk_partition_remap(bio))
-		goto end_io;
+	if (!bio->bi_partno) {
+		if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+			goto end_io;
+	} else {
+		if (blk_partition_remap(bio))
+			goto end_io;
+	}
 
 	if (bio_check_eod(bio, nr_sectors))
 		goto end_io;
@@ -2500,8 +2520,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
 		 * bypass a potential scheduler on the bottom device for
 		 * insert.
 		 */
-		blk_mq_request_bypass_insert(rq, true);
-		return BLK_STS_OK;
+		return blk_mq_request_direct_issue(rq);
 	}
 
 	spin_lock_irqsave(q->queue_lock, flags);
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 5c0f3dc446dc..f7b292f12449 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	 * be reused after dying flag is set
 	 */
 	if (q->mq_ops) {
-		blk_mq_sched_insert_request(rq, at_head, true, false, false);
+		blk_mq_sched_insert_request(rq, at_head, true, false);
 		return;
 	}
 
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2bc544ce3d2e..a676084d4740 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	if (!q)
 		return -ENXIO;
 
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
 	if (flags & BLKDEV_DISCARD_SECURE) {
 		if (!blk_queue_secure_erase(q))
 			return -EOPNOTSUPP;
@@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 	if (!q)
 		return -ENXIO;
 
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
 	bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
 	if ((sector | nr_sects) & bs_mask)
 		return -EINVAL;
@@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 	if (!q)
 		return -ENXIO;
 
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
 	/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
 	max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
 
@@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
 	if (!q)
 		return -ENXIO;
 
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
 	while (nr_sects != 0) {
 		bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
 			       gfp_mask);
diff --git a/block/blk-map.c b/block/blk-map.c
index b21f8e86f120..209eb3b45c54 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -114,7 +114,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
 	struct bio *bio = NULL;
 	struct iov_iter i;
-	int ret;
+	int ret = -EINVAL;
 
 	if (!iter_is_iovec(iter))
 		goto fail;
@@ -143,7 +143,7 @@ unmap_rq:
 	__blk_rq_unmap_user(bio);
 fail:
 	rq->bio = NULL;
-	return -EINVAL;
+	return ret;
 }
 EXPORT_SYMBOL(blk_rq_map_user_iov);
 
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 2ff7cf0cbf73..55c0a745b427 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -427,7 +427,7 @@ done:
 }
 
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
-				 bool run_queue, bool async, bool can_block)
+				 bool run_queue, bool async)
 {
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index ba1d1418a96d..1e9c9018ace1 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
 void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
-				 bool run_queue, bool async, bool can_block);
+				 bool run_queue, bool async);
 void blk_mq_sched_insert_requests(struct request_queue *q,
 				  struct blk_mq_ctx *ctx,
 				  struct list_head *list, bool run_queue_async);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 79969c3c234f..a54b4b070f1c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
 	return ret;
 }
 
-static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
+void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
@@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 	q->mq_sysfs_init_done = false;
 }
 
-void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
-{
-	mutex_lock(&q->sysfs_lock);
-	__blk_mq_unregister_dev(dev, q);
-	mutex_unlock(&q->sysfs_lock);
-}
-
 void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
 {
 	kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c8f62e6be6b6..74a4f237ba91 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -745,13 +745,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
 
 		rq->rq_flags &= ~RQF_SOFTBARRIER;
 		list_del_init(&rq->queuelist);
-		blk_mq_sched_insert_request(rq, true, false, false, true);
+		blk_mq_sched_insert_request(rq, true, false, false);
 	}
 
 	while (!list_empty(&rq_list)) {
 		rq = list_entry(rq_list.next, struct request, queuelist);
 		list_del_init(&rq->queuelist);
-		blk_mq_sched_insert_request(rq, false, false, false, true);
+		blk_mq_sched_insert_request(rq, false, false, false);
 	}
 
 	blk_mq_run_hw_queues(q, false);
@@ -1294,9 +1294,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	/*
 	 * We should be running this queue from one of the CPUs that
 	 * are mapped to it.
+	 *
+	 * There are at least two related races now between setting
+	 * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
+	 * __blk_mq_run_hw_queue():
+	 *
+	 * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
+	 *   but later it becomes online, then this warning is harmless
+	 *   at all
+	 *
+	 * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
+	 *   but later it becomes offline, then the warning can't be
+	 *   triggered, and we depend on blk-mq timeout handler to
+	 *   handle dispatched requests to this hctx
 	 */
-	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
-		cpu_online(hctx->next_cpu));
+	if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+		cpu_online(hctx->next_cpu)) {
+		printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
+			raw_smp_processor_id(),
+			cpumask_empty(hctx->cpumask) ? "inactive": "active");
+		dump_stack();
+	}
 
 	/*
 	 * We can't run the queue inline with ints disabled. Ensure that
@@ -1319,21 +1337,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  */
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
+	bool tried = false;
+
 	if (hctx->queue->nr_hw_queues == 1)
 		return WORK_CPU_UNBOUND;
 
 	if (--hctx->next_cpu_batch <= 0) {
 		int next_cpu;
-
+select_cpu:
 		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
 				cpu_online_mask);
 		if (next_cpu >= nr_cpu_ids)
 			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
 
-		hctx->next_cpu = next_cpu;
+		/*
+		 * No online CPU is found, so have to make sure hctx->next_cpu
+		 * is set correctly for not breaking workqueue.
+		 */
+		if (next_cpu >= nr_cpu_ids)
+			hctx->next_cpu = cpumask_first(hctx->cpumask);
+		else
+			hctx->next_cpu = next_cpu;
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}
 
+	/*
+	 * Do unbound schedule if we can't find a online CPU for this hctx,
+	 * and it should only happen in the path of handling CPU DEAD.
+	 */
+	if (!cpu_online(hctx->next_cpu)) {
+		if (!tried) {
+			tried = true;
+			goto select_cpu;
+		}
+
+		/*
+		 * Make sure to re-select CPU next time once after CPUs
+		 * in hctx->cpumask become online again.
+		 */
+		hctx->next_cpu_batch = 1;
+		return WORK_CPU_UNBOUND;
+	}
 	return hctx->next_cpu;
 }
 
@@ -1694,9 +1738,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
 }
 
-static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
-					struct request *rq,
-					blk_qc_t *cookie)
+static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
+					    struct request *rq,
+					    blk_qc_t *cookie)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_queue_data bd = {
@@ -1705,15 +1749,52 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	};
 	blk_qc_t new_cookie;
 	blk_status_t ret;
+
+	new_cookie = request_to_qc_t(hctx, rq);
+
+	/*
+	 * For OK queue, we are done. For error, caller may kill it.
+	 * Any other error (busy), just add it to our list as we
+	 * previously would have done.
+	 */
+	ret = q->mq_ops->queue_rq(hctx, &bd);
+	switch (ret) {
+	case BLK_STS_OK:
+		*cookie = new_cookie;
+		break;
+	case BLK_STS_RESOURCE:
+		__blk_mq_requeue_request(rq);
+		break;
+	default:
+		*cookie = BLK_QC_T_NONE;
+		break;
+	}
+
+	return ret;
+}
+
+static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+						struct request *rq,
+						blk_qc_t *cookie,
+						bool bypass_insert)
+{
+	struct request_queue *q = rq->q;
 	bool run_queue = true;
 
-	/* RCU or SRCU read lock is needed before checking quiesced flag */
+	/*
+	 * RCU or SRCU read lock is needed before checking quiesced flag.
+	 *
+	 * When queue is stopped or quiesced, ignore 'bypass_insert' from
+	 * blk_mq_request_direct_issue(), and return BLK_STS_OK to caller,
+	 * and avoid driver to try to dispatch again.
+	 */
 	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
 		run_queue = false;
+		bypass_insert = false;
 		goto insert;
 	}
 
-	if (q->elevator)
+	if (q->elevator && !bypass_insert)
 		goto insert;
 
 	if (!blk_mq_get_driver_tag(rq, NULL, false))
@@ -1724,42 +1805,47 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		goto insert;
 	}
 
-	new_cookie = request_to_qc_t(hctx, rq);
-
-	/*
-	 * For OK queue, we are done. For error, kill it. Any other
-	 * error (busy), just add it to our list as we previously
-	 * would have done
-	 */
-	ret = q->mq_ops->queue_rq(hctx, &bd);
-	switch (ret) {
-	case BLK_STS_OK:
-		*cookie = new_cookie;
-		return;
-	case BLK_STS_RESOURCE:
-		__blk_mq_requeue_request(rq);
-		goto insert;
-	default:
-		*cookie = BLK_QC_T_NONE;
-		blk_mq_end_request(rq, ret);
-		return;
-	}
-
+	return __blk_mq_issue_directly(hctx, rq, cookie);
 insert:
-	blk_mq_sched_insert_request(rq, false, run_queue, false,
-					hctx->flags & BLK_MQ_F_BLOCKING);
+	if (bypass_insert)
+		return BLK_STS_RESOURCE;
+
+	blk_mq_sched_insert_request(rq, false, run_queue, false);
+	return BLK_STS_OK;
 }
 
 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, blk_qc_t *cookie)
 {
+	blk_status_t ret;
 	int srcu_idx;
 
 	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
 	hctx_lock(hctx, &srcu_idx);
-	__blk_mq_try_issue_directly(hctx, rq, cookie);
+
+	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
+	if (ret == BLK_STS_RESOURCE)
+		blk_mq_sched_insert_request(rq, false, true, false);
+	else if (ret != BLK_STS_OK)
+		blk_mq_end_request(rq, ret);
+
+	hctx_unlock(hctx, srcu_idx);
+}
+
+blk_status_t blk_mq_request_direct_issue(struct request *rq)
+{
+	blk_status_t ret;
+	int srcu_idx;
+	blk_qc_t unused_cookie;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+
+	hctx_lock(hctx, &srcu_idx);
+	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
 	hctx_unlock(hctx, srcu_idx);
+
+	return ret;
 }
 
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
@@ -1870,7 +1956,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	} else if (q->elevator) {
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
-		blk_mq_sched_insert_request(rq, false, true, true, true);
+		blk_mq_sched_insert_request(rq, false, true, true);
 	} else {
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 8591a54d989b..e3ebc93646ca 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -74,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 				struct list_head *list);
 
+/* Used by blk_insert_cloned_request() to issue request directly */
+blk_status_t blk_mq_request_direct_issue(struct request *rq);
+
 /*
  * CPU -> queue mappings
  */
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 870484eaed1f..cbea895a5547 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = {
 	.release	= blk_release_queue,
 };
 
+/**
+ * blk_register_queue - register a block layer queue with sysfs
+ * @disk: Disk of which the request queue should be registered with sysfs.
+ */
 int blk_register_queue(struct gendisk *disk)
 {
 	int ret;
@@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk)
 	if (q->request_fn || (q->mq_ops && q->elevator)) {
 		ret = elv_register_queue(q);
 		if (ret) {
+			mutex_unlock(&q->sysfs_lock);
 			kobject_uevent(&q->kobj, KOBJ_REMOVE);
 			kobject_del(&q->kobj);
 			blk_trace_remove_sysfs(dev);
 			kobject_put(&dev->kobj);
-			goto unlock;
+			return ret;
 		}
 	}
 	ret = 0;
@@ -921,7 +926,15 @@ unlock:
 	mutex_unlock(&q->sysfs_lock);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(blk_register_queue);
 
+/**
+ * blk_unregister_queue - counterpart of blk_register_queue()
+ * @disk: Disk of which the request queue should be unregistered from sysfs.
+ *
+ * Note: the caller is responsible for guaranteeing that this function is called
+ * after blk_register_queue() has finished.
+ */
 void blk_unregister_queue(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
@@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk)
 	if (WARN_ON(!q))
 		return;
 
-	mutex_lock(&q->sysfs_lock);
-	queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
-	mutex_unlock(&q->sysfs_lock);
+	/* Return early if disk->queue was never registered. */
+	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+		return;
 
-	wbt_exit(q);
+	/*
+	 * Since sysfs_remove_dir() prevents adding new directory entries
+	 * before removal of existing entries starts, protect against
+	 * concurrent elv_iosched_store() calls.
+	 */
+	mutex_lock(&q->sysfs_lock);
 
+	spin_lock_irq(q->queue_lock);
+	queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
+	spin_unlock_irq(q->queue_lock);
 
+	/*
+	 * Remove the sysfs attributes before unregistering the queue data
+	 * structures that can be modified through sysfs.
+	 */
 	if (q->mq_ops)
 		blk_mq_unregister_dev(disk_to_dev(disk), q);
-
-	if (q->request_fn || (q->mq_ops && q->elevator))
-		elv_unregister_queue(q);
+	mutex_unlock(&q->sysfs_lock);
 
 	kobject_uevent(&q->kobj, KOBJ_REMOVE);
 	kobject_del(&q->kobj);
 	blk_trace_remove_sysfs(disk_to_dev(disk));
+
+	wbt_exit(q);
+
+	mutex_lock(&q->sysfs_lock);
+	if (q->request_fn || (q->mq_ops && q->elevator))
+		elv_unregister_queue(q);
+	mutex_unlock(&q->sysfs_lock);
+
 	kobject_put(&disk_to_dev(disk)->kobj);
 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 825bc29767e6..e136f5ef9577 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -216,9 +216,9 @@ struct throtl_data
 
 	unsigned int scale;
 
-	struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
-	struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
-	struct latency_bucket __percpu *latency_buckets;
+	struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
+	struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
+	struct latency_bucket __percpu *latency_buckets[2];
 	unsigned long last_calculate_time;
 	unsigned long filtered_latency;
 
@@ -1511,10 +1511,20 @@ static struct cftype throtl_legacy_files[] = {
 		.seq_show = blkg_print_stat_bytes,
 	},
 	{
+		.name = "throttle.io_service_bytes_recursive",
+		.private = (unsigned long)&blkcg_policy_throtl,
+		.seq_show = blkg_print_stat_bytes_recursive,
+	},
+	{
 		.name = "throttle.io_serviced",
 		.private = (unsigned long)&blkcg_policy_throtl,
 		.seq_show = blkg_print_stat_ios,
 	},
+	{
+		.name = "throttle.io_serviced_recursive",
+		.private = (unsigned long)&blkcg_policy_throtl,
+		.seq_show = blkg_print_stat_ios_recursive,
+	},
 	{ }	/* terminate */
 };
 
@@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 static void throtl_update_latency_buckets(struct throtl_data *td)
 {
-	struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
-	int i, cpu;
-	unsigned long last_latency = 0;
-	unsigned long latency;
+	struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
+	int i, cpu, rw;
+	unsigned long last_latency[2] = { 0 };
+	unsigned long latency[2];
 
 	if (!blk_queue_nonrot(td->queue))
 		return;
@@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
 	td->last_calculate_time = jiffies;
 
 	memset(avg_latency, 0, sizeof(avg_latency));
-	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-		struct latency_bucket *tmp = &td->tmp_buckets[i];
-
-		for_each_possible_cpu(cpu) {
-			struct latency_bucket *bucket;
-
-			/* this isn't race free, but ok in practice */
-			bucket = per_cpu_ptr(td->latency_buckets, cpu);
-			tmp->total_latency += bucket[i].total_latency;
-			tmp->samples += bucket[i].samples;
-			bucket[i].total_latency = 0;
-			bucket[i].samples = 0;
-		}
+	for (rw = READ; rw <= WRITE; rw++) {
+		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+			struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
+
+			for_each_possible_cpu(cpu) {
+				struct latency_bucket *bucket;
+
+				/* this isn't race free, but ok in practice */
+				bucket = per_cpu_ptr(td->latency_buckets[rw],
+					cpu);
+				tmp->total_latency += bucket[i].total_latency;
+				tmp->samples += bucket[i].samples;
+				bucket[i].total_latency = 0;
+				bucket[i].samples = 0;
+			}
 
-		if (tmp->samples >= 32) {
-			int samples = tmp->samples;
+			if (tmp->samples >= 32) {
+				int samples = tmp->samples;
 
-			latency = tmp->total_latency;
+				latency[rw] = tmp->total_latency;
 
-			tmp->total_latency = 0;
-			tmp->samples = 0;
-			latency /= samples;
-			if (latency == 0)
-				continue;
-			avg_latency[i].latency = latency;
+				tmp->total_latency = 0;
+				tmp->samples = 0;
+				latency[rw] /= samples;
+				if (latency[rw] == 0)
+					continue;
+				avg_latency[rw][i].latency = latency[rw];
+			}
 		}
 	}
 
-	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-		if (!avg_latency[i].latency) {
-			if (td->avg_buckets[i].latency < last_latency)
-				td->avg_buckets[i].latency = last_latency;
-			continue;
-		}
+	for (rw = READ; rw <= WRITE; rw++) {
+		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+			if (!avg_latency[rw][i].latency) {
+				if (td->avg_buckets[rw][i].latency < last_latency[rw])
+					td->avg_buckets[rw][i].latency =
+						last_latency[rw];
+				continue;
+			}
 
-		if (!td->avg_buckets[i].valid)
-			latency = avg_latency[i].latency;
-		else
-			latency = (td->avg_buckets[i].latency * 7 +
-				avg_latency[i].latency) >> 3;
+			if (!td->avg_buckets[rw][i].valid)
+				latency[rw] = avg_latency[rw][i].latency;
+			else
+				latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
+					avg_latency[rw][i].latency) >> 3;
 
-		td->avg_buckets[i].latency = max(latency, last_latency);
-		td->avg_buckets[i].valid = true;
-		last_latency = td->avg_buckets[i].latency;
+			td->avg_buckets[rw][i].latency = max(latency[rw],
+				last_latency[rw]);
+			td->avg_buckets[rw][i].valid = true;
+			last_latency[rw] = td->avg_buckets[rw][i].latency;
+		}
 	}
 
 	for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
 		throtl_log(&td->service_queue,
-			"Latency bucket %d: latency=%ld, valid=%d", i,
-			td->avg_buckets[i].latency, td->avg_buckets[i].valid);
+			"Latency bucket %d: read latency=%ld, read valid=%d, "
+			"write latency=%ld, write valid=%d", i,
+			td->avg_buckets[READ][i].latency,
+			td->avg_buckets[READ][i].valid,
+			td->avg_buckets[WRITE][i].latency,
+			td->avg_buckets[WRITE][i].valid);
 }
 #else
 static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ -2248,16 +2269,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
 	struct latency_bucket *latency;
 	int index;
 
-	if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+	if (!td || td->limit_index != LIMIT_LOW ||
+	    !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
 	    !blk_queue_nonrot(td->queue))
 		return;
 
 	index = request_bucket_index(size);
 
-	latency = get_cpu_ptr(td->latency_buckets);
+	latency = get_cpu_ptr(td->latency_buckets[op]);
 	latency[index].total_latency += time;
 	latency[index].samples++;
-	put_cpu_ptr(td->latency_buckets);
+	put_cpu_ptr(td->latency_buckets[op]);
 }
 
 void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@@ -2276,6 +2298,7 @@ void blk_throtl_bio_endio(struct bio *bio)
 	unsigned long finish_time;
 	unsigned long start_time;
 	unsigned long lat;
+	int rw = bio_data_dir(bio);
 
 	tg = bio->bi_cg_private;
 	if (!tg)
@@ -2304,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio)
 
 		bucket = request_bucket_index(
 			blk_stat_size(&bio->bi_issue_stat));
-		threshold = tg->td->avg_buckets[bucket].latency +
+		threshold = tg->td->avg_buckets[rw][bucket].latency +
 			tg->latency_target;
 		if (lat > threshold)
 			tg->bad_bio_cnt++;
@@ -2397,9 +2420,16 @@ int blk_throtl_init(struct request_queue *q)
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
 	if (!td)
 		return -ENOMEM;
-	td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+	td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
 		LATENCY_BUCKET_SIZE, __alignof__(u64));
-	if (!td->latency_buckets) {
+	if (!td->latency_buckets[READ]) {
+		kfree(td);
+		return -ENOMEM;
+	}
+	td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
+		LATENCY_BUCKET_SIZE, __alignof__(u64));
+	if (!td->latency_buckets[WRITE]) {
+		free_percpu(td->latency_buckets[READ]);
 		kfree(td);
 		return -ENOMEM;
 	}
@@ -2418,7 +2448,8 @@ int blk_throtl_init(struct request_queue *q)
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
 	if (ret) {
-		free_percpu(td->latency_buckets);
+		free_percpu(td->latency_buckets[READ]);
+		free_percpu(td->latency_buckets[WRITE]);
 		kfree(td);
 	}
 	return ret;
@@ -2429,7 +2460,8 @@ void blk_throtl_exit(struct request_queue *q)
 	BUG_ON(!q->td);
 	throtl_shutdown_wq(q);
 	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
-	free_percpu(q->td->latency_buckets);
+	free_percpu(q->td->latency_buckets[READ]);
+	free_percpu(q->td->latency_buckets[WRITE]);
 	kfree(q->td);
 }
 
@@ -2447,8 +2479,10 @@ void blk_throtl_register_queue(struct request_queue *q)
 	} else {
 		td->throtl_slice = DFL_THROTL_SLICE_HD;
 		td->filtered_latency = LATENCY_FILTERED_HD;
-		for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
-			td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
+		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+			td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
+			td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
+		}
 	}
 #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
 	/* if no low limit, use previous default */
diff --git a/block/blk.h b/block/blk.h
index c84ae0e21ebd..b1771851ed92 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -162,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
 		e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
 }
 
+int elv_register_queue(struct request_queue *q);
+void elv_unregister_queue(struct request_queue *q);
+
 struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
 
 #ifdef CONFIG_FAIL_IO_TIMEOUT
diff --git a/block/elevator.c b/block/elevator.c
index 138faeb08a7c..e87e9b43aba0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q)
 	struct elevator_queue *e = q->elevator;
 	int error;
 
+	lockdep_assert_held(&q->sysfs_lock);
+
 	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
 	if (!error) {
 		struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q)
 	}
 	return error;
 }
-EXPORT_SYMBOL(elv_register_queue);
 
 void elv_unregister_queue(struct request_queue *q)
 {
+	lockdep_assert_held(&q->sysfs_lock);
+
 	if (q) {
 		struct elevator_queue *e = q->elevator;
 
@@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q)
 		wbt_enable_default(q);
 	}
 }
-EXPORT_SYMBOL(elv_unregister_queue);
 
 int elv_register(struct elevator_type *e)
 {
@@ -967,6 +969,8 @@ static int elevator_switch_mq(struct request_queue *q,
 {
 	int ret;
 
+	lockdep_assert_held(&q->sysfs_lock);
+
 	blk_mq_freeze_queue(q);
 	blk_mq_quiesce_queue(q);
 
@@ -1012,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	bool old_registered = false;
 	int err;
 
+	lockdep_assert_held(&q->sysfs_lock);
+
 	if (q->mq_ops)
 		return elevator_switch_mq(q, new_e);
 
diff --git a/block/genhd.c b/block/genhd.c
index 96a66f671720..88a53c188cb7 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -629,16 +629,18 @@ exit:
 }
 
 /**
- * device_add_disk - add partitioning information to kernel list
+ * __device_add_disk - add disk information to kernel list
  * @parent: parent device for the disk
  * @disk: per-device partitioning information
+ * @register_queue: register the queue if set to true
  *
  * This function registers the partitioning information in @disk
  * with the kernel.
  *
  * FIXME: error handling
  */
-void device_add_disk(struct device *parent, struct gendisk *disk)
+static void __device_add_disk(struct device *parent, struct gendisk *disk,
+			      bool register_queue)
 {
 	dev_t devt;
 	int retval;
@@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
 				    exact_match, exact_lock, disk);
 	}
 	register_disk(parent, disk);
-	blk_register_queue(disk);
+	if (register_queue)
+		blk_register_queue(disk);
 
 	/*
 	 * Take an extra ref on queue which will be put on disk_release()
@@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
 	disk_add_events(disk);
 	blk_integrity_add(disk);
 }
+
+void device_add_disk(struct device *parent, struct gendisk *disk)
+{
+	__device_add_disk(parent, disk, true);
+}
 EXPORT_SYMBOL(device_add_disk);
 
+void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
+{
+	__device_add_disk(parent, disk, false);
+}
+EXPORT_SYMBOL(device_add_disk_no_queue_reg);
+
 void del_gendisk(struct gendisk *disk)
 {
 	struct disk_part_iter piter;
@@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk)
 		 * Unregister bdi before releasing device numbers (as they can
 		 * get reused and we'd get clashes in sysfs).
 		 */
-		bdi_unregister(disk->queue->backing_dev_info);
+		if (!(disk->flags & GENHD_FL_HIDDEN))
+			bdi_unregister(disk->queue->backing_dev_info);
 		blk_unregister_queue(disk);
 	} else {
 		WARN_ON(1);
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 9220f8e833d0..c0ebda1283cc 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -112,8 +112,7 @@ enum frame_flags {
 struct frame {
 	struct list_head head;
 	u32 tag;
-	struct timeval sent;	/* high-res time packet was sent */
-	u32 sent_jiffs;		/* low-res jiffies-based sent time */
+	ktime_t sent;			/* high-res time packet was sent */
 	ulong waited;
 	ulong waited_total;
 	struct aoetgt *t;		/* parent target I belong to */
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 812fed069708..540bb60cd071 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -398,8 +398,7 @@ aoecmd_ata_rw(struct aoedev *d)
 
 	skb = skb_clone(f->skb, GFP_ATOMIC);
 	if (skb) {
-		do_gettimeofday(&f->sent);
-		f->sent_jiffs = (u32) jiffies;
+		f->sent = ktime_get();
 		__skb_queue_head_init(&queue);
 		__skb_queue_tail(&queue, skb);
 		aoenet_xmit(&queue);
@@ -489,8 +488,7 @@ resend(struct aoedev *d, struct frame *f)
 	skb = skb_clone(skb, GFP_ATOMIC);
 	if (skb == NULL)
 		return;
-	do_gettimeofday(&f->sent);
-	f->sent_jiffs = (u32) jiffies;
+	f->sent = ktime_get();
 	__skb_queue_head_init(&queue);
 	__skb_queue_tail(&queue, skb);
 	aoenet_xmit(&queue);
@@ -499,33 +497,17 @@ resend(struct aoedev *d, struct frame *f)
 static int
 tsince_hr(struct frame *f)
 {
-	struct timeval now;
-	int n;
+	u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent));
 
-	do_gettimeofday(&now);
-	n = now.tv_usec - f->sent.tv_usec;
-	n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
+	/* delta is normally under 4.2 seconds, avoid 64-bit division */
+	if (likely(delta <= UINT_MAX))
+		return (u32)delta / NSEC_PER_USEC;
 
-	if (n < 0)
-		n = -n;
+	/* avoid overflow after 71 minutes */
+	if (delta > ((u64)INT_MAX * NSEC_PER_USEC))
+		return INT_MAX;
 
-	/* For relatively long periods, use jiffies to avoid
-	 * discrepancies caused by updates to the system time.
-	 *
-	 * On system with HZ of 1000, 32-bits is over 49 days
-	 * worth of jiffies, or over 71 minutes worth of usecs.
-	 *
-	 * Jiffies overflow is handled by subtraction of unsigned ints:
-	 * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
-	 * $3 = 4
-	 * (gdb)
-	 */
-	if (n > USEC_PER_SEC / 4) {
-		n = ((u32) jiffies) - f->sent_jiffs;
-		n *= USEC_PER_SEC / HZ;
-	}
-
-	return n;
+	return div_u64(delta, NSEC_PER_USEC);
 }
 
 static int
@@ -589,7 +571,6 @@ reassign_frame(struct frame *f)
 	nf->waited = 0;
 	nf->waited_total = f->waited_total;
 	nf->sent = f->sent;
-	nf->sent_jiffs = f->sent_jiffs;
 	f->skb = skb;
 
 	return nf;
@@ -633,8 +614,7 @@ probe(struct aoetgt *t)
 
 	skb = skb_clone(f->skb, GFP_ATOMIC);
 	if (skb) {
-		do_gettimeofday(&f->sent);
-		f->sent_jiffs = (u32) jiffies;
+		f->sent = ktime_get();
 		__skb_queue_head_init(&queue);
 		__skb_queue_tail(&queue, skb);
 		aoenet_xmit(&queue);
@@ -1432,10 +1412,8 @@ aoecmd_ata_id(struct aoedev *d)
 	d->timer.function = rexmit_timer;
 
 	skb = skb_clone(skb, GFP_ATOMIC);
-	if (skb) {
-		do_gettimeofday(&f->sent);
-		f->sent_jiffs = (u32) jiffies;
-	}
+	if (skb)
+		f->sent = ktime_get();
 
 	return skb;
 }
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 9d32f25489c2..b7d175e94a02 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error)
 	dm_complete_request(tio->orig, error);
 }
 
-static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
+static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq)
 {
 	blk_status_t r;
 
@@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 
 	clone->start_time = jiffies;
 	r = blk_insert_cloned_request(clone->q, clone);
-	if (r)
+	if (r != BLK_STS_OK && r != BLK_STS_RESOURCE)
 		/* must complete clone in terms of original request */
 		dm_complete_request(rq, r);
+	return r;
 }
 
 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
@@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio)
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
 	struct request *clone = NULL;
+	blk_status_t ret;
 
 	r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+check_again:
 	switch (r) {
 	case DM_MAPIO_SUBMITTED:
 		/* The target has taken the I/O to submit by itself later */
@@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio)
 		/* The target has remapped the I/O so dispatch it */
 		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
 				     blk_rq_pos(rq));
-		dm_dispatch_clone_request(clone, rq);
+		ret = dm_dispatch_clone_request(clone, rq);
+		if (ret == BLK_STS_RESOURCE) {
+			blk_rq_unprep_clone(clone);
+			tio->ti->type->release_clone_rq(clone);
+			tio->clone = NULL;
+			if (!rq->q->mq_ops)
+				r = DM_MAPIO_DELAY_REQUEUE;
+			else
+				r = DM_MAPIO_REQUEUE;
+			goto check_again;
+		}
 		break;
 	case DM_MAPIO_REQUEUE:
 		/* The target wants to requeue the I/O */
@@ -713,8 +726,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
 		return error;
 	}
 
-	elv_register_queue(md->queue);
-
 	return 0;
 }
 
@@ -812,15 +823,8 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
 	}
 	dm_init_md_queue(md);
 
-	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
-	err = blk_mq_register_dev(disk_to_dev(md->disk), q);
-	if (err)
-		goto out_cleanup_queue;
-
 	return 0;
 
-out_cleanup_queue:
-	blk_cleanup_queue(q);
 out_tag_set:
 	blk_mq_free_tag_set(md->tag_set);
 out_kfree_tag_set:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7475739fee49..8c26bfc35335 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1761,7 +1761,7 @@ static struct mapped_device *alloc_dev(int minor)
 		goto bad;
 	md->dax_dev = dax_dev;
 
-	add_disk(md->disk);
+	add_disk_no_queue_reg(md->disk);
 	format_dev_t(md->name, MKDEV(_major, minor));
 
 	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
@@ -2021,6 +2021,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 {
 	int r;
+	struct queue_limits limits;
 	enum dm_queue_mode type = dm_get_md_type(md);
 
 	switch (type) {
@@ -2057,6 +2058,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 		break;
 	}
 
+	r = dm_calculate_queue_limits(t, &limits);
+	if (r) {
+		DMERR("Cannot calculate initial queue limits");
+		return r;
+	}
+	dm_table_set_restrictions(t, md->queue, &limits);
+	blk_register_queue(md->disk);
+
 	return 0;
 }
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index e9825ff57b15..69bea82ebeb1 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -660,12 +660,14 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
 				       struct blkg_rwstat *from)
 {
-	struct blkg_rwstat v = blkg_rwstat_read(from);
+	u64 sum[BLKG_RWSTAT_NR];
 	int i;
 
 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		atomic64_add(atomic64_read(&v.aux_cnt[i]) +
-			     atomic64_read(&from->aux_cnt[i]),
+		sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
+
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
 			     &to->aux_cnt[i]);
 }
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 3d794b3dc532..6d9e230dffd2 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -198,8 +198,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
-extern int elv_register_queue(struct request_queue *q);
-extern void elv_unregister_queue(struct request_queue *q);
 extern int elv_may_queue(struct request_queue *, unsigned int);
 extern void elv_completed_request(struct request_queue *, struct request *);
 extern int elv_set_request(struct request_queue *q, struct request *rq,
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5144ebe046c9..5e3531027b51 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -395,6 +395,11 @@ static inline void add_disk(struct gendisk *disk)
 {
 	device_add_disk(NULL, disk);
 }
+extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk);
+static inline void add_disk_no_queue_reg(struct gendisk *disk)
+{
+	device_add_disk_no_queue_reg(NULL, disk);
+}
 
 extern void del_gendisk(struct gendisk *gp);
 extern struct gendisk *get_gendisk(dev_t dev, int *partno);