 block/bfq-iosched.c        | 158
 block/bfq-iosched.h        |  17
 block/bfq-wf2q.c           |   3
 block/bio-integrity.c      |   1
 block/blk-core.c           |  59
 block/blk-exec.c           |   2
 block/blk-lib.c            |  12
 block/blk-map.c            |   4
 block/blk-mq-sched.c       |   2
 block/blk-mq-sched.h       |   2
 block/blk-mq-sysfs.c       |   9
 block/blk-mq.c             | 158
 block/blk-mq.h             |   3
 block/blk-sysfs.c          |  47
 block/blk-throttle.c       | 144
 block/blk.h                |   3
 block/elevator.c           |  10
 block/genhd.c              |  23
 drivers/block/aoe/aoe.h    |   3
 drivers/block/aoe/aoecmd.c |  48
 drivers/md/dm-rq.c         |  28
 drivers/md/dm.c            |  11
 include/linux/blk-cgroup.h |   8
 include/linux/elevator.h   |   2
 include/linux/genhd.h      |   5
 25 files changed, 559 insertions(+), 203 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index f352b1677143..47e6ec7427c4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -209,15 +209,17 @@ static struct kmem_cache *bfq_pool; * interactive applications automatically, using the following formula: * duration = (R / r) * T, where r is the peak rate of the device, and * R and T are two reference parameters. - * In particular, R is the peak rate of the reference device (see below), - * and T is a reference time: given the systems that are likely to be - * installed on the reference device according to its speed class, T is - * about the maximum time needed, under BFQ and while reading two files in - * parallel, to load typical large applications on these systems. - * In practice, the slower/faster the device at hand is, the more/less it - * takes to load applications with respect to the reference device. - * Accordingly, the longer/shorter BFQ grants weight raising to interactive - * applications. + * In particular, R is the peak rate of the reference device (see + * below), and T is a reference time: given the systems that are + * likely to be installed on the reference device according to its + * speed class, T is about the maximum time needed, under BFQ and + * while reading two files in parallel, to load typical large + * applications on these systems (see the comments on + * max_service_from_wr below, for more details on how T is obtained). + * In practice, the slower/faster the device at hand is, the more/less + * it takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to + * interactive applications. * * BFQ uses four different reference pairs (R, T), depending on: * . whether the device is rotational or non-rotational; @@ -254,6 +256,60 @@ static int T_slow[2]; static int T_fast[2]; static int device_speed_thresh[2]; +/* + * BFQ uses the above-detailed, time-based weight-raising mechanism to + * privilege interactive tasks. This mechanism is vulnerable to the + * following false positives: I/O-bound applications that will go on + * doing I/O for much longer than the duration of weight + * raising. These applications have basically no benefit from being + * weight-raised at the beginning of their I/O. On the opposite end, + * while being weight-raised, these applications + * a) unjustly steal throughput to applications that may actually need + * low latency; + * b) make BFQ uselessly perform device idling; device idling results + * in loss of device throughput with most flash-based storage, and may + * increase latencies when used purposelessly. + * + * BFQ tries to reduce these problems, by adopting the following + * countermeasure. To introduce this countermeasure, we need first to + * finish explaining how the duration of weight-raising for + * interactive tasks is computed. + * + * For a bfq_queue deemed as interactive, the duration of weight + * raising is dynamically adjusted, as a function of the estimated + * peak rate of the device, so as to be equal to the time needed to + * execute the 'largest' interactive task we benchmarked so far. By + * largest task, we mean the task for which each involved process has + * to do more I/O than for any of the other tasks we benchmarked. This + * reference interactive task is the start-up of LibreOffice Writer, + * and in this task each process/bfq_queue needs to have at most ~110K + * sectors transferred. 
+ * + * This last piece of information enables BFQ to reduce the actual + * duration of weight-raising for at least one class of I/O-bound + * applications: those doing sequential or quasi-sequential I/O. An + * example is file copy. In fact, once started, the main I/O-bound + * processes of these applications usually consume the above 110K + * sectors in much less time than the processes of an application that + * is starting, because these I/O-bound processes will greedily devote + * almost all their CPU cycles only to their target, + * throughput-friendly I/O operations. This is even more true if BFQ + * happens to be underestimating the device peak rate, and thus + * overestimating the duration of weight raising. But, according to + * our measurements, once transferred 110K sectors, these processes + * have no right to be weight-raised any longer. + * + * Basing on the last consideration, BFQ ends weight-raising for a + * bfq_queue if the latter happens to have received an amount of + * service at least equal to the following constant. The constant is + * set to slightly more than 110K, to have a minimum safety margin. + * + * This early ending of weight-raising reduces the amount of time + * during which interactive false positives cause the two problems + * described at the beginning of these comments. + */ +static const unsigned long max_service_from_wr = 120000; + #define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) @@ -417,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } +/* + * See the comments on bfq_limit_depth for the purpose of + * the depths set in the function. + */ +static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) +{ + bfqd->sb_shift = bt->sb.shift; + + /* + * In-word depths if no bfq_queue is being weight-raised: + * leaving 25% of tags only for sync reads. + * + * In next formulas, right-shift the value + * (1U<<bfqd->sb_shift), instead of computing directly + * (1U<<(bfqd->sb_shift - something)), to be robust against + * any possible value of bfqd->sb_shift, without having to + * limit 'something'. + */ + /* no more than 50% of tags for async I/O */ + bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U); + /* + * no more than 75% of tags for sync writes (25% extra tags + * w.r.t. async I/O, to prevent async I/O from starving sync + * writes) + */ + bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U); + + /* + * In-word depths in case some bfq_queue is being weight- + * raised: leaving ~63% of tags for sync reads. This is the + * highest percentage for which, in our tests, application + * start-up times didn't suffer from any regression due to tag + * shortage. + */ + /* no more than ~18% of tags for async I/O */ + bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U); + /* no more than ~37% of tags for sync writes (~20% extra tags) */ + bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U); +} + +/* + * Async I/O can easily starve sync I/O (both sync reads and sync + * writes), by consuming all tags. Similarly, storms of sync writes, + * such as those that sync(2) may trigger, can starve sync reads. + * Limit depths of async I/O and sync writes so as to counter both + * problems. 
+ */ +static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct sbitmap_queue *bt; + + if (op_is_sync(op) && !op_is_write(op)) + return; + + if (data->flags & BLK_MQ_REQ_RESERVED) { + if (unlikely(!tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return; + } + bt = &tags->breserved_tags; + } else + bt = &tags->bitmap_tags; + + if (unlikely(bfqd->sb_shift != bt->sb.shift)) + bfq_update_depths(bfqd, bt); + + data->shallow_depth = + bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + + bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", + __func__, bfqd->wr_busy_queues, op_is_sync(op), + data->shallow_depth); +} + static struct bfq_queue * bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, sector_t sector, struct rb_node **ret_parent, @@ -1276,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, if (old_wr_coeff == 1 && wr_or_deserves_wr) { /* start a weight-raising period */ if (interactive) { + bfqq->service_from_wr = 0; bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { @@ -3589,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->entity.prio_changed = 1; } } + if (bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && + bfqq->service_from_wr > max_service_from_wr) { + /* see comments on max_service_from_wr */ + bfq_bfqq_end_wr(bfqq); + } } /* * To improve latency (for this or other queues), immediately @@ -5285,6 +5424,7 @@ static struct elv_fs_entry bfq_attrs[] = { static struct elevator_type iosched_bfq_mq = { .ops.mq = { + .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 5d47b58d5fc8..350c39ae2896 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -337,6 +337,11 @@ struct bfq_queue { * last transition from idle to backlogged. */ unsigned long service_from_backlogged; + /* + * Cumulative service received from the @bfq_queue since its + * last transition to weight-raised state. + */ + unsigned long service_from_wr; /* * Value of wr start time when switching to soft rt @@ -629,6 +634,18 @@ struct bfq_data { struct bfq_io_cq *bio_bic; /* bfqq associated with the task issuing current bio for merging */ struct bfq_queue *bio_bfqq; + + /* + * Cached sbitmap shift, used to compute depth limits in + * bfq_update_depths. 
+ */ + unsigned int sb_shift; + + /* + * Depth limits used in bfq_limit_depth (see comments on the + * function) + */ + unsigned int word_depths[2][2]; }; enum bfqq_state_flags { diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 4456eda34e48..4498c43245e2 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -838,6 +838,9 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served) if (!bfqq->service_from_backlogged) bfqq->first_IO_time = jiffies; + if (bfqq->wr_coeff > 1) + bfqq->service_from_wr += served; + bfqq->service_from_backlogged += served; for_each_entity(entity) { st = bfq_entity_service_tree(entity); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 23b42e8aa03e..9cfdd6c83b5b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work) /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio - * @error: Pointer to errno * * Description: Completion for integrity I/O * diff --git a/block/blk-core.c b/block/blk-core.c index 7ba607527487..c21a16e9fdf9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2062,6 +2062,21 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ +static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +{ + if (part->policy && op_is_write(bio_op(bio))) { + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR + "generic_make_request: Trying to write " + "to read-only block-device %s (partno %d)\n", + bio_devname(bio, b), part->partno); + return true; + } + + return false; +} + /* * Remap block n of partition p to block n+start(p) of the disk. */ @@ -2070,27 +2085,28 @@ static inline int blk_partition_remap(struct bio *bio) struct hd_struct *p; int ret = 0; + rcu_read_lock(); + p = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) || + bio_check_ro(bio, p))) { + ret = -EIO; + goto out; + } + /* * Zone reset does not include bi_size so bio_sectors() is always 0. * Include a test for the reset op code and perform the remap if needed. */ - if (!bio->bi_partno || - (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)) - return 0; + if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET) + goto out; - rcu_read_lock(); - p = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) { - bio->bi_iter.bi_sector += p->start_sect; - bio->bi_partno = 0; - trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); - } else { - printk("%s: fail for partition %d\n", __func__, bio->bi_partno); - ret = -EIO; - } - rcu_read_unlock(); + bio->bi_iter.bi_sector += p->start_sect; + bio->bi_partno = 0; + trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector - p->start_sect); +out: + rcu_read_unlock(); return ret; } @@ -2149,15 +2165,19 @@ generic_make_request_checks(struct bio *bio) * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. 
*/ - if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) goto not_supported; if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) goto end_io; - if (blk_partition_remap(bio)) - goto end_io; + if (!bio->bi_partno) { + if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + goto end_io; + } else { + if (blk_partition_remap(bio)) + goto end_io; + } if (bio_check_eod(bio, nr_sectors)) goto end_io; @@ -2500,8 +2520,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - blk_mq_request_bypass_insert(rq, true); - return BLK_STS_OK; + return blk_mq_request_direct_issue(rq); } spin_lock_irqsave(q->queue_lock, flags); diff --git a/block/blk-exec.c b/block/blk-exec.c index 5c0f3dc446dc..f7b292f12449 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be reused after dying flag is set */ if (q->mq_ops) { - blk_mq_sched_insert_request(rq, at_head, true, false, false); + blk_mq_sched_insert_request(rq, at_head, true, false); return; } diff --git a/block/blk-lib.c b/block/blk-lib.c index 2bc544ce3d2e..a676084d4740 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secure_erase(q)) return -EOPNOTSUPP; @@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; if ((sector | nr_sects) & bs_mask) return -EINVAL; @@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); @@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + while (nr_sects != 0) { bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), gfp_mask); diff --git a/block/blk-map.c b/block/blk-map.c index b21f8e86f120..209eb3b45c54 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -114,7 +114,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; - int ret; + int ret = -EINVAL; if (!iter_is_iovec(iter)) goto fail; @@ -143,7 +143,7 @@ unmap_rq: __blk_rq_unmap_user(bio); fail: rq->bio = NULL; - return -EINVAL; + return ret; } EXPORT_SYMBOL(blk_rq_map_user_iov); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 2ff7cf0cbf73..55c0a745b427 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -427,7 +427,7 @@ done: } void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async, bool can_block) + bool run_queue, bool async) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index ba1d1418a96d..1e9c9018ace1 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); void 
blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async, bool can_block); + bool run_queue, bool async); void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 79969c3c234f..a54b4b070f1c 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; @@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) q->mq_sysfs_init_done = false; } -void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) -{ - mutex_lock(&q->sysfs_lock); - __blk_mq_unregister_dev(dev, q); - mutex_unlock(&q->sysfs_lock); -} - void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) { kobject_init(&hctx->kobj, &blk_mq_hw_ktype); diff --git a/block/blk-mq.c b/block/blk-mq.c index c8f62e6be6b6..74a4f237ba91 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -745,13 +745,13 @@ static void blk_mq_requeue_work(struct work_struct *work) rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, true, false, false, true); + blk_mq_sched_insert_request(rq, true, false, false); } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, false, false, false, true); + blk_mq_sched_insert_request(rq, false, false, false); } blk_mq_run_hw_queues(q, false); @@ -1294,9 +1294,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) /* * We should be running this queue from one of the CPUs that * are mapped to it. + * + * There are at least two related races now between setting + * hctx->next_cpu from blk_mq_hctx_next_cpu() and running + * __blk_mq_run_hw_queue(): + * + * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), + * but later it becomes online, then this warning is harmless + * at all + * + * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), + * but later it becomes offline, then the warning can't be + * triggered, and we depend on blk-mq timeout handler to + * handle dispatched requests to this hctx */ - WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)); + if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && + cpu_online(hctx->next_cpu)) { + printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", + raw_smp_processor_id(), + cpumask_empty(hctx->cpumask) ? "inactive": "active"); + dump_stack(); + } /* * We can't run the queue inline with ints disabled. 
Ensure that @@ -1319,21 +1337,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { + bool tried = false; + if (hctx->queue->nr_hw_queues == 1) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { int next_cpu; - +select_cpu: next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask, cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask); - hctx->next_cpu = next_cpu; + /* + * No online CPU is found, so have to make sure hctx->next_cpu + * is set correctly for not breaking workqueue. + */ + if (next_cpu >= nr_cpu_ids) + hctx->next_cpu = cpumask_first(hctx->cpumask); + else + hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + /* + * Do unbound schedule if we can't find a online CPU for this hctx, + * and it should only happen in the path of handling CPU DEAD. + */ + if (!cpu_online(hctx->next_cpu)) { + if (!tried) { + tried = true; + goto select_cpu; + } + + /* + * Make sure to re-select CPU next time once after CPUs + * in hctx->cpumask become online again. + */ + hctx->next_cpu_batch = 1; + return WORK_CPU_UNBOUND; + } return hctx->next_cpu; } @@ -1694,9 +1738,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie) +static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { @@ -1705,15 +1749,52 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, }; blk_qc_t new_cookie; blk_status_t ret; + + new_cookie = request_to_qc_t(hctx, rq); + + /* + * For OK queue, we are done. For error, caller may kill it. + * Any other error (busy), just add it to our list as we + * previously would have done. + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + switch (ret) { + case BLK_STS_OK: + *cookie = new_cookie; + break; + case BLK_STS_RESOURCE: + __blk_mq_requeue_request(rq); + break; + default: + *cookie = BLK_QC_T_NONE; + break; + } + + return ret; +} + +static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie, + bool bypass_insert) +{ + struct request_queue *q = rq->q; bool run_queue = true; - /* RCU or SRCU read lock is needed before checking quiesced flag */ + /* + * RCU or SRCU read lock is needed before checking quiesced flag. + * + * When queue is stopped or quiesced, ignore 'bypass_insert' from + * blk_mq_request_direct_issue(), and return BLK_STS_OK to caller, + * and avoid driver to try to dispatch again. + */ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { run_queue = false; + bypass_insert = false; goto insert; } - if (q->elevator) + if (q->elevator && !bypass_insert) goto insert; if (!blk_mq_get_driver_tag(rq, NULL, false)) @@ -1724,42 +1805,47 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - new_cookie = request_to_qc_t(hctx, rq); - - /* - * For OK queue, we are done. For error, kill it. 
Any other - * error (busy), just add it to our list as we previously - * would have done - */ - ret = q->mq_ops->queue_rq(hctx, &bd); - switch (ret) { - case BLK_STS_OK: - *cookie = new_cookie; - return; - case BLK_STS_RESOURCE: - __blk_mq_requeue_request(rq); - goto insert; - default: - *cookie = BLK_QC_T_NONE; - blk_mq_end_request(rq, ret); - return; - } - + return __blk_mq_issue_directly(hctx, rq, cookie); insert: - blk_mq_sched_insert_request(rq, false, run_queue, false, - hctx->flags & BLK_MQ_F_BLOCKING); + if (bypass_insert) + return BLK_STS_RESOURCE; + + blk_mq_sched_insert_request(rq, false, run_queue, false); + return BLK_STS_OK; } static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie) { + blk_status_t ret; int srcu_idx; might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); hctx_lock(hctx, &srcu_idx); - __blk_mq_try_issue_directly(hctx, rq, cookie); + + ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); + if (ret == BLK_STS_RESOURCE) + blk_mq_sched_insert_request(rq, false, true, false); + else if (ret != BLK_STS_OK) + blk_mq_end_request(rq, ret); + + hctx_unlock(hctx, srcu_idx); +} + +blk_status_t blk_mq_request_direct_issue(struct request *rq) +{ + blk_status_t ret; + int srcu_idx; + blk_qc_t unused_cookie; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + + hctx_lock(hctx, &srcu_idx); + ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); hctx_unlock(hctx, srcu_idx); + + return ret; } static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) @@ -1870,7 +1956,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else if (q->elevator) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, true, true); + blk_mq_sched_insert_request(rq, false, true, true); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); diff --git a/block/blk-mq.h b/block/blk-mq.h index 8591a54d989b..e3ebc93646ca 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -74,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); +/* Used by blk_insert_cloned_request() to issue request directly */ +blk_status_t blk_mq_request_direct_issue(struct request *rq); + /* * CPU -> queue mappings */ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 870484eaed1f..cbea895a5547 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +/** + * blk_register_queue - register a block layer queue with sysfs + * @disk: Disk of which the request queue should be registered with sysfs. + */ int blk_register_queue(struct gendisk *disk) { int ret; @@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk) if (q->request_fn || (q->mq_ops && q->elevator)) { ret = elv_register_queue(q); if (ret) { + mutex_unlock(&q->sysfs_lock); kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); - goto unlock; + return ret; } } ret = 0; @@ -921,7 +926,15 @@ unlock: mutex_unlock(&q->sysfs_lock); return ret; } +EXPORT_SYMBOL_GPL(blk_register_queue); +/** + * blk_unregister_queue - counterpart of blk_register_queue() + * @disk: Disk of which the request queue should be unregistered from sysfs. 
+ * + * Note: the caller is responsible for guaranteeing that this function is called + * after blk_register_queue() has finished. + */ void blk_unregister_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; - mutex_lock(&q->sysfs_lock); - queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); - mutex_unlock(&q->sysfs_lock); + /* Return early if disk->queue was never registered. */ + if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + return; - wbt_exit(q); + /* + * Since sysfs_remove_dir() prevents adding new directory entries + * before removal of existing entries starts, protect against + * concurrent elv_iosched_store() calls. + */ + mutex_lock(&q->sysfs_lock); + spin_lock_irq(q->queue_lock); + queue_flag_clear(QUEUE_FLAG_REGISTERED, q); + spin_unlock_irq(q->queue_lock); + /* + * Remove the sysfs attributes before unregistering the queue data + * structures that can be modified through sysfs. + */ if (q->mq_ops) blk_mq_unregister_dev(disk_to_dev(disk), q); - - if (q->request_fn || (q->mq_ops && q->elevator)) - elv_unregister_queue(q); + mutex_unlock(&q->sysfs_lock); kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); + + wbt_exit(q); + + mutex_lock(&q->sysfs_lock); + if (q->request_fn || (q->mq_ops && q->elevator)) + elv_unregister_queue(q); + mutex_unlock(&q->sysfs_lock); + kobject_put(&disk_to_dev(disk)->kobj); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 825bc29767e6..e136f5ef9577 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -216,9 +216,9 @@ struct throtl_data unsigned int scale; - struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; - struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; - struct latency_bucket __percpu *latency_buckets; + struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; + struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; + struct latency_bucket __percpu *latency_buckets[2]; unsigned long last_calculate_time; unsigned long filtered_latency; @@ -1511,10 +1511,20 @@ static struct cftype throtl_legacy_files[] = { .seq_show = blkg_print_stat_bytes, }, { + .name = "throttle.io_service_bytes_recursive", + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes_recursive, + }, + { .name = "throttle.io_serviced", .private = (unsigned long)&blkcg_policy_throtl, .seq_show = blkg_print_stat_ios, }, + { + .name = "throttle.io_serviced_recursive", + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios_recursive, + }, { } /* terminate */ }; @@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) #ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void throtl_update_latency_buckets(struct throtl_data *td) { - struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; - int i, cpu; - unsigned long last_latency = 0; - unsigned long latency; + struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; + int i, cpu, rw; + unsigned long last_latency[2] = { 0 }; + unsigned long latency[2]; if (!blk_queue_nonrot(td->queue)) return; @@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td) td->last_calculate_time = jiffies; memset(avg_latency, 0, sizeof(avg_latency)); - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - struct latency_bucket *tmp = &td->tmp_buckets[i]; - - for_each_possible_cpu(cpu) { - struct latency_bucket *bucket; - - /* 
this isn't race free, but ok in practice */ - bucket = per_cpu_ptr(td->latency_buckets, cpu); - tmp->total_latency += bucket[i].total_latency; - tmp->samples += bucket[i].samples; - bucket[i].total_latency = 0; - bucket[i].samples = 0; - } + for (rw = READ; rw <= WRITE; rw++) { + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; + + for_each_possible_cpu(cpu) { + struct latency_bucket *bucket; + + /* this isn't race free, but ok in practice */ + bucket = per_cpu_ptr(td->latency_buckets[rw], + cpu); + tmp->total_latency += bucket[i].total_latency; + tmp->samples += bucket[i].samples; + bucket[i].total_latency = 0; + bucket[i].samples = 0; + } - if (tmp->samples >= 32) { - int samples = tmp->samples; + if (tmp->samples >= 32) { + int samples = tmp->samples; - latency = tmp->total_latency; + latency[rw] = tmp->total_latency; - tmp->total_latency = 0; - tmp->samples = 0; - latency /= samples; - if (latency == 0) - continue; - avg_latency[i].latency = latency; + tmp->total_latency = 0; + tmp->samples = 0; + latency[rw] /= samples; + if (latency[rw] == 0) + continue; + avg_latency[rw][i].latency = latency[rw]; + } } } - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - if (!avg_latency[i].latency) { - if (td->avg_buckets[i].latency < last_latency) - td->avg_buckets[i].latency = last_latency; - continue; - } + for (rw = READ; rw <= WRITE; rw++) { + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + if (!avg_latency[rw][i].latency) { + if (td->avg_buckets[rw][i].latency < last_latency[rw]) + td->avg_buckets[rw][i].latency = + last_latency[rw]; + continue; + } - if (!td->avg_buckets[i].valid) - latency = avg_latency[i].latency; - else - latency = (td->avg_buckets[i].latency * 7 + - avg_latency[i].latency) >> 3; + if (!td->avg_buckets[rw][i].valid) + latency[rw] = avg_latency[rw][i].latency; + else + latency[rw] = (td->avg_buckets[rw][i].latency * 7 + + avg_latency[rw][i].latency) >> 3; - td->avg_buckets[i].latency = max(latency, last_latency); - td->avg_buckets[i].valid = true; - last_latency = td->avg_buckets[i].latency; + td->avg_buckets[rw][i].latency = max(latency[rw], + last_latency[rw]); + td->avg_buckets[rw][i].valid = true; + last_latency[rw] = td->avg_buckets[rw][i].latency; + } } for (i = 0; i < LATENCY_BUCKET_SIZE; i++) throtl_log(&td->service_queue, - "Latency bucket %d: latency=%ld, valid=%d", i, - td->avg_buckets[i].latency, td->avg_buckets[i].valid); + "Latency bucket %d: read latency=%ld, read valid=%d, " + "write latency=%ld, write valid=%d", i, + td->avg_buckets[READ][i].latency, + td->avg_buckets[READ][i].valid, + td->avg_buckets[WRITE][i].latency, + td->avg_buckets[WRITE][i].valid); } #else static inline void throtl_update_latency_buckets(struct throtl_data *td) @@ -2248,16 +2269,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size, struct latency_bucket *latency; int index; - if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || + if (!td || td->limit_index != LIMIT_LOW || + !(op == REQ_OP_READ || op == REQ_OP_WRITE) || !blk_queue_nonrot(td->queue)) return; index = request_bucket_index(size); - latency = get_cpu_ptr(td->latency_buckets); + latency = get_cpu_ptr(td->latency_buckets[op]); latency[index].total_latency += time; latency[index].samples++; - put_cpu_ptr(td->latency_buckets); + put_cpu_ptr(td->latency_buckets[op]); } void blk_throtl_stat_add(struct request *rq, u64 time_ns) @@ -2276,6 +2298,7 @@ void blk_throtl_bio_endio(struct bio *bio) unsigned long finish_time; unsigned long start_time; 
unsigned long lat; + int rw = bio_data_dir(bio); tg = bio->bi_cg_private; if (!tg) @@ -2304,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio) bucket = request_bucket_index( blk_stat_size(&bio->bi_issue_stat)); - threshold = tg->td->avg_buckets[bucket].latency + + threshold = tg->td->avg_buckets[rw][bucket].latency + tg->latency_target; if (lat > threshold) tg->bad_bio_cnt++; @@ -2397,9 +2420,16 @@ int blk_throtl_init(struct request_queue *q) td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; - td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * + td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) * LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets) { + if (!td->latency_buckets[READ]) { + kfree(td); + return -ENOMEM; + } + td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * + LATENCY_BUCKET_SIZE, __alignof__(u64)); + if (!td->latency_buckets[WRITE]) { + free_percpu(td->latency_buckets[READ]); kfree(td); return -ENOMEM; } @@ -2418,7 +2448,8 @@ int blk_throtl_init(struct request_queue *q) /* activate policy */ ret = blkcg_activate_policy(q, &blkcg_policy_throtl); if (ret) { - free_percpu(td->latency_buckets); + free_percpu(td->latency_buckets[READ]); + free_percpu(td->latency_buckets[WRITE]); kfree(td); } return ret; @@ -2429,7 +2460,8 @@ void blk_throtl_exit(struct request_queue *q) BUG_ON(!q->td); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); - free_percpu(q->td->latency_buckets); + free_percpu(q->td->latency_buckets[READ]); + free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); } @@ -2447,8 +2479,10 @@ void blk_throtl_register_queue(struct request_queue *q) } else { td->throtl_slice = DFL_THROTL_SLICE_HD; td->filtered_latency = LATENCY_FILTERED_HD; - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) - td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY; + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; + td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY; + } } #ifndef CONFIG_BLK_DEV_THROTTLING_LOW /* if no low limit, use previous default */ diff --git a/block/blk.h b/block/blk.h index c84ae0e21ebd..b1771851ed92 100644 --- a/block/blk.h +++ b/block/blk.h @@ -162,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } +int elv_register_queue(struct request_queue *q); +void elv_unregister_queue(struct request_queue *q); + struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); #ifdef CONFIG_FAIL_IO_TIMEOUT diff --git a/block/elevator.c b/block/elevator.c index 138faeb08a7c..e87e9b43aba0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q) struct elevator_queue *e = q->elevator; int error; + lockdep_assert_held(&q->sysfs_lock); + error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; @@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q) } return error; } -EXPORT_SYMBOL(elv_register_queue); void elv_unregister_queue(struct request_queue *q) { + lockdep_assert_held(&q->sysfs_lock); + if (q) { struct elevator_queue *e = q->elevator; @@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q) wbt_enable_default(q); } } -EXPORT_SYMBOL(elv_unregister_queue); int elv_register(struct elevator_type *e) { @@ -967,6 +969,8 @@ static int 
elevator_switch_mq(struct request_queue *q, { int ret; + lockdep_assert_held(&q->sysfs_lock); + blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -1012,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) bool old_registered = false; int err; + lockdep_assert_held(&q->sysfs_lock); + if (q->mq_ops) return elevator_switch_mq(q, new_e); diff --git a/block/genhd.c b/block/genhd.c index 96a66f671720..88a53c188cb7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -629,16 +629,18 @@ exit: } /** - * device_add_disk - add partitioning information to kernel list + * __device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information + * @register_queue: register the queue if set to true * * This function registers the partitioning information in @disk * with the kernel. * * FIXME: error handling */ -void device_add_disk(struct device *parent, struct gendisk *disk) +static void __device_add_disk(struct device *parent, struct gendisk *disk, + bool register_queue) { dev_t devt; int retval; @@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) exact_match, exact_lock, disk); } register_disk(parent, disk); - blk_register_queue(disk); + if (register_queue) + blk_register_queue(disk); /* * Take an extra ref on queue which will be put on disk_release() @@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk) disk_add_events(disk); blk_integrity_add(disk); } + +void device_add_disk(struct device *parent, struct gendisk *disk) +{ + __device_add_disk(parent, disk, true); +} EXPORT_SYMBOL(device_add_disk); +void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) +{ + __device_add_disk(parent, disk, false); +} +EXPORT_SYMBOL(device_add_disk_no_queue_reg); + void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; @@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk) * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). 
*/ - bdi_unregister(disk->queue->backing_dev_info); + if (!(disk->flags & GENHD_FL_HIDDEN)) + bdi_unregister(disk->queue->backing_dev_info); blk_unregister_queue(disk); } else { WARN_ON(1); diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 9220f8e833d0..c0ebda1283cc 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -112,8 +112,7 @@ enum frame_flags { struct frame { struct list_head head; u32 tag; - struct timeval sent; /* high-res time packet was sent */ - u32 sent_jiffs; /* low-res jiffies-based sent time */ + ktime_t sent; /* high-res time packet was sent */ ulong waited; ulong waited_total; struct aoetgt *t; /* parent target I belong to */ diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 812fed069708..540bb60cd071 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -398,8 +398,7 @@ aoecmd_ata_rw(struct aoedev *d) skb = skb_clone(f->skb, GFP_ATOMIC); if (skb) { - do_gettimeofday(&f->sent); - f->sent_jiffs = (u32) jiffies; + f->sent = ktime_get(); __skb_queue_head_init(&queue); __skb_queue_tail(&queue, skb); aoenet_xmit(&queue); @@ -489,8 +488,7 @@ resend(struct aoedev *d, struct frame *f) skb = skb_clone(skb, GFP_ATOMIC); if (skb == NULL) return; - do_gettimeofday(&f->sent); - f->sent_jiffs = (u32) jiffies; + f->sent = ktime_get(); __skb_queue_head_init(&queue); __skb_queue_tail(&queue, skb); aoenet_xmit(&queue); @@ -499,33 +497,17 @@ resend(struct aoedev *d, struct frame *f) static int tsince_hr(struct frame *f) { - struct timeval now; - int n; + u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent)); - do_gettimeofday(&now); - n = now.tv_usec - f->sent.tv_usec; - n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC; + /* delta is normally under 4.2 seconds, avoid 64-bit division */ + if (likely(delta <= UINT_MAX)) + return (u32)delta / NSEC_PER_USEC; - if (n < 0) - n = -n; + /* avoid overflow after 71 minutes */ + if (delta > ((u64)INT_MAX * NSEC_PER_USEC)) + return INT_MAX; - /* For relatively long periods, use jiffies to avoid - * discrepancies caused by updates to the system time. - * - * On system with HZ of 1000, 32-bits is over 49 days - * worth of jiffies, or over 71 minutes worth of usecs. 
- * - * Jiffies overflow is handled by subtraction of unsigned ints: - * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe - * $3 = 4 - * (gdb) - */ - if (n > USEC_PER_SEC / 4) { - n = ((u32) jiffies) - f->sent_jiffs; - n *= USEC_PER_SEC / HZ; - } - - return n; + return div_u64(delta, NSEC_PER_USEC); } static int @@ -589,7 +571,6 @@ reassign_frame(struct frame *f) nf->waited = 0; nf->waited_total = f->waited_total; nf->sent = f->sent; - nf->sent_jiffs = f->sent_jiffs; f->skb = skb; return nf; @@ -633,8 +614,7 @@ probe(struct aoetgt *t) skb = skb_clone(f->skb, GFP_ATOMIC); if (skb) { - do_gettimeofday(&f->sent); - f->sent_jiffs = (u32) jiffies; + f->sent = ktime_get(); __skb_queue_head_init(&queue); __skb_queue_tail(&queue, skb); aoenet_xmit(&queue); @@ -1432,10 +1412,8 @@ aoecmd_ata_id(struct aoedev *d) d->timer.function = rexmit_timer; skb = skb_clone(skb, GFP_ATOMIC); - if (skb) { - do_gettimeofday(&f->sent); - f->sent_jiffs = (u32) jiffies; - } + if (skb) + f->sent = ktime_get(); return skb; } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 9d32f25489c2..b7d175e94a02 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error) dm_complete_request(tio->orig, error); } -static void dm_dispatch_clone_request(struct request *clone, struct request *rq) +static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq) { blk_status_t r; @@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq) clone->start_time = jiffies; r = blk_insert_cloned_request(clone->q, clone); - if (r) + if (r != BLK_STS_OK && r != BLK_STS_RESOURCE) /* must complete clone in terms of original request */ dm_complete_request(rq, r); + return r; } static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, @@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio) struct mapped_device *md = tio->md; struct request *rq = tio->orig; struct request *clone = NULL; + blk_status_t ret; r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); +check_again: switch (r) { case DM_MAPIO_SUBMITTED: /* The target has taken the I/O to submit by itself later */ @@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio) /* The target has remapped the I/O so dispatch it */ trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); - dm_dispatch_clone_request(clone, rq); + ret = dm_dispatch_clone_request(clone, rq); + if (ret == BLK_STS_RESOURCE) { + blk_rq_unprep_clone(clone); + tio->ti->type->release_clone_rq(clone); + tio->clone = NULL; + if (!rq->q->mq_ops) + r = DM_MAPIO_DELAY_REQUEUE; + else + r = DM_MAPIO_REQUEUE; + goto check_again; + } break; case DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ @@ -713,8 +726,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t) return error; } - elv_register_queue(md->queue); - return 0; } @@ -812,15 +823,8 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) } dm_init_md_queue(md); - /* backfill 'mq' sysfs registration normally done in blk_register_queue */ - err = blk_mq_register_dev(disk_to_dev(md->disk), q); - if (err) - goto out_cleanup_queue; - return 0; -out_cleanup_queue: - blk_cleanup_queue(q); out_tag_set: blk_mq_free_tag_set(md->tag_set); out_kfree_tag_set: diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7475739fee49..8c26bfc35335 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1761,7 
+1761,7 @@ static struct mapped_device *alloc_dev(int minor) goto bad; md->dax_dev = dax_dev; - add_disk(md->disk); + add_disk_no_queue_reg(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); @@ -2021,6 +2021,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; + struct queue_limits limits; enum dm_queue_mode type = dm_get_md_type(md); switch (type) { @@ -2057,6 +2058,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) break; } + r = dm_calculate_queue_limits(t, &limits); + if (r) { + DMERR("Cannot calculate initial queue limits"); + return r; + } + dm_table_set_restrictions(t, md->queue, &limits); + blk_register_queue(md->disk); + return 0; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index e9825ff57b15..69bea82ebeb1 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -660,12 +660,14 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, struct blkg_rwstat *from) { - struct blkg_rwstat v = blkg_rwstat_read(from); + u64 sum[BLKG_RWSTAT_NR]; int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(atomic64_read(&v.aux_cnt[i]) + - atomic64_read(&from->aux_cnt[i]), + sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), &to->aux_cnt[i]); } diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 3d794b3dc532..6d9e230dffd2 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -198,8 +198,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern void elv_requeue_request(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -extern int elv_register_queue(struct request_queue *q); -extern void elv_unregister_queue(struct request_queue *q); extern int elv_may_queue(struct request_queue *, unsigned int); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *q, struct request *rq, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5144ebe046c9..5e3531027b51 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -395,6 +395,11 @@ static inline void add_disk(struct gendisk *disk) { device_add_disk(NULL, disk); } +extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk); +static inline void add_disk_no_queue_reg(struct gendisk *disk) +{ + device_add_disk_no_queue_reg(NULL, disk); +} extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); |
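
For readers following the bfq_update_depths()/bfq_limit_depth() hunks above: the sketch below reproduces the per-word depth formulas from that hunk as a standalone userspace program, so the 50%/75% and ~18%/~37% splits described in the comments can be seen directly. This is only an illustration of the arithmetic, not kernel code; the MAX_U macro, the main() wrapper, and the example shift value (6, i.e. 64 tags per sbitmap word) are assumptions made here and do not appear in the patch.

/*
 * Illustration of the depth limits derived from the sbitmap word
 * shift.  Assumed example: sb_shift = 6, i.e. 64 tags per word.
 */
#include <stdio.h>

#define MAX_U(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	unsigned int sb_shift = 6;		/* assumed example value */
	unsigned int word_depths[2][2];
	int wr;

	/* No bfq_queue weight-raised: 50% for async, 75% for sync writes */
	word_depths[0][0] = MAX_U((1U << sb_shift) >> 1, 1U);
	word_depths[0][1] = MAX_U(((1U << sb_shift) * 3) >> 2, 1U);

	/* Some bfq_queue weight-raised: ~18% for async, ~37% for sync writes */
	word_depths[1][0] = MAX_U(((1U << sb_shift) * 3) >> 4, 1U);
	word_depths[1][1] = MAX_U(((1U << sb_shift) * 6) >> 4, 1U);

	for (wr = 0; wr < 2; wr++)
		printf("wr_busy=%d: async depth=%u, sync-write depth=%u (of %u tags/word)\n",
		       wr, word_depths[wr][0], word_depths[wr][1],
		       1U << sb_shift);
	return 0;
}

With the assumed shift of 6 this prints 32/48 when no queue is weight-raised and 12/24 when one is, matching the percentages quoted in the patch comments; note how the formulas right-shift (1U << sb_shift) as a whole, so any shift value yields a sane (>= 1) depth.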