From aa8d15bfe4ed23b97bf3990e2901844efc6347ec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 14 Jun 2016 17:03:13 +0200 Subject: block/partition-generic.c: Remove a set-but-not-used variable A value is assigned to the variable 'info' but that value is never used. Hence remove the variable 'info'. Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/partition-generic.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index d7eb77e1e3a8..71d9ed9df8da 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -495,7 +495,6 @@ rescan: /* add partitions */ for (p = 1; p < state->limit; p++) { sector_t size, from; - struct partition_meta_info *info = NULL; size = state->parts[p].size; if (!size) @@ -530,8 +529,6 @@ rescan: } } - if (state->parts[p].has_info) - info = &state->parts[p].info; part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); -- cgit From 1179a5a0851be6753b43601c00602dcef99e68f7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 14 Jun 2016 17:03:45 +0200 Subject: block/bio-integrity.c: Add #include "blk.h" This patch avoids that building with W=1 C=2 triggers the following warning: block/bio-integrity.c:35:6: warning: symbol 'blk_flush_integrity' was not declared. Should it be static? Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/bio-integrity.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 711e4d8de6fa..15d37b1cd500 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -26,6 +26,7 @@ #include #include #include +#include "blk.h" #define BIP_INLINE_VECS 4 -- cgit From e1f3b9412edb22774cebf47e353b6ccc1b779cfe Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 14 Jun 2016 17:04:32 +0200 Subject: block/blk-cgroup.c: Declare local symbols static Detected by sparse. Signed-off-by: Bart Van Assche Cc: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 66e6f1aae02e..dd38e5ced4a3 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -905,7 +905,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) return 0; } -struct cftype blkcg_files[] = { +static struct cftype blkcg_files[] = { { .name = "stat", .flags = CFTYPE_NOT_ON_ROOT, @@ -914,7 +914,7 @@ struct cftype blkcg_files[] = { { } /* terminate */ }; -struct cftype blkcg_legacy_files[] = { +static struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, -- cgit From 59a37f8baeb2c9d97f316584c90892d18bf846d4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 16:58:26 +0200 Subject: blktrace: avoid using timespec The blktrace code stores the current time in a 32-bit word in its user interface. This is a bad idea because 32-bit seconds overflow at some point. We probably have until 2106 before this one overflows, as it seems to use an 'unsigned' variable, but we should confirm that user space treats it the same way. Aside from this, we want to stop using 'struct timespec' here, so I'm adding a comment about the overflow and change the code to use timespec64 instead to make the loss of range more obvious. Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 03b0dd98ff0e..bedb84d168d1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk) static void trace_note_time(struct blk_trace *bt) { - struct timespec now; + struct timespec64 now; unsigned long flags; u32 words[2]; - getnstimeofday(&now); - words[0] = now.tv_sec; + /* need to check user space to see if this breaks in y2038 or y2106 */ + ktime_get_real_ts64(&now); + words[0] = (u32)now.tv_sec; words[1] = now.tv_nsec; local_irq_save(flags); -- cgit From 9828c2c6c1048c61034a8b94e6376aeff6d2284f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Jun 2016 09:03:59 +0200 Subject: block: Convert fifo_time from ulong to u64 Currently rq->fifo_time is unsigned long but CFQ stores nanosecond timestamp in it which would overflow on 32-bit archs. Convert it to u64 to avoid the overflow. Since the rq->fifo_time is unioned with struct call_single_data(), this does not change the size of struct request in any way. We have to slightly fixup block/deadline-iosched.c so that comparison happens in the right types. Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4 Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/deadline-iosched.c | 5 +++-- include/linux/blkdev.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index d0dd7882d8c7..26a9d3c8057a 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -173,7 +173,8 @@ deadline_merged_requests(struct request_queue *q, struct request *req, * and move into next position (next will be deleted) in fifo */ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { - if (time_before(next->fifo_time, req->fifo_time)) { + if (time_before((unsigned long)next->fifo_time, + (unsigned long)req->fifo_time)) { list_move(&req->queuelist, &next->queuelist); req->fifo_time = next->fifo_time; } @@ -227,7 +228,7 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) /* * rq is expired! */ - if (time_after_eq(jiffies, rq->fifo_time)) + if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) return 1; return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9746d223494c..d116d3b52c73 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -90,7 +90,7 @@ struct request { struct list_head queuelist; union { struct call_single_data csd; - unsigned long fifo_time; + u64 fifo_time; }; struct request_queue *q; -- cgit From 93fdf1478aaba6c397ba54f4cc534bf5019831b4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Jun 2016 09:04:00 +0200 Subject: cfq-iosched: Convert slice_resid from u64 to s64 slice_resid can be both positive and negative. Commit 9a7f38c42c2b (cfq-iosched: Convert from jiffies to nanoseconds) converted it from long to u64. Although this did not introduce any functional regression (the operations just overflow and the result was fine), it is certainly wrong and could cause issues in future. So convert the type to more appropriate s64. Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4 Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d227ad633242..c1adbd01d0fa 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -132,7 +132,7 @@ struct cfq_queue { /* time when first request from queue completed and slice started. */ u64 slice_start; u64 slice_end; - u64 slice_resid; + s64 slice_resid; /* pending priority requests */ int prio_pending; @@ -2689,7 +2689,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq); else cfqq->slice_resid = cfqq->slice_end - ktime_get_ns(); - cfq_log_cfqq(cfqd, cfqq, "resid=%llu", cfqq->slice_resid); + cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid); } cfq_group_served(cfqd, cfqq->cfqg, cfqq); -- cgit From 149321a611d5d41cebcf5f813a3bf45b3afe66ad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Jun 2016 09:04:01 +0200 Subject: cfq-iosched: Fix regression in bonnie++ rewrite performance Commit 9a7f38c42c2 (cfq-iosched: Convert from jiffies to nanoseconds) broke the condition for detecting starved sync IO in cfq_completed_request() because rq->start_time remained in jiffies but we compared it with nanosecond values. This manifested as a regression in bonnie++ rewrite performance because we always ended up considering sync IO starved and thus never increased async IO queue depth. Since rq->start_time is used in a lot of places, converting it to ns values would be non-trivial. So just revert the condition in CFQ to use comparison with jiffies. This will lead to suboptimal results if cfq_fifo_expire[1] will ever come close to 1 jiffie but so far we are relatively far from that with the storage used with CFQ (the default value is 128 ms). Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4 Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c1adbd01d0fa..2d03004dc77c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4243,7 +4243,16 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqq_type(cfqq)); st->ttime.last_end_request = now; - if (!(rq->start_time + cfqd->cfq_fifo_expire[1] > now)) + /* + * We have to do this check in jiffies since start_time is in + * jiffies and it is not trivial to convert to ns. If + * cfq_fifo_expire[1] ever comes close to 1 jiffie, this test + * will become problematic but so far we are fine (the default + * is 128 ms). + */ + if (!time_after(rq->start_time + + nsecs_to_jiffies(cfqd->cfq_fifo_expire[1]), + jiffies)) cfqd->last_delayed_sync = now; } -- cgit From 0b31c10c667b774e6d373c8f1146c93cff21a0cd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Jun 2016 09:04:02 +0200 Subject: cfq-iosched: Charge at least 1 jiffie instead of 1 ns Commit 9a7f38c42c2b (cfq-iosched: Convert from jiffies to nanoseconds) could result in charging just 1 ns to a cgroup submitting IO instead of 1 jiffie we always charged before. It is arguable what is the right amount to change but for now lets retain the old behavior of always charging at least one jiffie. Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4 Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 2d03004dc77c..892afd6d40e2 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1463,7 +1463,8 @@ static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq, * a single request on seeky media and cause lots of seek time * and group will never know it. */ - slice_used = max_t(u64, (now - cfqq->dispatch_start), 1); + slice_used = max_t(u64, (now - cfqq->dispatch_start), + jiffies_to_nsecs(1)); } else { slice_used = now - cfqq->slice_start; if (slice_used > cfqq->allocated_slice) { -- cgit From 141fd28cee8b0a4636e8f9c9e10d7d22bed3b108 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 29 Jun 2016 05:10:57 +0900 Subject: Doc: block: Fix a typo in queue-sysfs.txt This patch fix a spelling typo found in queue-sysfs.txt. Signed-off-by: Masanari Iida Signed-off-by: Jens Axboe --- Documentation/block/queue-sysfs.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index dce25d848d92..d515d58962b9 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -53,7 +53,7 @@ disk. logical_block_size (RO) ----------------------- -This is the logcal block size of the device, in bytes. +This is the logical block size of the device, in bytes. max_hw_sectors_kb (RO) ---------------------- -- cgit From 9c8747167e204b3fe0ff143e1642c921e382e1d6 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 5 Jul 2016 13:38:32 -0300 Subject: block: atari: Return early for unsupported sector size For 4K LBA or very large disks, atari_partition can easily get tricked into thinking it has found an Atari partition table. Depending on the data in the disk, it ends up creating partitions with awkward lengths. We saw logs like this while playing with fio. [5.625867] nvme2n1: AHDI p2 [5.625872] nvme2n1: p2 size 2910030523 extends beyond EOD, truncated People has had issues with misinterpreted AHDI partition tables for a long time, see this BSD thread from 1995, for example. https://mail-index.netbsd.org/port-atari/1995/11/19/0001.html Since the atari partition, according to the spec, doesn't even support sector sizes with more than 512, a quick sanity check is reasonable to just bail out early, before even attempting to read sector 0. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- block/partitions/atari.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/partitions/atari.c b/block/partitions/atari.c index 9875b05e80a2..ff1fb93712c1 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -42,6 +42,13 @@ int atari_partition(struct parsed_partitions *state) int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ #endif + /* + * ATARI partition scheme supports 512 lba only. If this is not + * the case, bail early to avoid miscalculating hd_size. + */ + if (bdev_logical_block_size(state->bdev) != 512) + return 0; + rs = read_part_sector(state, 0, §); if (!rs) return -1; -- cgit From df5c82a8dcca1a5f23fefebae973d2dd0bf5aa11 Mon Sep 17 00:00:00 2001 From: Vincent StehlĂ© Date: Fri, 15 Jul 2016 17:03:21 +0200 Subject: Btrfs: fix comparison in __btrfs_map_block() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add missing comparison to op in expression, which was forgotten when doing the REQ_OP transition. Fixes: b3d3fa519905 ("btrfs: update __btrfs_map_block for REQ_OP transition") Signed-off-by: Vincent StehlĂ© Reviewed-by: Mike Christie Signed-off-by: Jens Axboe --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 14b2d19c842c..4ff9c06cb1be 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5494,7 +5494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op == REQ_OP_WRITE || REQ_OP_DISCARD || + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD || op == REQ_GET_READ_MIRRORS) { num_stripes = map->num_stripes; } else if (mirror_num) { -- cgit From 163d4baaebe39c0e56d9c08597eab7b3ae0bf334 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 23 Jun 2016 17:05:50 -0400 Subject: block: add QUEUE_FLAG_DAX for devices to advertise their DAX support Currently, presence of direct_access() in block_device_operations indicates support of DAX on its block device. Because block_device_operations is instantiated with 'const', this DAX capablity may not be enabled conditinally. In preparation for supporting DAX to device-mapper devices, add QUEUE_FLAG_DAX to request_queue flags to advertise their DAX support. This will allow to set the DAX capability based on how mapped device is composed. Signed-off-by: Toshi Kani Acked-by: Dan Williams Signed-off-by: Mike Snitzer Cc: Jens Axboe Cc: Ross Zwisler Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Signed-off-by: Jens Axboe --- drivers/block/brd.c | 4 +++- drivers/nvdimm/pmem.c | 1 + drivers/s390/block/dcssblk.c | 1 + fs/block_dev.c | 5 +++-- include/linux/blkdev.h | 2 ++ 5 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index f5b0d6f4e09f..dd96a935fba0 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -509,7 +509,9 @@ static struct brd_device *brd_alloc(int i) blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX); brd->brd_queue->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); - +#ifdef CONFIG_BLK_DEV_RAM_DAX + queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); +#endif disk = brd->brd_disk = alloc_disk(max_part); if (!disk) goto out_free_queue; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 608fc4464574..53b701b2f73e 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -283,6 +283,7 @@ static int pmem_attach_disk(struct device *dev, blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); q->queuedata = pmem; disk = alloc_disk_node(0, nid); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index bed53c46dd90..093e9e18e7e7 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -618,6 +618,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->gd->driverfs_dev = &dev_info->dev; blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request); blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096); + queue_flag_set_unlocked(QUEUE_FLAG_DAX, dev_info->dcssblk_queue); seg_byte_size = (dev_info->end - dev_info->start + 1); set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors diff --git a/fs/block_dev.c b/fs/block_dev.c index 71ccab1d22c6..d012be4ab977 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -493,7 +493,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) if (size < 0) return size; - if (!ops->direct_access) + if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access) return -EOPNOTSUPP; if ((sector + DIV_ROUND_UP(size, 512)) > part_nr_sects_read(bdev->bd_part)) @@ -1287,7 +1287,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access) + if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && + blk_queue_dax(disk->queue)) bdev->bd_inode->i_flags = S_DAX; else bdev->bd_inode->i_flags = 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d116d3b52c73..9d84c98b5c79 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -505,6 +505,7 @@ struct request_queue { #define QUEUE_FLAG_WC 23 /* Write back caching */ #define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ +#define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -594,6 +595,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_queue_secdiscard(q) (blk_queue_discard(q) && \ test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags)) +#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ -- cgit From ea6ca600eb90350259cee5514e97b91ad1bc1aa5 Mon Sep 17 00:00:00 2001 From: Yigal Korman Date: Thu, 23 Jun 2016 17:05:51 -0400 Subject: block: expose QUEUE_FLAG_DAX in sysfs Provides the ability to identify DAX enabled devices in userspace. Signed-off-by: Yigal Korman Signed-off-by: Toshi Kani Acked-by: Dan Williams Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 99205965f559..f87a7e747d36 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -379,6 +379,11 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page, return count; } +static ssize_t queue_dax_show(struct request_queue *q, char *page) +{ + return queue_var_show(blk_queue_dax(q), page); +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -516,6 +521,11 @@ static struct queue_sysfs_entry queue_wc_entry = { .store = queue_wc_store, }; +static struct queue_sysfs_entry queue_dax_entry = { + .attr = {.name = "dax", .mode = S_IRUGO }, + .show = queue_dax_show, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -542,6 +552,7 @@ static struct attribute *default_attrs[] = { &queue_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, + &queue_dax_entry.attr, NULL, }; -- cgit From 68bdf1ac2aa6318198adfe12407173bc60248b4e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 19 Jul 2016 08:18:06 -0700 Subject: block: Fix spelling in a source code comment Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-exec.c b/block/blk-exec.c index 3fec8a29d0fa..7ea04325d02f 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -62,7 +62,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, /* * don't check dying flag for MQ because the request won't - * be resued after dying flag is set + * be reused after dying flag is set */ if (q->mq_ops) { blk_mq_insert_request(rq, at_head, true, false); -- cgit From 72ef799b3f14f4cb4c56ba3af6e6bdcbae6df368 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 7 Jul 2016 11:48:22 -0700 Subject: block: do not merge requests without consulting with io scheduler Before merging a bio into an existing request, io scheduler is called to get its approval first. However, the requests that come from a plug flush may get merged by block layer without consulting with io scheduler. In case of CFQ, this can cause fairness problems. For instance, if a request gets merged into a low weight cgroup's request, high weight cgroup now will depend on low weight cgroup to get scheduled. If high weigt cgroup needs that io request to complete before submitting more requests, then it will also lose its timeslice. Following script demonstrates the problem. Group g1 has a low weight, g2 and g3 have equal high weights but g2's requests are adjacent to g1's requests so they are subject to merging. Due to these merges, g2 gets poor disk time allocation. cat > cfq-merge-repro.sh << "EOF" #!/bin/bash set -e IO_ROOT=/mnt-cgroup/io mkdir -p $IO_ROOT if ! mount | grep -qw $IO_ROOT; then mount -t cgroup none -oblkio $IO_ROOT fi cd $IO_ROOT for i in g1 g2 g3; do if [ -d $i ]; then rmdir $i fi done mkdir g1 && echo 10 > g1/blkio.weight mkdir g2 && echo 495 > g2/blkio.weight mkdir g3 && echo 495 > g3/blkio.weight RUNTIME=10 (echo $BASHPID > g1/cgroup.procs && fio --readonly --name name1 --filename /dev/sdb \ --rw read --size 64k --bs 64k --time_based \ --runtime=$RUNTIME --offset=0k &> /dev/null)& (echo $BASHPID > g2/cgroup.procs && fio --readonly --name name1 --filename /dev/sdb \ --rw read --size 64k --bs 64k --time_based \ --runtime=$RUNTIME --offset=64k &> /dev/null)& (echo $BASHPID > g3/cgroup.procs && fio --readonly --name name1 --filename /dev/sdb \ --rw read --size 64k --bs 64k --time_based \ --runtime=$RUNTIME --offset=256k &> /dev/null)& sleep $((RUNTIME+1)) for i in g1 g2 g3; do echo ---- $i ---- cat $i/blkio.time done EOF # ./cfq-merge-repro.sh ---- g1 ---- 8:16 162 ---- g2 ---- 8:16 165 ---- g3 ---- 8:16 686 After applying the patch: # ./cfq-merge-repro.sh ---- g1 ---- 8:16 90 ---- g2 ---- 8:16 445 ---- g3 ---- 8:16 471 Signed-off-by: Tahsin Erdogan Signed-off-by: Jens Axboe --- block/blk-merge.c | 6 ++++++ block/cfq-iosched.c | 15 +++++++++++---- block/deadline-iosched.c | 2 +- block/elevator.c | 22 +++++++++++----------- include/linux/elevator.h | 11 ++++++++--- 5 files changed, 37 insertions(+), 19 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index c265348b75d1..e7c2fbc3f656 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -744,6 +744,12 @@ int attempt_front_merge(struct request_queue *q, struct request *rq) int blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next) { + struct elevator_queue *e = q->elevator; + + if (e->type->ops.elevator_allow_rq_merge_fn) + if (!e->type->ops.elevator_allow_rq_merge_fn(q, rq, next)) + return 0; + return attempt_merge(q, rq, next); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 892afd6d40e2..acabba198de9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2544,7 +2544,7 @@ static int cfq_merge(struct request_queue *q, struct request **req, struct request *__rq; __rq = cfq_find_rq_fmerge(cfqd, bio); - if (__rq && elv_rq_merge_ok(__rq, bio)) { + if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_FRONT_MERGE; } @@ -2601,8 +2601,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, cfq_del_cfqq_rr(cfqd, cfqq); } -static int cfq_allow_merge(struct request_queue *q, struct request *rq, - struct bio *bio) +static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, + struct bio *bio) { struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_io_cq *cic; @@ -2626,6 +2626,12 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, return cfqq == RQ_CFQQ(rq); } +static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq, + struct request *next) +{ + return RQ_CFQQ(rq) == RQ_CFQQ(next); +} + static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) { hrtimer_try_to_cancel(&cfqd->idle_slice_timer); @@ -4821,7 +4827,8 @@ static struct elevator_type iosched_cfq = { .elevator_merge_fn = cfq_merge, .elevator_merged_fn = cfq_merged_request, .elevator_merge_req_fn = cfq_merged_requests, - .elevator_allow_merge_fn = cfq_allow_merge, + .elevator_allow_bio_merge_fn = cfq_allow_bio_merge, + .elevator_allow_rq_merge_fn = cfq_allow_rq_merge, .elevator_bio_merged_fn = cfq_bio_merged, .elevator_dispatch_fn = cfq_dispatch_requests, .elevator_add_req_fn = cfq_insert_request, diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 26a9d3c8057a..55e0bb6d7da7 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -137,7 +137,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); - if (elv_rq_merge_ok(__rq, bio)) { + if (elv_bio_merge_ok(__rq, bio)) { ret = ELEVATOR_FRONT_MERGE; goto out; } diff --git a/block/elevator.c b/block/elevator.c index ea9319db50d7..7096c22041e7 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -53,13 +53,13 @@ static LIST_HEAD(elv_list); * Query io scheduler to see if the current process issuing bio may be * merged with rq. */ -static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) +static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e->type->ops.elevator_allow_merge_fn) - return e->type->ops.elevator_allow_merge_fn(q, rq, bio); + if (e->type->ops.elevator_allow_bio_merge_fn) + return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio); return 1; } @@ -67,17 +67,17 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) /* * can we safely merge with this request? */ -bool elv_rq_merge_ok(struct request *rq, struct bio *bio) +bool elv_bio_merge_ok(struct request *rq, struct bio *bio) { if (!blk_rq_merge_ok(rq, bio)) - return 0; + return false; - if (!elv_iosched_allow_merge(rq, bio)) - return 0; + if (!elv_iosched_allow_bio_merge(rq, bio)) + return false; - return 1; + return true; } -EXPORT_SYMBOL(elv_rq_merge_ok); +EXPORT_SYMBOL(elv_bio_merge_ok); static struct elevator_type *elevator_find(const char *name) { @@ -425,7 +425,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) /* * First try one-hit cache. */ - if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) { + if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) { ret = blk_try_merge(q->last_merge, bio); if (ret != ELEVATOR_NO_MERGE) { *req = q->last_merge; @@ -440,7 +440,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * See if our hash lookup can find a potential backmerge. */ __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); - if (__rq && elv_rq_merge_ok(__rq, bio)) { + if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_BACK_MERGE; } diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 953d28647435..e7f358d2e5fc 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -16,7 +16,11 @@ typedef void (elevator_merge_req_fn) (struct request_queue *, struct request *, typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int); -typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *); +typedef int (elevator_allow_bio_merge_fn) (struct request_queue *, + struct request *, struct bio *); + +typedef int (elevator_allow_rq_merge_fn) (struct request_queue *, + struct request *, struct request *); typedef void (elevator_bio_merged_fn) (struct request_queue *, struct request *, struct bio *); @@ -46,7 +50,8 @@ struct elevator_ops elevator_merge_fn *elevator_merge_fn; elevator_merged_fn *elevator_merged_fn; elevator_merge_req_fn *elevator_merge_req_fn; - elevator_allow_merge_fn *elevator_allow_merge_fn; + elevator_allow_bio_merge_fn *elevator_allow_bio_merge_fn; + elevator_allow_rq_merge_fn *elevator_allow_rq_merge_fn; elevator_bio_merged_fn *elevator_bio_merged_fn; elevator_dispatch_fn *elevator_dispatch_fn; @@ -157,7 +162,7 @@ extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); extern int elevator_init(struct request_queue *, char *); extern void elevator_exit(struct elevator_queue *); extern int elevator_change(struct request_queue *, const char *); -extern bool elv_rq_merge_ok(struct request *, struct bio *); +extern bool elv_bio_merge_ok(struct request *, struct bio *); extern struct elevator_queue *elevator_alloc(struct request_queue *, struct elevator_type *); -- cgit