From aa8d15bfe4ed23b97bf3990e2901844efc6347ec Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Tue, 14 Jun 2016 17:03:13 +0200
Subject: block/partition-generic.c: Remove a set-but-not-used variable

A value is assigned to the variable 'info' but that value is never
used. Hence remove the variable 'info'.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/partition-generic.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/block/partition-generic.c b/block/partition-generic.c
index d7eb77e1e3a8..71d9ed9df8da 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -495,7 +495,6 @@ rescan:
 	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size, from;
-		struct partition_meta_info *info = NULL;
 
 		size = state->parts[p].size;
 		if (!size)
@@ -530,8 +529,6 @@ rescan:
 			}
 		}
 
-		if (state->parts[p].has_info)
-			info = &state->parts[p].info;
 		part = add_partition(disk, p, from, size,
 				     state->parts[p].flags,
 				     &state->parts[p].info);
-- 
cgit 


From 1179a5a0851be6753b43601c00602dcef99e68f7 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Tue, 14 Jun 2016 17:03:45 +0200
Subject: block/bio-integrity.c: Add #include "blk.h"

This patch avoids that building with W=1 C=2 triggers the following
warning:

block/bio-integrity.c:35:6: warning: symbol 'blk_flush_integrity' was not declared. Should it be static?

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio-integrity.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 711e4d8de6fa..15d37b1cd500 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -26,6 +26,7 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
+#include "blk.h"
 
 #define BIP_INLINE_VECS	4
 
-- 
cgit 


From e1f3b9412edb22774cebf47e353b6ccc1b779cfe Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Tue, 14 Jun 2016 17:04:32 +0200
Subject: block/blk-cgroup.c: Declare local symbols static

Detected by sparse.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 66e6f1aae02e..dd38e5ced4a3 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -905,7 +905,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 	return 0;
 }
 
-struct cftype blkcg_files[] = {
+static struct cftype blkcg_files[] = {
 	{
 		.name = "stat",
 		.flags = CFTYPE_NOT_ON_ROOT,
@@ -914,7 +914,7 @@ struct cftype blkcg_files[] = {
 	{ }	/* terminate */
 };
 
-struct cftype blkcg_legacy_files[] = {
+static struct cftype blkcg_legacy_files[] = {
 	{
 		.name = "reset_stats",
 		.write_u64 = blkcg_reset_stats,
-- 
cgit 


From 59a37f8baeb2c9d97f316584c90892d18bf846d4 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 17 Jun 2016 16:58:26 +0200
Subject: blktrace: avoid using timespec

The blktrace code stores the current time in a 32-bit word in its
user interface. This is a bad idea because 32-bit seconds overflow
at some point.

We probably have until 2106 before this one overflows, as it seems
to use an 'unsigned' variable, but we should confirm that user
space treats it the same way.

Aside from this, we want to stop using 'struct timespec' here,
so I'm adding a comment about the overflow and change the code
to use timespec64 instead to make the loss of range more obvious.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/trace/blktrace.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 03b0dd98ff0e..bedb84d168d1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk)
 
 static void trace_note_time(struct blk_trace *bt)
 {
-	struct timespec now;
+	struct timespec64 now;
 	unsigned long flags;
 	u32 words[2];
 
-	getnstimeofday(&now);
-	words[0] = now.tv_sec;
+	/* need to check user space to see if this breaks in y2038 or y2106 */
+	ktime_get_real_ts64(&now);
+	words[0] = (u32)now.tv_sec;
 	words[1] = now.tv_nsec;
 
 	local_irq_save(flags);
-- 
cgit 


From 9828c2c6c1048c61034a8b94e6376aeff6d2284f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 28 Jun 2016 09:03:59 +0200
Subject: block: Convert fifo_time from ulong to u64

Currently rq->fifo_time is unsigned long but CFQ stores nanosecond
timestamp in it which would overflow on 32-bit archs. Convert it to u64
to avoid the overflow. Since the rq->fifo_time is unioned with struct
call_single_data(), this does not change the size of struct request in
any way.

We have to slightly fixup block/deadline-iosched.c so that comparison
happens in the right types.

Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/deadline-iosched.c | 5 +++--
 include/linux/blkdev.h   | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index d0dd7882d8c7..26a9d3c8057a 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -173,7 +173,8 @@ deadline_merged_requests(struct request_queue *q, struct request *req,
 	 * and move into next position (next will be deleted) in fifo
 	 */
 	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
-		if (time_before(next->fifo_time, req->fifo_time)) {
+		if (time_before((unsigned long)next->fifo_time,
+				(unsigned long)req->fifo_time)) {
 			list_move(&req->queuelist, &next->queuelist);
 			req->fifo_time = next->fifo_time;
 		}
@@ -227,7 +228,7 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
 	/*
 	 * rq is expired!
 	 */
-	if (time_after_eq(jiffies, rq->fifo_time))
+	if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
 		return 1;
 
 	return 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9746d223494c..d116d3b52c73 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -90,7 +90,7 @@ struct request {
 	struct list_head queuelist;
 	union {
 		struct call_single_data csd;
-		unsigned long fifo_time;
+		u64 fifo_time;
 	};
 
 	struct request_queue *q;
-- 
cgit 


From 93fdf1478aaba6c397ba54f4cc534bf5019831b4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 28 Jun 2016 09:04:00 +0200
Subject: cfq-iosched: Convert slice_resid from u64 to s64

slice_resid can be both positive and negative. Commit 9a7f38c42c2b
(cfq-iosched: Convert from jiffies to nanoseconds) converted it from
long to u64. Although this did not introduce any functional regression
(the operations just overflow and the result was fine), it is certainly
wrong and could cause issues in future. So convert the type to more
appropriate s64.

Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d227ad633242..c1adbd01d0fa 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -132,7 +132,7 @@ struct cfq_queue {
 	/* time when first request from queue completed and slice started. */
 	u64 slice_start;
 	u64 slice_end;
-	u64 slice_resid;
+	s64 slice_resid;
 
 	/* pending priority requests */
 	int prio_pending;
@@ -2689,7 +2689,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
 		else
 			cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
-		cfq_log_cfqq(cfqd, cfqq, "resid=%llu", cfqq->slice_resid);
+		cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
 	}
 
 	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
-- 
cgit 


From 149321a611d5d41cebcf5f813a3bf45b3afe66ad Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 28 Jun 2016 09:04:01 +0200
Subject: cfq-iosched: Fix regression in bonnie++ rewrite performance

Commit 9a7f38c42c2 (cfq-iosched: Convert from jiffies to nanoseconds)
broke the condition for detecting starved sync IO in
cfq_completed_request() because rq->start_time remained in jiffies but
we compared it with nanosecond values. This manifested as a regression
in bonnie++ rewrite performance because we always ended up considering
sync IO starved and thus never increased async IO queue depth.

Since rq->start_time is used in a lot of places, converting it to ns
values would be non-trivial. So just revert the condition in CFQ to use
comparison with jiffies. This will lead to suboptimal results if
cfq_fifo_expire[1] will ever come close to 1 jiffie but so far we are
relatively far from that with the storage used with CFQ (the default
value is 128 ms).

Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c1adbd01d0fa..2d03004dc77c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4243,7 +4243,16 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 					cfqq_type(cfqq));
 
 		st->ttime.last_end_request = now;
-		if (!(rq->start_time + cfqd->cfq_fifo_expire[1] > now))
+		/*
+		 * We have to do this check in jiffies since start_time is in
+		 * jiffies and it is not trivial to convert to ns. If
+		 * cfq_fifo_expire[1] ever comes close to 1 jiffie, this test
+		 * will become problematic but so far we are fine (the default
+		 * is 128 ms).
+		 */
+		if (!time_after(rq->start_time +
+				  nsecs_to_jiffies(cfqd->cfq_fifo_expire[1]),
+				jiffies))
 			cfqd->last_delayed_sync = now;
 	}
 
-- 
cgit 


From 0b31c10c667b774e6d373c8f1146c93cff21a0cd Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 28 Jun 2016 09:04:02 +0200
Subject: cfq-iosched: Charge at least 1 jiffie instead of 1 ns

Commit 9a7f38c42c2b (cfq-iosched: Convert from jiffies to nanoseconds)
could result in charging just 1 ns to a cgroup submitting IO instead of 1
jiffie we always charged before. It is arguable what is the right amount
to change but for now lets retain the old behavior of always charging at
least one jiffie.

Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 2d03004dc77c..892afd6d40e2 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1463,7 +1463,8 @@ static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
 		 * a single request on seeky media and cause lots of seek time
 		 * and group will never know it.
 		 */
-		slice_used = max_t(u64, (now - cfqq->dispatch_start), 1);
+		slice_used = max_t(u64, (now - cfqq->dispatch_start),
+					jiffies_to_nsecs(1));
 	} else {
 		slice_used = now - cfqq->slice_start;
 		if (slice_used > cfqq->allocated_slice) {
-- 
cgit 


From 141fd28cee8b0a4636e8f9c9e10d7d22bed3b108 Mon Sep 17 00:00:00 2001
From: Masanari Iida <standby24x7@gmail.com>
Date: Wed, 29 Jun 2016 05:10:57 +0900
Subject: Doc: block: Fix a typo in queue-sysfs.txt

This patch fix a spelling typo found in queue-sysfs.txt.

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/queue-sysfs.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index dce25d848d92..d515d58962b9 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -53,7 +53,7 @@ disk.
 
 logical_block_size (RO)
 -----------------------
-This is the logcal block size of the device, in bytes.
+This is the logical block size of the device, in bytes.
 
 max_hw_sectors_kb (RO)
 ----------------------
-- 
cgit 


From 9c8747167e204b3fe0ff143e1642c921e382e1d6 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Date: Tue, 5 Jul 2016 13:38:32 -0300
Subject: block: atari: Return early for unsupported sector size

For 4K LBA or very large disks, atari_partition can easily get tricked
into thinking it has found an Atari partition table.  Depending on the
data in the disk, it ends up creating partitions with awkward lengths.

We saw logs like this while playing with fio.

[5.625867] nvme2n1: AHDI p2
[5.625872] nvme2n1: p2 size 2910030523 extends beyond EOD, truncated

People has had issues with misinterpreted AHDI partition tables for a long
time, see this BSD thread from 1995, for example.

https://mail-index.netbsd.org/port-atari/1995/11/19/0001.html

Since the atari partition, according to the spec, doesn't even support
sector sizes with more than 512, a quick sanity check is reasonable to
just bail out early, before even attempting to read sector 0.

Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/partitions/atari.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/block/partitions/atari.c b/block/partitions/atari.c
index 9875b05e80a2..ff1fb93712c1 100644
--- a/block/partitions/atari.c
+++ b/block/partitions/atari.c
@@ -42,6 +42,13 @@ int atari_partition(struct parsed_partitions *state)
 	int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
 #endif
 
+	/*
+	 * ATARI partition scheme supports 512 lba only.  If this is not
+	 * the case, bail early to avoid miscalculating hd_size.
+	 */
+	if (bdev_logical_block_size(state->bdev) != 512)
+		return 0;
+
 	rs = read_part_sector(state, 0, &sect);
 	if (!rs)
 		return -1;
-- 
cgit 


From df5c82a8dcca1a5f23fefebae973d2dd0bf5aa11 Mon Sep 17 00:00:00 2001
From: Vincent Stehlé <vincent.stehle@intel.com>
Date: Fri, 15 Jul 2016 17:03:21 +0200
Subject: Btrfs: fix comparison in __btrfs_map_block()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add missing comparison to op in expression, which was forgotten when doing
the REQ_OP transition.

Fixes: b3d3fa519905 ("btrfs: update __btrfs_map_block for REQ_OP transition")
Signed-off-by: Vincent Stehlé <vincent.stehle@intel.com>
Reviewed-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 14b2d19c842c..4ff9c06cb1be 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5494,7 +5494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (op == REQ_OP_WRITE || REQ_OP_DISCARD ||
+		if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
 		    op == REQ_GET_READ_MIRRORS) {
 			num_stripes = map->num_stripes;
 		} else if (mirror_num) {
-- 
cgit 


From 163d4baaebe39c0e56d9c08597eab7b3ae0bf334 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Thu, 23 Jun 2016 17:05:50 -0400
Subject: block: add QUEUE_FLAG_DAX for devices to advertise their DAX support

Currently, presence of direct_access() in block_device_operations
indicates support of DAX on its block device.  Because
block_device_operations is instantiated with 'const', this DAX
capablity may not be enabled conditinally.

In preparation for supporting DAX to device-mapper devices, add
QUEUE_FLAG_DAX to request_queue flags to advertise their DAX
support.  This will allow to set the DAX capability based on how
mapped device is composed.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: <linux-s390@vger.kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/brd.c          | 4 +++-
 drivers/nvdimm/pmem.c        | 1 +
 drivers/s390/block/dcssblk.c | 1 +
 fs/block_dev.c               | 5 +++--
 include/linux/blkdev.h       | 2 ++
 5 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index f5b0d6f4e09f..dd96a935fba0 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -509,7 +509,9 @@ static struct brd_device *brd_alloc(int i)
 	blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX);
 	brd->brd_queue->limits.discard_zeroes_data = 1;
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
-
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
+#endif
 	disk = brd->brd_disk = alloc_disk(max_part);
 	if (!disk)
 		goto out_free_queue;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 608fc4464574..53b701b2f73e 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -283,6 +283,7 @@ static int pmem_attach_disk(struct device *dev,
 	blk_queue_max_hw_sectors(q, UINT_MAX);
 	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
 	q->queuedata = pmem;
 
 	disk = alloc_disk_node(0, nid);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index bed53c46dd90..093e9e18e7e7 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -618,6 +618,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	dev_info->gd->driverfs_dev = &dev_info->dev;
 	blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request);
 	blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096);
+	queue_flag_set_unlocked(QUEUE_FLAG_DAX, dev_info->dcssblk_queue);
 
 	seg_byte_size = (dev_info->end - dev_info->start + 1);
 	set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 71ccab1d22c6..d012be4ab977 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -493,7 +493,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 
 	if (size < 0)
 		return size;
-	if (!ops->direct_access)
+	if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
 		return -EOPNOTSUPP;
 	if ((sector + DIV_ROUND_UP(size, 512)) >
 					part_nr_sects_read(bdev->bd_part))
@@ -1287,7 +1287,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_disk = disk;
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
-		if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
+		if (IS_ENABLED(CONFIG_BLK_DEV_DAX) &&
+		    blk_queue_dax(disk->queue))
 			bdev->bd_inode->i_flags = S_DAX;
 		else
 			bdev->bd_inode->i_flags = 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d116d3b52c73..9d84c98b5c79 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -505,6 +505,7 @@ struct request_queue {
 #define QUEUE_FLAG_WC	       23	/* Write back caching */
 #define QUEUE_FLAG_FUA	       24	/* device supports FUA writes */
 #define QUEUE_FLAG_FLUSH_NQ    25	/* flush not queueuable */
+#define QUEUE_FLAG_DAX         26	/* device supports DAX */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
@@ -594,6 +595,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
 #define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
 	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
+#define blk_queue_dax(q)	test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
 
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
-- 
cgit 


From ea6ca600eb90350259cee5514e97b91ad1bc1aa5 Mon Sep 17 00:00:00 2001
From: Yigal Korman <yigal@plexistor.com>
Date: Thu, 23 Jun 2016 17:05:51 -0400
Subject: block: expose QUEUE_FLAG_DAX in sysfs

Provides the ability to identify DAX enabled devices in userspace.

Signed-off-by: Yigal Korman <yigal@plexistor.com>
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-sysfs.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 99205965f559..f87a7e747d36 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -379,6 +379,11 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page,
 	return count;
 }
 
+static ssize_t queue_dax_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(blk_queue_dax(q), page);
+}
+
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_requests_show,
@@ -516,6 +521,11 @@ static struct queue_sysfs_entry queue_wc_entry = {
 	.store = queue_wc_store,
 };
 
+static struct queue_sysfs_entry queue_dax_entry = {
+	.attr = {.name = "dax", .mode = S_IRUGO },
+	.show = queue_dax_show,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -542,6 +552,7 @@ static struct attribute *default_attrs[] = {
 	&queue_random_entry.attr,
 	&queue_poll_entry.attr,
 	&queue_wc_entry.attr,
+	&queue_dax_entry.attr,
 	NULL,
 };
 
-- 
cgit 


From 68bdf1ac2aa6318198adfe12407173bc60248b4e Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Tue, 19 Jul 2016 08:18:06 -0700
Subject: block: Fix spelling in a source code comment

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-exec.c b/block/blk-exec.c
index 3fec8a29d0fa..7ea04325d02f 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -62,7 +62,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 
 	/*
 	 * don't check dying flag for MQ because the request won't
-	 * be resued after dying flag is set
+	 * be reused after dying flag is set
 	 */
 	if (q->mq_ops) {
 		blk_mq_insert_request(rq, at_head, true, false);
-- 
cgit 


From 72ef799b3f14f4cb4c56ba3af6e6bdcbae6df368 Mon Sep 17 00:00:00 2001
From: Tahsin Erdogan <tahsin@google.com>
Date: Thu, 7 Jul 2016 11:48:22 -0700
Subject: block: do not merge requests without consulting with io scheduler

Before merging a bio into an existing request, io scheduler is called to
get its approval first. However, the requests that come from a plug
flush may get merged by block layer without consulting with io
scheduler.

In case of CFQ, this can cause fairness problems. For instance, if a
request gets merged into a low weight cgroup's request, high weight cgroup
now will depend on low weight cgroup to get scheduled. If high weigt cgroup
needs that io request to complete before submitting more requests, then it
will also lose its timeslice.

Following script demonstrates the problem. Group g1 has a low weight, g2
and g3 have equal high weights but g2's requests are adjacent to g1's
requests so they are subject to merging. Due to these merges, g2 gets
poor disk time allocation.

cat > cfq-merge-repro.sh << "EOF"
#!/bin/bash
set -e

IO_ROOT=/mnt-cgroup/io

mkdir -p $IO_ROOT

if ! mount | grep -qw $IO_ROOT; then
  mount -t cgroup none -oblkio $IO_ROOT
fi

cd $IO_ROOT

for i in g1 g2 g3; do
  if [ -d $i ]; then
    rmdir $i
  fi
done

mkdir g1 && echo 10 > g1/blkio.weight
mkdir g2 && echo 495 > g2/blkio.weight
mkdir g3 && echo 495 > g3/blkio.weight

RUNTIME=10

(echo $BASHPID > g1/cgroup.procs &&
 fio --readonly --name name1 --filename /dev/sdb \
     --rw read --size 64k --bs 64k --time_based \
     --runtime=$RUNTIME --offset=0k &> /dev/null)&

(echo $BASHPID > g2/cgroup.procs &&
 fio --readonly --name name1 --filename /dev/sdb \
     --rw read --size 64k --bs 64k --time_based \
     --runtime=$RUNTIME --offset=64k &> /dev/null)&

(echo $BASHPID > g3/cgroup.procs &&
 fio --readonly --name name1 --filename /dev/sdb \
     --rw read --size 64k --bs 64k --time_based \
     --runtime=$RUNTIME --offset=256k &> /dev/null)&

sleep $((RUNTIME+1))

for i in g1 g2 g3; do
  echo ---- $i ----
  cat $i/blkio.time
done

EOF
# ./cfq-merge-repro.sh
---- g1 ----
8:16 162
---- g2 ----
8:16 165
---- g3 ----
8:16 686

After applying the patch:

# ./cfq-merge-repro.sh
---- g1 ----
8:16 90
---- g2 ----
8:16 445
---- g3 ----
8:16 471

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c        |  6 ++++++
 block/cfq-iosched.c      | 15 +++++++++++----
 block/deadline-iosched.c |  2 +-
 block/elevator.c         | 22 +++++++++++-----------
 include/linux/elevator.h | 11 ++++++++---
 5 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index c265348b75d1..e7c2fbc3f656 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -744,6 +744,12 @@ int attempt_front_merge(struct request_queue *q, struct request *rq)
 int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 			  struct request *next)
 {
+	struct elevator_queue *e = q->elevator;
+
+	if (e->type->ops.elevator_allow_rq_merge_fn)
+		if (!e->type->ops.elevator_allow_rq_merge_fn(q, rq, next))
+			return 0;
+
 	return attempt_merge(q, rq, next);
 }
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 892afd6d40e2..acabba198de9 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2544,7 +2544,7 @@ static int cfq_merge(struct request_queue *q, struct request **req,
 	struct request *__rq;
 
 	__rq = cfq_find_rq_fmerge(cfqd, bio);
-	if (__rq && elv_rq_merge_ok(__rq, bio)) {
+	if (__rq && elv_bio_merge_ok(__rq, bio)) {
 		*req = __rq;
 		return ELEVATOR_FRONT_MERGE;
 	}
@@ -2601,8 +2601,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 		cfq_del_cfqq_rr(cfqd, cfqq);
 }
 
-static int cfq_allow_merge(struct request_queue *q, struct request *rq,
-			   struct bio *bio)
+static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
+			       struct bio *bio)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_io_cq *cic;
@@ -2626,6 +2626,12 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 	return cfqq == RQ_CFQQ(rq);
 }
 
+static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
+			      struct request *next)
+{
+	return RQ_CFQQ(rq) == RQ_CFQQ(next);
+}
+
 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
@@ -4821,7 +4827,8 @@ static struct elevator_type iosched_cfq = {
 		.elevator_merge_fn = 		cfq_merge,
 		.elevator_merged_fn =		cfq_merged_request,
 		.elevator_merge_req_fn =	cfq_merged_requests,
-		.elevator_allow_merge_fn =	cfq_allow_merge,
+		.elevator_allow_bio_merge_fn =	cfq_allow_bio_merge,
+		.elevator_allow_rq_merge_fn =	cfq_allow_rq_merge,
 		.elevator_bio_merged_fn =	cfq_bio_merged,
 		.elevator_dispatch_fn =		cfq_dispatch_requests,
 		.elevator_add_req_fn =		cfq_insert_request,
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 26a9d3c8057a..55e0bb6d7da7 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -137,7 +137,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
 		if (__rq) {
 			BUG_ON(sector != blk_rq_pos(__rq));
 
-			if (elv_rq_merge_ok(__rq, bio)) {
+			if (elv_bio_merge_ok(__rq, bio)) {
 				ret = ELEVATOR_FRONT_MERGE;
 				goto out;
 			}
diff --git a/block/elevator.c b/block/elevator.c
index ea9319db50d7..7096c22041e7 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -53,13 +53,13 @@ static LIST_HEAD(elv_list);
  * Query io scheduler to see if the current process issuing bio may be
  * merged with rq.
  */
-static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
+static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
 {
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_allow_merge_fn)
-		return e->type->ops.elevator_allow_merge_fn(q, rq, bio);
+	if (e->type->ops.elevator_allow_bio_merge_fn)
+		return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio);
 
 	return 1;
 }
@@ -67,17 +67,17 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
 /*
  * can we safely merge with this request?
  */
-bool elv_rq_merge_ok(struct request *rq, struct bio *bio)
+bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
 {
 	if (!blk_rq_merge_ok(rq, bio))
-		return 0;
+		return false;
 
-	if (!elv_iosched_allow_merge(rq, bio))
-		return 0;
+	if (!elv_iosched_allow_bio_merge(rq, bio))
+		return false;
 
-	return 1;
+	return true;
 }
-EXPORT_SYMBOL(elv_rq_merge_ok);
+EXPORT_SYMBOL(elv_bio_merge_ok);
 
 static struct elevator_type *elevator_find(const char *name)
 {
@@ -425,7 +425,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 	/*
 	 * First try one-hit cache.
 	 */
-	if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) {
+	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
 		ret = blk_try_merge(q->last_merge, bio);
 		if (ret != ELEVATOR_NO_MERGE) {
 			*req = q->last_merge;
@@ -440,7 +440,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 	 * See if our hash lookup can find a potential backmerge.
 	 */
 	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
-	if (__rq && elv_rq_merge_ok(__rq, bio)) {
+	if (__rq && elv_bio_merge_ok(__rq, bio)) {
 		*req = __rq;
 		return ELEVATOR_BACK_MERGE;
 	}
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 953d28647435..e7f358d2e5fc 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -16,7 +16,11 @@ typedef void (elevator_merge_req_fn) (struct request_queue *, struct request *,
 
 typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int);
 
-typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *);
+typedef int (elevator_allow_bio_merge_fn) (struct request_queue *,
+					   struct request *, struct bio *);
+
+typedef int (elevator_allow_rq_merge_fn) (struct request_queue *,
+					  struct request *, struct request *);
 
 typedef void (elevator_bio_merged_fn) (struct request_queue *,
 						struct request *, struct bio *);
@@ -46,7 +50,8 @@ struct elevator_ops
 	elevator_merge_fn *elevator_merge_fn;
 	elevator_merged_fn *elevator_merged_fn;
 	elevator_merge_req_fn *elevator_merge_req_fn;
-	elevator_allow_merge_fn *elevator_allow_merge_fn;
+	elevator_allow_bio_merge_fn *elevator_allow_bio_merge_fn;
+	elevator_allow_rq_merge_fn *elevator_allow_rq_merge_fn;
 	elevator_bio_merged_fn *elevator_bio_merged_fn;
 
 	elevator_dispatch_fn *elevator_dispatch_fn;
@@ -157,7 +162,7 @@ extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
 extern int elevator_init(struct request_queue *, char *);
 extern void elevator_exit(struct elevator_queue *);
 extern int elevator_change(struct request_queue *, const char *);
-extern bool elv_rq_merge_ok(struct request *, struct bio *);
+extern bool elv_bio_merge_ok(struct request *, struct bio *);
 extern struct elevator_queue *elevator_alloc(struct request_queue *,
 					struct elevator_type *);
 
-- 
cgit