From 51d638b1f56a0bfd9219800620994794a1a2b219 Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Sun, 7 May 2017 00:14:22 -0700
Subject: [PATCH 01/10] block/mq: fix potential deadlock during cpu hotplug

This can be triggered by hot-unplugging one cpu.

======================================================
[ INFO: possible circular locking dependency detected ]
4.11.0+ #17 Not tainted
-------------------------------------------------------
step_after_susp/2640 is trying to acquire lock:
 (all_q_mutex){+.+...}, at: [] blk_mq_queue_reinit_work+0x18/0x110

but task is already holding lock:
 (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x7f/0xe0

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (cpu_hotplug.lock){+.+.+.}:
       lock_acquire+0x11c/0x230
       __mutex_lock+0x92/0x990
       mutex_lock_nested+0x1b/0x20
       get_online_cpus+0x64/0x80
       blk_mq_init_allocated_queue+0x3a0/0x4e0
       blk_mq_init_queue+0x3a/0x60
       loop_add+0xe5/0x280
       loop_init+0x124/0x177
       do_one_initcall+0x53/0x1c0
       kernel_init_freeable+0x1e3/0x27f
       kernel_init+0xe/0x100
       ret_from_fork+0x31/0x40

-> #0 (all_q_mutex){+.+...}:
       __lock_acquire+0x189a/0x18a0
       lock_acquire+0x11c/0x230
       __mutex_lock+0x92/0x990
       mutex_lock_nested+0x1b/0x20
       blk_mq_queue_reinit_work+0x18/0x110
       blk_mq_queue_reinit_dead+0x1c/0x20
       cpuhp_invoke_callback+0x1f2/0x810
       cpuhp_down_callbacks+0x42/0x80
       _cpu_down+0xb2/0xe0
       freeze_secondary_cpus+0xb6/0x390
       suspend_devices_and_enter+0x3b3/0xa40
       pm_suspend+0x129/0x490
       state_store+0x82/0xf0
       kobj_attr_store+0xf/0x20
       sysfs_kf_write+0x45/0x60
       kernfs_fop_write+0x135/0x1c0
       __vfs_write+0x37/0x160
       vfs_write+0xcd/0x1d0
       SyS_write+0x58/0xc0
       do_syscall_64+0x8f/0x710
       return_from_SYSCALL_64+0x0/0x7a

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(cpu_hotplug.lock);
                               lock(all_q_mutex);
                               lock(cpu_hotplug.lock);
  lock(all_q_mutex);

 *** DEADLOCK ***

8 locks held by step_after_susp/2640:
 #0:  (sb_writers#6){.+.+.+}, at: [] vfs_write+0x1ad/0x1d0
 #1:  (&of->mutex){+.+.+.}, at: [] kernfs_fop_write+0x101/0x1c0
 #2:  (s_active#166){.+.+.+}, at: [] kernfs_fop_write+0x109/0x1c0
 #3:  (pm_mutex){+.+...}, at: [] pm_suspend+0x21d/0x490
 #4:  (acpi_scan_lock){+.+.+.}, at: [] acpi_scan_lock_acquire+0x17/0x20
 #5:  (cpu_add_remove_lock){+.+.+.}, at: [] freeze_secondary_cpus+0x27/0x390
 #6:  (cpu_hotplug.dep_map){++++++}, at: [] cpu_hotplug_begin+0x5/0xe0
 #7:  (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x7f/0xe0

stack backtrace:
CPU: 3 PID: 2640 Comm: step_after_susp Not tainted 4.11.0+ #17
Hardware name: Dell Inc. OptiPlex 7040/0JCTF8, BIOS 1.4.9 09/12/2016
Call Trace:
 dump_stack+0x99/0xce
 print_circular_bug+0x1fa/0x270
 __lock_acquire+0x189a/0x18a0
 lock_acquire+0x11c/0x230
 ? lock_acquire+0x11c/0x230
 ? blk_mq_queue_reinit_work+0x18/0x110
 ? blk_mq_queue_reinit_work+0x18/0x110
 __mutex_lock+0x92/0x990
 ? blk_mq_queue_reinit_work+0x18/0x110
 ? kmem_cache_free+0x2cb/0x330
 ? anon_transport_class_unregister+0x20/0x20
 ? blk_mq_queue_reinit_work+0x110/0x110
 mutex_lock_nested+0x1b/0x20
 ? mutex_lock_nested+0x1b/0x20
 blk_mq_queue_reinit_work+0x18/0x110
 blk_mq_queue_reinit_dead+0x1c/0x20
 cpuhp_invoke_callback+0x1f2/0x810
 ? __flow_cache_shrink+0x160/0x160
 cpuhp_down_callbacks+0x42/0x80
 _cpu_down+0xb2/0xe0
 freeze_secondary_cpus+0xb6/0x390
 suspend_devices_and_enter+0x3b3/0xa40
 ? rcu_read_lock_sched_held+0x79/0x80
 pm_suspend+0x129/0x490
 state_store+0x82/0xf0
 kobj_attr_store+0xf/0x20
 sysfs_kf_write+0x45/0x60
 kernfs_fop_write+0x135/0x1c0
 __vfs_write+0x37/0x160
 ? rcu_read_lock_sched_held+0x79/0x80
 ? rcu_sync_lockdep_assert+0x2f/0x60
 ? __sb_start_write+0xd9/0x1c0
 ? vfs_write+0x1ad/0x1d0
 vfs_write+0xcd/0x1d0
 SyS_write+0x58/0xc0
 ? rcu_read_lock_sched_held+0x79/0x80
 do_syscall_64+0x8f/0x710
 ? trace_hardirqs_on_thunk+0x1a/0x1c
 entry_SYSCALL64_slow_path+0x25/0x25

The cpu hotplug path holds cpu_hotplug.lock and then reinits all
existing queues for blk-mq with all_q_mutex held; however,
blk_mq_init_allocated_queue() takes these two locks in the inverse
order. This is due to commit eabe06595d62 ("blk/mq: Cure cpu hotplug
lock inversion"), which fixes a cpu hotplug lock inversion issue caused
by the hotplug rework. However, the hotplug rework is still work in
progress, lives in a -tip branch, and mainline cannot yet trigger that
splat. The commit breaks Linus's tree in the merge window, so this
patch reverts the lock order and avoids the splat in Linus's tree.

Cc: Jens Axboe
Cc: Peter Zijlstra (Intel)
Cc: Thomas Gleixner
Signed-off-by: Wanpeng Li
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5d4ce7eb8dbf..a7e6b353e4e9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2341,15 +2341,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 
-	mutex_lock(&all_q_mutex);
 	get_online_cpus();
+	mutex_lock(&all_q_mutex);
 
 	list_add_tail(&q->all_q_node, &all_q_list);
 	blk_mq_add_queue_tag_set(set, q);
 	blk_mq_map_swqueue(q, cpu_online_mask);
 
-	put_online_cpus();
 	mutex_unlock(&all_q_mutex);
+	put_online_cpus();
 
 	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
 		int ret;

From 629b1b2e0e6c8285fdc21624af60e8190fa4417e Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven
Date: Sun, 7 May 2017 16:14:44 +0200
Subject: [PATCH 02/10] lightnvm: remove unused rq parameter of
 nvme_nvm_rqtocmd() to kill warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With gcc 4.1.2:

    drivers/nvme/host/lightnvm.c: In function ‘nvme_nvm_submit_io’:
    drivers/nvme/host/lightnvm.c:498: warning: ‘rq’ is used uninitialized in this function

Indeed, since commit 2e13f33a2464fc3a ("lightnvm: create cmd before
allocating request"), the request is passed to nvme_nvm_rqtocmd() before
it is allocated.

Fortunately, as of commit 91276162de9476b8 ("lightnvm: refactor end_io
functions for sync"), nvme_nvm_rqtocmd() no longer uses the passed
request, so this warning is a false positive.

Drop the unused parameter to clean up the code and kill the warning.
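
To see why this is only a false positive, here is a minimal,
self-contained sketch of the same pattern (hypothetical names, not the
actual driver code): the variable is passed by value before it is ever
assigned, so older compilers report it as used uninitialized even
though the callee never reads that argument.

    struct cmd { int opcode; };

    /* The first parameter is deliberately never read, mirroring how
     * nvme_nvm_rqtocmd() stopped using rq.
     */
    static void fill_cmd(int *req, struct cmd *c)
    {
        c->opcode = 42;
    }

    int submit(void)
    {
        int *req;              /* not yet allocated/assigned */
        struct cmd c;

        fill_cmd(req, &c);     /* 'req' is used uninitialized here */
        return c.opcode;
    }

Initializing the variable would also silence the report, but removing
the dead parameter is the cleaner fix.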

Fixes: 2e13f33a2464fc3a ("lightnvm: create cmd before allocating request")
Fixes: 91276162de9476b8 ("lightnvm: refactor end_io functions for sync")
Signed-off-by: Geert Uytterhoeven
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/lightnvm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 8c4adac6fafc..206bfdbc1c98 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -464,8 +464,8 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas,
 	return ret;
 }
 
-static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd,
-				    struct nvme_ns *ns, struct nvme_nvm_command *c)
+static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns,
+				    struct nvme_nvm_command *c)
 {
 	c->ph_rw.opcode = rqd->opcode;
 	c->ph_rw.nsid = cpu_to_le32(ns->ns_id);
@@ -503,7 +503,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 	if (!cmd)
 		return -ENOMEM;
 
-	nvme_nvm_rqtocmd(rq, rqd, ns, cmd);
+	nvme_nvm_rqtocmd(rqd, ns, cmd);
 
 	rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY);
 	if (IS_ERR(rq)) {

From ebd768579552a10682c2cbdfa657779dea62a01d Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Mon, 8 May 2017 11:44:40 +0100
Subject: [PATCH 03/10] blk-mq: make __blk_mq_stop_hw_queues static

Making __blk_mq_stop_hw_queues static fixes the following sparse
warning:

  block/blk-mq.c:6: warning: symbol '__blk_mq_stop_hw_queues' was not
  declared. Should it be static?

Fixes: 2719aa217e0d0 ("blk-mq: don't use sync workqueue flushing from drivers")
Signed-off-by: Colin Ian King
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a7e6b353e4e9..9ae1d9ccf4df 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1236,7 +1236,7 @@ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
 
-void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync)
+static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;

From fba704b494fdc6816a039a66887274b4e5c00eeb Mon Sep 17 00:00:00 2001
From: Rakesh Pandit
Date: Tue, 9 May 2017 08:48:25 -0600
Subject: [PATCH 04/10] nvme: lightnvm: fix memory leak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Free up kmalloc-allocated memory if a failure happens while handling
the L2P table transfer in nvme_nvm_get_l2p_tbl().
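
For readers who want the shape of the fix in isolation, below is a
small, self-contained sketch of the same cleanup pattern in plain C
(hypothetical names and userspace malloc/free rather than the actual
kmalloc'd table in the driver): every failure path taken after the
allocation must reach the common out label so the buffer is freed.

    #include <stdlib.h>
    #include <string.h>

    int copy_table(const char *src, size_t len, int out_of_bounds)
    {
        int ret = 0;
        char *entries = malloc(len);

        if (!entries)
            return -1;          /* nothing allocated yet, plain return is fine */

        memcpy(entries, src, len);

        if (out_of_bounds) {    /* stand-in for the device bounds check */
            ret = -22;          /* -EINVAL */
            goto out;           /* a direct return here is what leaked before */
        }

        /* ... consume entries ... */
    out:
        free(entries);
        return ret;
    }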

Fixes: 8e79b5cb ("lightnvm: move block provisioning to targets")
Signed-off-by: Rakesh Pandit
Reviewed-by: Javier González
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/lightnvm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 206bfdbc1c98..f5df78ed1e10 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -367,7 +367,8 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
 
 		if (unlikely(elba > nvmdev->total_secs)) {
 			pr_err("nvm: L2P data from device is out of bounds!\n");
-			return -EINVAL;
+			ret = -EINVAL;
+			goto out;
 		}
 
 		/* Transform physical address to target address space */

From a66c38a171ed25488debf80247a9e72e1026e82c Mon Sep 17 00:00:00 2001
From: Paolo Valente
Date: Tue, 9 May 2017 11:37:27 +0200
Subject: [PATCH 05/10] block, bfq: use pointer entity->sched_data only if set

In the function __bfq_deactivate_entity, the pointer
entity->sched_data could be used before being properly initialized,
which led to a NULL pointer dereference. This commit fixes the bug by
using this pointer only where it is known to be safe to do so.

Reported-by: Tom Harrison
Tested-by: Tom Harrison
Signed-off-by: Paolo Valente
Signed-off-by: Jens Axboe
---
 block/bfq-wf2q.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index b4fc3e4260b7..8726ede19eef 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1114,12 +1114,21 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity,
 bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
 {
 	struct bfq_sched_data *sd = entity->sched_data;
-	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-	int is_in_service = entity == sd->in_service_entity;
+	struct bfq_service_tree *st;
+	bool is_in_service;
 
 	if (!entity->on_st) /* entity never activated, or already inactive */
 		return false;
 
+	/*
+	 * If we get here, then entity is active, which implies that
+	 * bfq_group_set_parent has already been invoked for the group
+	 * represented by entity. Therefore, the field
+	 * entity->sched_data has been set, and we can safely use it.
+	 */
+	st = bfq_entity_service_tree(entity);
+	is_in_service = entity == sd->in_service_entity;
+
 	if (is_in_service)
 		bfq_calc_finish(entity, entity->service);
 

From 43c1b3d6e536b7b21ed329ae54280d8ba308ba92 Mon Sep 17 00:00:00 2001
From: Paolo Valente
Date: Tue, 9 May 2017 12:54:23 +0200
Subject: [PATCH 06/10] block, bfq: stress that low_latency must be off to
 get max throughput

The introduction of the BFQ and Kyber I/O schedulers has triggered a
new wave of I/O benchmarks. Unfortunately, comments and discussions on
these benchmarks confirm that there is still little awareness that it
is very hard to achieve, at the same time, a low latency and a high
throughput. In particular, virtually all benchmarks measure
throughput, or throughput-related figures of merit, but, for BFQ, they
use the scheduler in its default configuration. This configuration is
geared, instead, toward a low latency. This is evidently a sign that
the BFQ documentation is still too unclear on this important aspect.
This commit addresses the issue by stressing how the BFQ configuration
must be (easily) changed if the only goal is maximum throughput.
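
As a concrete illustration of that recommendation, the knob can be
cleared per device from user space. The sketch below assumes a device
named sda whose queue is using bfq, with the tunable exposed under the
usual iosched sysfs directory; adjust the path for the device actually
being tuned (the common shell equivalent is simply writing "0" to the
same file).

    /* Sketch: trade BFQ's low-latency heuristics for throughput on one device. */
    #include <stdio.h>

    int main(void)
    {
        const char *knob = "/sys/block/sda/queue/iosched/low_latency";
        FILE *f = fopen(knob, "w");

        if (!f) {
            perror(knob);
            return 1;
        }
        fputs("0\n", f);    /* 0 disables the low-latency heuristics, 1 (default) enables them */
        fclose(f);
        return 0;
    }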

Signed-off-by: Paolo Valente
Signed-off-by: Jens Axboe
---
 Documentation/block/bfq-iosched.txt | 17 ++++++++++++++++-
 block/bfq-iosched.c                 |  5 +++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
index 1b87df6cd476..05e2822a80b3 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -11,6 +11,13 @@ controllers), BFQ's main features are:
   groups (switching back to time distribution when needed to keep
   throughput high).
 
+In its default configuration, BFQ privileges latency over
+throughput. So, when needed for achieving a lower latency, BFQ builds
+schedules that may lead to a lower throughput. If your main or only
+goal, for a given device, is to achieve the maximum-possible
+throughput at all times, then do switch off all low-latency heuristics
+for that device, by setting low_latency to 0. Full details in Section 3.
+
 On average CPUs, the current version of BFQ can handle devices
 performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a
 reference, 30-50 KIOPS correspond to very high bandwidths with
@@ -375,11 +382,19 @@ default, low latency mode is enabled. If enabled, interactive and soft
 real-time applications are privileged and experience a lower latency,
 as explained in more detail in the description of how BFQ works.
 
-DO NOT enable this mode if you need full control on bandwidth
+DISABLE this mode if you need full control on bandwidth
 distribution. In fact, if it is enabled, then BFQ automatically
 increases the bandwidth share of privileged applications, as the main
 means to guarantee a lower latency to them.
 
+In addition, as already highlighted at the beginning of this document,
+DISABLE this mode if your only goal is to achieve a high throughput.
+In fact, privileging the I/O of some application over the rest may
+entail a lower throughput. To achieve the highest-possible throughput
+on a non-rotational device, setting slice_idle to 0 may be needed too
+(at the cost of giving up any strong guarantee on fairness and low
+latency).
+
 timeout_sync
 ------------
 
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index bd8499ef157c..08ce45096350 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -56,6 +56,11 @@
  * rotational or flash-based devices, and to get the job done quickly
  * for applications consisting in many I/O-bound processes.
  *
+ * NOTE: if the main or only goal, with a given device, is to achieve
+ * the maximum-possible throughput at all times, then do switch off
+ * all low-latency heuristics for that device, by setting low_latency
+ * to 0.
+ *
  * BFQ is described in [1], where also a reference to the initial, more
  * theoretical paper on BFQ can be found. The interested reader can find
  * in the latter paper full details on the main algorithm, as well as

From 340ff3216799a947fe0b07bed8f0409ffc716be9 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 10 May 2017 07:40:04 -0600
Subject: [PATCH 07/10] elevator: remove redundant warnings on IO scheduler
 switch

We currently warn twice when a switch to an IO scheduler fails. As we
also report the failure in the return value of the sysfs write, remove
the dmesg error messages for that path.

Keep the failure print warning about a failed switch to the
kconfig-selected IO scheduler, as we can't report errors for that in
any other way.
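
For illustration, a minimal user-space sketch of where the error now
surfaces (the device path and bogus scheduler name are assumptions for
the example): the write() to the scheduler attribute itself typically
fails with EINVAL, so the caller already learns about the failed
switch without a dmesg line.

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/sys/block/sda/queue/scheduler", O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* An unknown scheduler name makes the write itself return the error. */
        if (write(fd, "not-a-real-iosched", 18) < 0)
            fprintf(stderr, "switch failed: %s\n", strerror(errno));
        close(fd);
        return 0;
    }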

Signed-off-by: Jens Axboe
---
 block/elevator.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index ab726a5c0bf6..dac99fbfc273 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -1062,10 +1062,8 @@ static int __elevator_change(struct request_queue *q, const char *name)
 
 	strlcpy(elevator_name, name, sizeof(elevator_name));
 	e = elevator_get(strstrip(elevator_name), true);
-	if (!e) {
-		printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
+	if (!e)
 		return -EINVAL;
-	}
 
 	if (q->elevator &&
 	    !strcmp(elevator_name, q->elevator->type->elevator_name)) {
@@ -1105,7 +1103,6 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 	if (!ret)
 		return count;
 
-	printk(KERN_ERR "elevator: switch to %s failed\n", name);
 	return ret;
 }
 

From d3738123986954ba3abbd96b595f5176b50c3f5d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 9 May 2017 11:39:56 -0600
Subject: [PATCH 08/10] blk-stat: don't use this_cpu_ptr() in a preemptable
 section

If PREEMPT_RCU is enabled, rcu_read_lock() isn't strong enough for us
to use this_cpu_ptr() in that section. Use the safer get/put_cpu_ptr()
variants instead.

Reported-by: Mike Galbraith
Fixes: 34dbad5d26e2 ("blk-stat: convert to callback-based statistics reporting")
Signed-off-by: Jens Axboe
---
 block/blk-stat.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/block/blk-stat.c b/block/blk-stat.c
index 6c2f40940439..c52356d90fe3 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -96,13 +96,16 @@ void blk_stat_add(struct request *rq)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
-		if (blk_stat_is_active(cb)) {
-			bucket = cb->bucket_fn(rq);
-			if (bucket < 0)
-				continue;
-			stat = &this_cpu_ptr(cb->cpu_stat)[bucket];
-			__blk_stat_add(stat, value);
-		}
+		if (!blk_stat_is_active(cb))
+			continue;
+
+		bucket = cb->bucket_fn(rq);
+		if (bucket < 0)
+			continue;
+
+		stat = &get_cpu_ptr(cb->cpu_stat)[bucket];
+		__blk_stat_add(stat, value);
+		put_cpu_ptr(cb->cpu_stat);
 	}
 	rcu_read_unlock();
 }

From f36ea50ca0043e7b1204feaf1d2ba6bd68c08d36 Mon Sep 17 00:00:00 2001
From: Wen Xiong
Date: Wed, 10 May 2017 08:54:11 -0500
Subject: [PATCH 09/10] blk-mq: NVMe 512B/4K+T10 DIF/DIX format returns I/O
 error on dd with split op

When formatting an NVMe device to 512B/4K + T10 DIF/DIX, dd with a
split op returns "Input/output error". It looks like the block layer
splits the bio after calling bio_integrity_prep(bio). This patch fixes
the issue.

Below is how we debugged this issue:
(1) format nvme to 4K block size with type 2 DIF
(2) dd with a block size bigger than 1024k, oflag=direct

    dd: error writing '/dev/nvme0n1': Input/output error

We added some debug code in the nvme device driver. It showed us that
the first op and the second op have the same bi and pi address, which
is not correct.

1st op: nvme0n1 Op:Wr slba 0x505 length 0x100, PI ctrl=0x1400,
	dsmgmt=0x0, AT=0x0 & RT=0x505
	Guard 0x00b1, AT 0x0000, RT physical 0x00000505 RT virtual 0x00002828

2nd op: nvme0n1 Op:Wr slba 0x605 length 0x1, PI ctrl=0x1400, dsmgmt=0x0,
	AT=0x0 & RT=0x605  ==> This op fails, as do the subsequent 5 retries.
	Guard 0x00b1, AT 0x0000, RT physical 0x00000605 RT virtual 0x00002828

With the fix, it showed us that both the first op and the second op
have correct bi and pi addresses.

1st op: nvme2n1 Op:Wr slba 0x505 length 0x100, PI ctrl=0x1400,
	dsmgmt=0x0, AT=0x0 & RT=0x505
	Guard 0x5ccb, AT 0x0000, RT physical 0x00000505 RT virtual 0x00002828

2nd op: nvme2n1 Op:Wr slba 0x605 length 0x1, PI ctrl=0x1400, dsmgmt=0x0,
	AT=0x0 & RT=0x605
	Guard 0xab4c, AT 0x0000, RT physical 0x00000605 RT virtual 0x00003028

Signed-off-by: Wen Xiong
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9ae1d9ccf4df..a69ad122ed66 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1554,13 +1554,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	blk_queue_bounce(q, &bio);
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
 		bio_io_error(bio);
 		return BLK_QC_T_NONE;
 	}
 
-	blk_queue_split(q, &bio, q->bio_split);
-
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 		return BLK_QC_T_NONE;

From ed6565e734249ef021d5c13ba34c167eb4e42f62 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 11 May 2017 12:34:38 +0200
Subject: [PATCH 10/10] block: handle partial completions for special payload
 requests

SCSI devices can return short writes on Write Same just like for
normal writes, so we need to handle this case for our special payload
requests as well.

Signed-off-by: Christoph Hellwig
Reported-by: Abdul Haleem
Tested-by: Abdul Haleem
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index c580b0138a7f..c7068520794b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2644,8 +2644,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		return false;
 	}
 
-	WARN_ON_ONCE(req->rq_flags & RQF_SPECIAL_PAYLOAD);
-
 	req->__data_len -= total_bytes;
 
 	/* update sector only for requests with clear definition of sector */
@@ -2658,17 +2656,19 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
 	}
 
-	/*
-	 * If total number of sectors is less than the first segment
-	 * size, something has gone terribly wrong.
-	 */
-	if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
-		blk_dump_rq_flags(req, "request botched");
-		req->__data_len = blk_rq_cur_bytes(req);
-	}
+	if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
+		/*
+		 * If total number of sectors is less than the first segment
+		 * size, something has gone terribly wrong.
+		 */
+		if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
+			blk_dump_rq_flags(req, "request botched");
+			req->__data_len = blk_rq_cur_bytes(req);
+		}
 
-	/* recalculate the number of segments */
-	blk_recalc_rq_segments(req);
+		/* recalculate the number of segments */
+		blk_recalc_rq_segments(req);
+	}
 
 	return true;
 }