From 55180498dfd5f3c7e2d2c0e470f7cede1acee248 Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Sat, 7 Dec 2019 11:00:08 +0800 Subject: md-bitmap: small cleanups In md_bitmap_unplug, bitmap->storage.filemap is double checked. In md_bitmap_daemon_work, bitmap->storage.filemap should be checked before reference. Signed-off-by: Zhiqiang Liu Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 3ad18246fcb3..9860062bdc1e 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1019,8 +1019,6 @@ void md_bitmap_unplug(struct bitmap *bitmap) /* look at each page to see if there are any set bits that need to be * flushed out to disk */ for (i = 0; i < bitmap->storage.file_pages; i++) { - if (!bitmap->storage.filemap) - return; dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); need_write = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); @@ -1338,7 +1336,8 @@ void md_bitmap_daemon_work(struct mddev *mddev) BITMAP_PAGE_DIRTY)) /* bitmap_unplug will handle the rest */ break; - if (test_and_clear_page_attr(bitmap, j, + if (bitmap->storage.filemap && + test_and_clear_page_attr(bitmap, j, BITMAP_PAGE_NEEDWRITE)) { write_page(bitmap, bitmap->storage.filemap[j], 0); } -- cgit From d2c9ad41249ac862d3a3a4d5d56e6b1cd79d8a17 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 20 Dec 2019 15:46:29 +0100 Subject: raid5: remove worker_cnt_per_group argument from alloc_thread_groups We can use "cnt" directly to update conf->worker_cnt_per_group if alloc_thread_groups returns 0. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d4d3b67ffbba..ba00e9877f02 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6598,7 +6598,6 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, - int *worker_cnt_per_group, struct r5worker_group **worker_groups); static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) @@ -6607,7 +6606,7 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) unsigned int new; int err; struct r5worker_group *new_groups, *old_groups; - int group_cnt, worker_cnt_per_group; + int group_cnt; if (len >= PAGE_SIZE) return -EINVAL; @@ -6630,13 +6629,11 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (old_groups) flush_workqueue(raid5_wq); - err = alloc_thread_groups(conf, new, - &group_cnt, &worker_cnt_per_group, - &new_groups); + err = alloc_thread_groups(conf, new, &group_cnt, &new_groups); if (!err) { spin_lock_irq(&conf->device_lock); conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = new; conf->worker_groups = new_groups; spin_unlock_irq(&conf->device_lock); @@ -6672,16 +6669,13 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int alloc_thread_groups(struct r5conf *conf, int cnt, - int *group_cnt, - int *worker_cnt_per_group, +static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, struct r5worker_group **worker_groups) { int i, j, k; ssize_t size; struct r5worker *workers; - *worker_cnt_per_group = cnt; if (cnt == 0) { 
*group_cnt = 0; *worker_groups = NULL; @@ -6882,7 +6876,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) struct disk_info *disk; char pers_name[6]; int i; - int group_cnt, worker_cnt_per_group; + int group_cnt; struct r5worker_group *new_group; int ret; @@ -6928,10 +6922,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) for (i = 0; i < PENDING_IO_MAX; i++) list_add(&conf->pending_data[i].sibling, &conf->free_list); /* Don't enable multi-threading by default*/ - if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, - &new_group)) { + if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) { conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = 0; conf->worker_groups = new_group; } else goto abort; -- cgit From 404659cf1e2570dad3cd117fa3bd71f06ecfd142 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:53 +0100 Subject: md: rename wb stuff Previously, the wb_info_pool and wb_list machinery was introduced to address a potential data inconsistency issue for write-behind devices. Now rename it with serialization-related names, since the same mechanism will be used to address the reordered overlapping write issue for raid1. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 20 +++++++-------- drivers/md/md.c | 70 ++++++++++++++++++++++++++------------------------ drivers/md/md.h | 24 ++++++++--------- drivers/md/raid1.c | 43 ++++++++++++++++--------------- 4 files changed, 80 insertions(+), 77 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 9860062bdc1e..212e75dfebb7 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1789,8 +1789,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1907,7 +1907,7 @@ int md_bitmap_load(struct mddev *mddev) goto out; rdev_for_each(rdev, mddev) - mddev_create_wb_pool(mddev, rdev, true); + mddev_create_serial_pool(mddev, rdev, true); if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); @@ -2474,16 +2474,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; mddev->bitmap_info.max_write_behind = backlog; - if (!backlog && mddev->wb_info_pool) { - /* wb_info_pool is not needed if backlog is zero */ - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; - } else if (backlog && !mddev->wb_info_pool) { - /* wb_info_pool is needed since backlog is not zero */ + if (!backlog && mddev->serial_info_pool) { + /* serial_info_pool is not needed if backlog is zero */ + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } else if (backlog && !mddev->serial_info_pool) { + /* serial_info_pool is needed since backlog is not zero */ struct md_rdev *rdev; rdev_for_each(rdev, mddev) - mddev_create_wb_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, false); } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); diff --git a/drivers/md/md.c b/drivers/md/md.c index 4e7c9f398bc6..ea37bfacb6fb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -125,72 +125,74 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } -static int
rdev_init_wb(struct md_rdev *rdev) +static int rdev_init_serial(struct md_rdev *rdev) { if (rdev->bdev->bd_queue->nr_hw_queues == 1) return 0; - spin_lock_init(&rdev->wb_list_lock); - INIT_LIST_HEAD(&rdev->wb_list); - init_waitqueue_head(&rdev->wb_io_wait); - set_bit(WBCollisionCheck, &rdev->flags); + spin_lock_init(&rdev->serial_list_lock); + INIT_LIST_HEAD(&rdev->serial_list); + init_waitqueue_head(&rdev->serial_io_wait); + set_bit(CollisionCheck, &rdev->flags); return 1; } /* - * Create wb_info_pool if rdev is the first multi-queue device flaged + * Create serial_info_pool if rdev is the first multi-queue device flaged * with writemostly, also write-behind mode is enabled. */ -void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, +void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) { if (mddev->bitmap_info.max_write_behind == 0) return; - if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev)) + if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_serial(rdev)) return; - if (mddev->wb_info_pool == NULL) { + if (mddev->serial_info_pool == NULL) { unsigned int noio_flag; if (!is_suspend) mddev_suspend(mddev); noio_flag = memalloc_noio_save(); - mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS, - sizeof(struct wb_info)); + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); memalloc_noio_restore(noio_flag); - if (!mddev->wb_info_pool) - pr_err("can't alloc memory pool for writemostly\n"); + if (!mddev->serial_info_pool) + pr_err("can't alloc memory pool for serialization\n"); if (!is_suspend) mddev_resume(mddev); } } -EXPORT_SYMBOL_GPL(mddev_create_wb_pool); +EXPORT_SYMBOL_GPL(mddev_create_serial_pool); /* - * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck. + * Destroy serial_info_pool if rdev is the last device flaged with + * CollisionCheck. */ -static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev) +static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { - if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags)) + if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) return; - if (mddev->wb_info_pool) { + if (mddev->serial_info_pool) { struct md_rdev *temp; int num = 0; /* - * Check if other rdevs need wb_info_pool. + * Check if other rdevs need serial_info_pool. 
*/ rdev_for_each(temp, mddev) if (temp != rdev && - test_bit(WBCollisionCheck, &temp->flags)) + test_bit(CollisionCheck, &temp->flags)) num++; if (!num) { mddev_suspend(rdev->mddev); - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; mddev_resume(rdev->mddev); } } @@ -2337,7 +2339,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) - mddev_create_wb_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, false); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2375,7 +2377,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); - mddev_destroy_wb_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2888,10 +2890,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); - mddev_create_wb_pool(rdev->mddev, rdev, false); + mddev_create_serial_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_wb_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { @@ -5773,14 +5775,14 @@ int md_run(struct mddev *mddev) rdev_for_each(rdev, mddev) { if (test_bit(WriteMostly, &rdev->flags) && - rdev_init_wb(rdev)) + rdev_init_serial(rdev)) creat_pool = true; } - if (creat_pool && mddev->wb_info_pool == NULL) { - mddev->wb_info_pool = - mempool_create_kmalloc_pool(NR_WB_INFOS, - sizeof(struct wb_info)); - if (!mddev->wb_info_pool) { + if (creat_pool && mddev->serial_info_pool == NULL) { + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); + if (!mddev->serial_info_pool) { err = -ENOMEM; goto bitmap_abort; } @@ -6025,8 +6027,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index 5f86f8adb0a4..7b811645cec7 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -111,11 +111,11 @@ struct md_rdev { */ /* - * The members for check collision of write behind IOs. + * The members for check collision of write IOs. */ - struct list_head wb_list; - spinlock_t wb_list_lock; - wait_queue_head_t wb_io_wait; + struct list_head serial_list; + spinlock_t serial_list_lock; + wait_queue_head_t serial_io_wait; struct work_struct del_work; /* used for delayed sysfs removal */ @@ -201,9 +201,9 @@ enum flag_bits { * it didn't fail, so don't use FailFast * any more for metadata */ - WBCollisionCheck, /* - * multiqueue device should check if there - * is collision between write behind bios. + CollisionCheck, /* + * check if there is collision between raid1 + * serial bios. 
*/ }; @@ -263,9 +263,9 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; -#define NR_WB_INFOS 8 -/* record current range of write behind IOs */ -struct wb_info { +#define NR_SERIAL_INFOS 8 +/* record current range of serialize IOs */ +struct serial_info { sector_t lo; sector_t hi; struct list_head list; @@ -487,7 +487,7 @@ struct mddev { */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ - mempool_t *wb_info_pool; + mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ @@ -737,7 +737,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); -extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, +extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 201fd8aec59a..0439f674ab14 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -50,17 +50,17 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" -static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct wb_info *wi, *temp_wi; + struct serial_info *wi, *temp_wi; unsigned long flags; int ret = 0; struct mddev *mddev = rdev->mddev; - wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO); + wi = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - spin_lock_irqsave(&rdev->wb_list_lock, flags); - list_for_each_entry(temp_wi, &rdev->wb_list, list) { + spin_lock_irqsave(&rdev->serial_list_lock, flags); + list_for_each_entry(temp_wi, &rdev->serial_list, list) { /* collision happened */ if (hi > temp_wi->lo && lo < temp_wi->hi) { ret = -EBUSY; @@ -71,34 +71,34 @@ static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) if (!ret) { wi->lo = lo; wi->hi = hi; - list_add(&wi->list, &rdev->wb_list); + list_add(&wi->list, &rdev->serial_list); } else - mempool_free(wi, mddev->wb_info_pool); - spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + mempool_free(wi, mddev->serial_info_pool); + spin_unlock_irqrestore(&rdev->serial_list_lock, flags); return ret; } -static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct wb_info *wi; + struct serial_info *wi; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; - spin_lock_irqsave(&rdev->wb_list_lock, flags); - list_for_each_entry(wi, &rdev->wb_list, list) + spin_lock_irqsave(&rdev->serial_list_lock, flags); + list_for_each_entry(wi, &rdev->serial_list, list) if (hi == wi->hi && lo == wi->lo) { list_del(&wi->list); - mempool_free(wi, mddev->wb_info_pool); + mempool_free(wi, mddev->serial_info_pool); found = 1; break; } if (!found) - WARN(1, "The write behind IO is not recorded\n"); - spin_unlock_irqrestore(&rdev->wb_list_lock, flags); - wake_up(&rdev->wb_io_wait); + WARN(1, "The write IO is not recorded for serialization\n"); + 
spin_unlock_irqrestore(&rdev->serial_list_lock, flags); + wake_up(&rdev->serial_io_wait); } /* @@ -499,11 +499,11 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - if (test_bit(WBCollisionCheck, &rdev->flags)) { + if (test_bit(CollisionCheck, &rdev->flags)) { sector_t lo = r1_bio->sector; sector_t hi = r1_bio->sector + r1_bio->sectors; - remove_wb(rdev, lo, hi); + remove_serial(rdev, lo, hi); } if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -1508,12 +1508,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (r1_bio->behind_master_bio) { struct md_rdev *rdev = conf->mirrors[i].rdev; - if (test_bit(WBCollisionCheck, &rdev->flags)) { + if (test_bit(CollisionCheck, &rdev->flags)) { sector_t lo = r1_bio->sector; sector_t hi = r1_bio->sector + r1_bio->sectors; - wait_event(rdev->wb_io_wait, - check_and_add_wb(rdev, lo, hi) == 0); + wait_event(rdev->serial_io_wait, + check_and_add_serial(rdev, lo, hi) + == 0); } if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); -- cgit From 3e173ab55b990d2b4ceb90bf55a88a96eb88598e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:54 +0100 Subject: md: fix a typo s/creat/create It actually means create here, so fix the typo. Reported-by: Song Liu Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index ea37bfacb6fb..8f5def0cb60a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5771,14 +5771,14 @@ int md_run(struct mddev *mddev) goto bitmap_abort; if (mddev->bitmap_info.max_write_behind > 0) { - bool creat_pool = false; + bool create_pool = false; rdev_for_each(rdev, mddev) { if (test_bit(WriteMostly, &rdev->flags) && rdev_init_serial(rdev)) - creat_pool = true; + create_pool = true; } - if (creat_pool && mddev->serial_info_pool == NULL) { + if (create_pool && mddev->serial_info_pool == NULL) { mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); -- cgit From 11d3a9f65018c9fb3d4f2032aec76af2ba98431c Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:55 +0100 Subject: md: prepare for enabling raid1 io serialization 1. The related resources (spin_lock, list and waitqueue) are needed to address the raid1 reordered-overlap issue too; in this case, rdev is set to NULL for mddev_create/destroy_serial_pool, which implies that all rdevs need to handle these resources. Also add "is_suspend" to mddev_destroy_serial_pool, since it will be called in a suspended situation; this also makes create and destroy pool take the same arguments. 2. Introduce rdevs_init_serial, which is called if raid1 io serialization is enabled, since all rdevs need to init the related resources. 3. rdev_init_serial and clear_bit(CollisionCheck, &rdev->flags) should be called between suspend and resume. There is no need to export mddev_create_serial_pool since it is only called in the md-mod module.
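As an aside, a minimal sketch of how the NULL-rdev convention is expected to be used by callers (hypothetical call sites for illustration; the serialize_policy patch later in this series uses exactly this shape):

/* Enable serialization for a single write-mostly rdev; the array is
 * not currently suspended, so the helper suspends/resumes itself. */
mddev_create_serial_pool(mddev, rdev, false);

/* Enable serialization for every rdev in the array; the caller has
 * already suspended the array, hence is_suspend == true. */
mddev_create_serial_pool(mddev, NULL, true);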
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 65 +++++++++++++++++++++++++++++++++++++++------------------ drivers/md/md.h | 2 +- 2 files changed, 46 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8f5def0cb60a..b9b041b7e196 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -127,9 +127,6 @@ static inline int speed_max(struct mddev *mddev) static int rdev_init_serial(struct md_rdev *rdev) { - if (rdev->bdev->bd_queue->nr_hw_queues == 1) - return 0; - spin_lock_init(&rdev->serial_list_lock); INIT_LIST_HEAD(&rdev->serial_list); init_waitqueue_head(&rdev->serial_io_wait); @@ -138,17 +135,29 @@ static int rdev_init_serial(struct md_rdev *rdev) return 1; } +static void rdevs_init_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + if (test_bit(CollisionCheck, &rdev->flags)) + continue; + rdev_init_serial(rdev); + } +} + /* - * Create serial_info_pool if rdev is the first multi-queue device flaged - * with writemostly, also write-behind mode is enabled. + * Create serial_info_pool for raid1 under conditions: + * 1. rdev is the first multi-queue device flaged with writemostly, + * also write-behind mode is enabled. + * 2. rdev is NULL, means want to enable serialization for all rdevs. */ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) + bool is_suspend) { - if (mddev->bitmap_info.max_write_behind == 0) - return; - - if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_serial(rdev)) + if (rdev && (mddev->bitmap_info.max_write_behind == 0 || + rdev->bdev->bd_queue->nr_hw_queues == 1 || + !test_bit(WriteMostly, &rdev->flags))) return; if (mddev->serial_info_pool == NULL) { @@ -156,6 +165,10 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, if (!is_suspend) mddev_suspend(mddev); + if (!rdev) + rdevs_init_serial(mddev); + else + rdev_init_serial(rdev); noio_flag = memalloc_noio_save(); mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, @@ -167,15 +180,16 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mddev_resume(mddev); } } -EXPORT_SYMBOL_GPL(mddev_create_serial_pool); /* * Destroy serial_info_pool if rdev is the last device flaged with - * CollisionCheck. + * CollisionCheck, or rdev is NULL when we disable serialization + * for normal raid1. */ -static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) +static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) { - if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) + if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; if (mddev->serial_info_pool) { @@ -185,16 +199,27 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) /* * Check if other rdevs need serial_info_pool. 
*/ - rdev_for_each(temp, mddev) + if (!is_suspend) + mddev_suspend(mddev); + rdev_for_each(temp, mddev) { + if (!rdev) { + clear_bit(CollisionCheck, &temp->flags); + continue; + } + if (temp != rdev && test_bit(CollisionCheck, &temp->flags)) num++; - if (!num) { - mddev_suspend(rdev->mddev); + } + + if (rdev) + clear_bit(CollisionCheck, &rdev->flags); + if (!rdev || !num) { mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; - mddev_resume(rdev->mddev); } + if (!is_suspend) + mddev_resume(mddev); } } @@ -2377,7 +2402,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); - mddev_destroy_serial_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev, false); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2893,7 +2918,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) mddev_create_serial_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_serial_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev, false); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 7b811645cec7..de04a8d3a67a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -738,7 +738,7 @@ extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend); + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); -- cgit From 3938f5fb82aedbf39792ffee448c61c819e6ab38 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:56 +0100 Subject: md: add serialize_policy sysfs node for raid1 With the new sysfs node, we can use it to control if raid1 array wants io serialization or not. So mddev_create_serial_pool and mddev_destroy_serial_pool are called in serialize_policy_store to enable or disable the serialization. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/md.h | 1 + 2 files changed, 53 insertions(+) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index b9b041b7e196..796cf70e1c9f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5304,6 +5304,57 @@ static struct md_sysfs_entry md_fail_last_dev = __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, fail_last_dev_store); +static ssize_t serialize_policy_show(struct mddev *mddev, char *page) +{ + if (mddev->pers == NULL || (mddev->pers->level != 1)) + return sprintf(page, "n/a\n"); + else + return sprintf(page, "%d\n", mddev->serialize_policy); +} + +/* + * Setting serialize_policy to true to enforce write IO is not reordered + * for raid1. 
+ */ +static ssize_t +serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return err; + + if (value == mddev->serialize_policy) + return len; + + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->pers == NULL || (mddev->pers->level != 1)) { + pr_err("md: serialize_policy is only effective for raid1\n"); + err = -EINVAL; + goto unlock; + } + + mddev_suspend(mddev); + if (value) + mddev_create_serial_pool(mddev, NULL, true); + else + mddev_destroy_serial_pool(mddev, NULL, true); + mddev->serialize_policy = value; + mddev_resume(mddev); +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_serialize_policy = +__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, + serialize_policy_store); + + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, @@ -5321,6 +5372,7 @@ static struct attribute *md_default_attrs[] = { &max_corr_read_errors.attr, &md_consistency_policy.attr, &md_fail_last_dev.attr, + &md_serialize_policy.attr, NULL, }; diff --git a/drivers/md/md.h b/drivers/md/md.h index de04a8d3a67a..f51a3afaee1b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -494,6 +494,7 @@ struct mddev { bool has_superblocks:1; bool fail_last_dev:1; + bool serialize_policy:1; }; enum recovery_flags { -- cgit From de31ee949739aba9ce7dbb8b10e72c6fce0e76c7 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:57 +0100 Subject: md: reorganize mddev_create/destroy_serial_pool So far, IO serialization is used in two scenarios: 1. raid1 which enables write-behind mode and has an rdev in the array which is a multi-queue device flagged with writemostly. 2. IO serialization is enabled or disabled by changing serialize_policy. So introduce rdev_need_serial to check for the first scenario. For 1, IO serialization is enabled automatically, while 2 is controlled manually. It is also possible that both scenarios are true, so when creating the serial pool, rdev/rdevs_init_serial should be separated from the check of whether the pool already exists. Then, when destroying the pool, we need to check whether the pool is still needed by other rdevs due to the first scenario. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 71 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 29 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 796cf70e1c9f..788559f42d43 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -147,28 +147,40 @@ static void rdevs_init_serial(struct mddev *mddev) } /* - * Create serial_info_pool for raid1 under conditions: - * 1. rdev is the first multi-queue device flaged with writemostly, - * also write-behind mode is enabled. - * 2. rdev is NULL, means want to enable serialization for all rdevs. + * rdev needs to enable serial stuffs if it meets the conditions: + * 1. it is multi-queue device flaged with writemostly. + * 2. the write-behind mode is enabled. + */ +static int rdev_need_serial(struct md_rdev *rdev) +{ + return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && + rdev->bdev->bd_queue->nr_hw_queues != 1 && + test_bit(WriteMostly, &rdev->flags)); +} + +/* + * Init resource for rdev(s), then create serial_info_pool if: + * 1. rdev is the first device which return true from rdev_enable_serial. + * 2. rdev is NULL, means we want to enable serialization for all rdevs.
*/ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) { - if (rdev && (mddev->bitmap_info.max_write_behind == 0 || - rdev->bdev->bd_queue->nr_hw_queues == 1 || - !test_bit(WriteMostly, &rdev->flags))) + if (rdev && !rdev_need_serial(rdev) && + !test_bit(CollisionCheck, &rdev->flags)) return; + if (!is_suspend) + mddev_suspend(mddev); + + if (!rdev) + rdevs_init_serial(mddev); + else + rdev_init_serial(rdev); + if (mddev->serial_info_pool == NULL) { unsigned int noio_flag; - if (!is_suspend) - mddev_suspend(mddev); - if (!rdev) - rdevs_init_serial(mddev); - else - rdev_init_serial(rdev); noio_flag = memalloc_noio_save(); mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, @@ -176,15 +188,16 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, memalloc_noio_restore(noio_flag); if (!mddev->serial_info_pool) pr_err("can't alloc memory pool for serialization\n"); - if (!is_suspend) - mddev_resume(mddev); } + if (!is_suspend) + mddev_resume(mddev); } /* - * Destroy serial_info_pool if rdev is the last device flaged with - * CollisionCheck, or rdev is NULL when we disable serialization - * for normal raid1. + * Free resource from rdev(s), and destroy serial_info_pool under conditions: + * 1. rdev is the last device flaged with CollisionCheck. + * 2. when bitmap is destroyed while policy is not enabled. + * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) @@ -194,27 +207,27 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, if (mddev->serial_info_pool) { struct md_rdev *temp; - int num = 0; + int num = 0; /* used to track if other rdevs need the pool */ - /* - * Check if other rdevs need serial_info_pool. - */ if (!is_suspend) mddev_suspend(mddev); rdev_for_each(temp, mddev) { if (!rdev) { - clear_bit(CollisionCheck, &temp->flags); - continue; - } - - if (temp != rdev && - test_bit(CollisionCheck, &temp->flags)) + if (!rdev_need_serial(temp)) + clear_bit(CollisionCheck, &temp->flags); + else + num++; + } else if (temp != rdev && + test_bit(CollisionCheck, &temp->flags)) num++; } if (rdev) clear_bit(CollisionCheck, &rdev->flags); - if (!rdev || !num) { + + if (num) + pr_info("The mempool could be used by other devices\n"); + else { mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; } -- cgit From 69df9cfc70421fb7949e8f7a19bfc36600b5522b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:58 +0100 Subject: raid1: serialize the overlap write Before dispatching a write bio, a raid1 array which enables serialize_policy needs to check whether an overlap exists between this bio and previous in-flight bios. If there is an overlap, it has to wait until the collision has disappeared.
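For reference, the collision test used by check_and_add_serial() is the standard half-open interval intersection check. A standalone C model (illustrative only; sector_t is typedef'd here just to keep the sketch self-contained):

#include <stdbool.h>

typedef unsigned long long sector_t;

/* Two in-flight write ranges [lo1, hi1) and [lo2, hi2) collide iff
 * each range starts before the other one ends. */
static bool write_ranges_collide(sector_t lo1, sector_t hi1,
                                 sector_t lo2, sector_t hi2)
{
        return hi1 > lo2 && lo1 < hi2;
}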
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid1.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0439f674ab14..3ad2f5a59d08 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -430,6 +430,8 @@ static void raid1_end_write_request(struct bio *bio) int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; bool discard_error; + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; @@ -499,12 +501,8 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - if (test_bit(CollisionCheck, &rdev->flags)) { - sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; - + if (test_bit(CollisionCheck, &rdev->flags)) remove_serial(rdev, lo, hi); - } if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -527,7 +525,8 @@ static void raid1_end_write_request(struct bio *bio) call_bio_endio(r1_bio); } } - } + } else if (rdev->mddev->serialize_policy) + remove_serial(rdev, lo, hi); if (r1_bio->bios[mirror] == NULL) rdev_dec_pending(rdev, conf->mddev); @@ -1337,6 +1336,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct raid1_plug_cb *plug = NULL; int first_clone; int max_sectors; + sector_t lo, hi; if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1364,6 +1364,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; + lo = r1_bio->sector; + hi = r1_bio->sector + r1_bio->sectors; if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); @@ -1479,6 +1481,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; + struct md_rdev *rdev = conf->mirrors[i].rdev; if (!r1_bio->bios[i]) continue; @@ -1506,19 +1509,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (r1_bio->behind_master_bio) { - struct md_rdev *rdev = conf->mirrors[i].rdev; - - if (test_bit(CollisionCheck, &rdev->flags)) { - sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; - + if (test_bit(CollisionCheck, &rdev->flags)) wait_event(rdev->serial_io_wait, check_and_add_serial(rdev, lo, hi) == 0); - } if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); - } + } else if (mddev->serialize_policy) + wait_event(rdev->serial_io_wait, + check_and_add_serial(rdev, lo, hi) == 0); r1_bio->bios[i] = mbio; -- cgit From 4d26d32fe4dafd29e168addb7c11949a36e7e5f8 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:59 +0100 Subject: md: don't destroy serial_info_pool if serialize_policy is true The serial_info_pool is needed if array sets serialize_policy to true, so don't destroy it. 
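Put differently, after this change the pool has two independent users; a compact model of the resulting lifetime rule (illustrative C, not the kernel code):

#include <stdbool.h>

/* The serial_info_pool may be torn down only when neither user
 * (write-behind backlog, serialize_policy) still needs it. */
static bool pool_still_needed(bool backlog_nonzero, bool serialize_policy)
{
        return backlog_nonzero || serialize_policy;
}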
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 212e75dfebb7..92f0d45946e8 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1789,8 +1789,10 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; + if (!mddev->serialize_policy) { + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -2476,8 +2478,10 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_info.max_write_behind = backlog; if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; + if (!mddev->serialize_policy) { + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ struct md_rdev *rdev; -- cgit From 69b00b5bb23552d43e8bbed73ef6624604bb94a2 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:49:00 +0100 Subject: md: introduce a new struct for IO serialization Obviously, IO serialization can degrade performance a lot. To reduce the degradation, an rb interval tree is added in raid1 to speed up the collision check. So an rb root is needed in md_rdev; abstract all the serialization-related members into a new struct (serial_in_rdev) and embed it into md_rdev. Of course, we need to free the struct if it is not needed anymore, so rdev/rdevs_uninit_serial are added accordingly. They should be called when the memory pool is destroyed or when memory can't be allocated. We also need to consider calling mddev_destroy_serial_pool in case serialize_policy/write-behind is disabled, the bitmap is destroyed, or in __md_stop_writes.
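The per-rdev state introduced here bundles the members that were previously loose in md_rdev; its shape, as defined in the md.h hunk below (comments added for exposition):

struct serial_in_rdev {
        struct rb_root_cached serial_rb;        /* interval tree of in-flight writes */
        spinlock_t serial_lock;                 /* protects serial_rb */
        wait_queue_head_t serial_io_wait;       /* writers sleep here on collision */
};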
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 12 +++----- drivers/md/md.c | 80 ++++++++++++++++++++++++++++++++++++++------------ drivers/md/md.h | 26 ++++++++++------ drivers/md/raid1.c | 61 ++++++++++++++++++++------------------ 4 files changed, 116 insertions(+), 63 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 92f0d45946e8..e230052c2107 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1789,10 +1789,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); - if (!mddev->serialize_policy) { - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; - } + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, true); mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -2478,10 +2476,8 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_info.max_write_behind = backlog; if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ - if (!mddev->serialize_policy) { - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; - } + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, false); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ struct md_rdev *rdev; diff --git a/drivers/md/md.c b/drivers/md/md.c index 788559f42d43..9c4e61c988ac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -125,25 +125,59 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } +static void rdev_uninit_serial(struct md_rdev *rdev) +{ + if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) + return; + + kfree(rdev->serial); + rdev->serial = NULL; +} + +static void rdevs_uninit_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + rdev_uninit_serial(rdev); +} + static int rdev_init_serial(struct md_rdev *rdev) { - spin_lock_init(&rdev->serial_list_lock); - INIT_LIST_HEAD(&rdev->serial_list); - init_waitqueue_head(&rdev->serial_io_wait); + struct serial_in_rdev *serial = NULL; + + if (test_bit(CollisionCheck, &rdev->flags)) + return 0; + + serial = kmalloc(sizeof(struct serial_in_rdev), GFP_KERNEL); + if (!serial) + return -ENOMEM; + + spin_lock_init(&serial->serial_lock); + serial->serial_rb = RB_ROOT_CACHED; + init_waitqueue_head(&serial->serial_io_wait); + rdev->serial = serial; set_bit(CollisionCheck, &rdev->flags); - return 1; + return 0; } -static void rdevs_init_serial(struct mddev *mddev) +static int rdevs_init_serial(struct mddev *mddev) { struct md_rdev *rdev; + int ret = 0; rdev_for_each(rdev, mddev) { - if (test_bit(CollisionCheck, &rdev->flags)) - continue; - rdev_init_serial(rdev); + ret = rdev_init_serial(rdev); + if (ret) + break; } + + /* Free all resources if pool is not existed */ + if (ret && !mddev->serial_info_pool) + rdevs_uninit_serial(mddev); + + return ret; } /* @@ -166,6 +200,8 @@ static int rdev_need_serial(struct md_rdev *rdev) void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) { + int ret = 0; + if (rdev && !rdev_need_serial(rdev) && !test_bit(CollisionCheck, &rdev->flags)) return; @@ -174,9 +210,11 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mddev_suspend(mddev); if (!rdev) - rdevs_init_serial(mddev); + ret = rdevs_init_serial(mddev); else - rdev_init_serial(rdev); + ret = 
rdev_init_serial(rdev); + if (ret) + goto abort; if (mddev->serial_info_pool == NULL) { unsigned int noio_flag; @@ -186,9 +224,13 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); memalloc_noio_restore(noio_flag); - if (!mddev->serial_info_pool) + if (!mddev->serial_info_pool) { + rdevs_uninit_serial(mddev); pr_err("can't alloc memory pool for serialization\n"); + } } + +abort: if (!is_suspend) mddev_resume(mddev); } @@ -199,8 +241,8 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ -static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) { if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; @@ -213,8 +255,9 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mddev_suspend(mddev); rdev_for_each(temp, mddev) { if (!rdev) { - if (!rdev_need_serial(temp)) - clear_bit(CollisionCheck, &temp->flags); + if (!mddev->serialize_policy || + !rdev_need_serial(temp)) + rdev_uninit_serial(temp); else num++; } else if (temp != rdev && @@ -223,7 +266,7 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, } if (rdev) - clear_bit(CollisionCheck, &rdev->flags); + rdev_uninit_serial(rdev); if (num) pr_info("The mempool could be used by other devices\n"); @@ -6117,8 +6160,9 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; + /* disable policy to guarantee rdevs free resources for serialization */ + mddev->serialize_policy = 0; + mddev_destroy_serial_pool(mddev, NULL, true); } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index f51a3afaee1b..acd681939112 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -32,6 +32,16 @@ * be retried. */ #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) + +/* + * The struct embedded in rdev is used to serialize IO. + */ +struct serial_in_rdev { + struct rb_root_cached serial_rb; + spinlock_t serial_lock; + wait_queue_head_t serial_io_wait; +}; + /* * MD's 'extended' device */ @@ -110,12 +120,7 @@ struct md_rdev { * in superblock. */ - /* - * The members for check collision of write IOs. 
- */ - struct list_head serial_list; - spinlock_t serial_list_lock; - wait_queue_head_t serial_io_wait; + struct serial_in_rdev *serial; /* used for raid1 io serialization */ struct work_struct del_work; /* used for delayed sysfs removal */ @@ -266,9 +271,10 @@ enum mddev_sb_flags { #define NR_SERIAL_INFOS 8 /* record current range of serialize IOs */ struct serial_info { - sector_t lo; - sector_t hi; - struct list_head list; + struct rb_node node; + sector_t start; /* start sector of rb node */ + sector_t last; /* end sector of rb node */ + sector_t _subtree_last; /* highest sector in subtree of rb node */ }; struct mddev { @@ -740,6 +746,8 @@ extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend); +extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3ad2f5a59d08..5c6a03747448 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -50,55 +51,58 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) +INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, + START, LAST, static inline, raid1_rb); + static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct serial_info *wi, *temp_wi; + struct serial_info *si; unsigned long flags; int ret = 0; struct mddev *mddev = rdev->mddev; + struct serial_in_rdev *serial = rdev->serial; - wi = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - - spin_lock_irqsave(&rdev->serial_list_lock, flags); - list_for_each_entry(temp_wi, &rdev->serial_list, list) { - /* collision happened */ - if (hi > temp_wi->lo && lo < temp_wi->hi) { - ret = -EBUSY; - break; - } - } + si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); + spin_lock_irqsave(&serial->serial_lock, flags); + /* collision happened */ + if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) + ret = -EBUSY; if (!ret) { - wi->lo = lo; - wi->hi = hi; - list_add(&wi->list, &rdev->serial_list); + si->start = lo; + si->last = hi; + raid1_rb_insert(si, &serial->serial_rb); } else - mempool_free(wi, mddev->serial_info_pool); - spin_unlock_irqrestore(&rdev->serial_list_lock, flags); + mempool_free(si, mddev->serial_info_pool); + spin_unlock_irqrestore(&serial->serial_lock, flags); return ret; } static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct serial_info *wi; + struct serial_info *si; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; - - spin_lock_irqsave(&rdev->serial_list_lock, flags); - list_for_each_entry(wi, &rdev->serial_list, list) - if (hi == wi->hi && lo == wi->lo) { - list_del(&wi->list); - mempool_free(wi, mddev->serial_info_pool); + struct serial_in_rdev *serial = rdev->serial; + + spin_lock_irqsave(&serial->serial_lock, flags); + for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); + si; si = raid1_rb_iter_next(si, lo, hi)) { + if (si->start == lo && si->last == hi) { + raid1_rb_remove(si, &serial->serial_rb); + mempool_free(si, mddev->serial_info_pool); found = 1; break; } - + } if (!found) WARN(1, "The write IO is not 
recorded for serialization\n"); - spin_unlock_irqrestore(&rdev->serial_list_lock, flags); - wake_up(&rdev->serial_io_wait); + spin_unlock_irqrestore(&serial->serial_lock, flags); + wake_up(&serial->serial_io_wait); } /* @@ -1482,6 +1486,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; struct md_rdev *rdev = conf->mirrors[i].rdev; + struct serial_in_rdev *serial = rdev->serial; if (!r1_bio->bios[i]) continue; @@ -1510,13 +1515,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (r1_bio->behind_master_bio) { if (test_bit(CollisionCheck, &rdev->flags)) - wait_event(rdev->serial_io_wait, + wait_event(serial->serial_io_wait, check_and_add_serial(rdev, lo, hi) == 0); if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } else if (mddev->serialize_policy) - wait_event(rdev->serial_io_wait, + wait_event(serial->serial_io_wait, check_and_add_serial(rdev, lo, hi) == 0); r1_bio->bios[i] = mbio; -- cgit From 025471f9f50fede6527c70336484becbcb2aff28 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:49:01 +0100 Subject: md/raid1: use bucket based mechanism for IO serialization Since raid1 already uses a bucket-based mechanism to reduce the conflict between write IO and resync IO, IO serialization performance can be improved by reusing the same mechanism. To align with the barrier bucket mechanism, we created arrays (with the same number of entries, BARRIER_BUCKETS_NR) for the spinlock, rb tree and waitqueue. Then we can reduce lock competition with multiple spinlocks, boost search performance with multiple rb trees, and also reduce the thundering herd problem with multiple waitqueues. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 18 +++++++++++++----- drivers/md/raid1.c | 9 ++++++--- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9c4e61c988ac..4824d50526fa 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -130,7 +130,7 @@ static void rdev_uninit_serial(struct md_rdev *rdev) if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) return; - kfree(rdev->serial); + kvfree(rdev->serial); rdev->serial = NULL; } @@ -144,18 +144,26 @@ static void rdevs_uninit_serial(struct mddev *mddev) static int rdev_init_serial(struct md_rdev *rdev) { + /* serial_nums equals with BARRIER_BUCKETS_NR */ + int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); struct serial_in_rdev *serial = NULL; if (test_bit(CollisionCheck, &rdev->flags)) return 0; - serial = kmalloc(sizeof(struct serial_in_rdev), GFP_KERNEL); + serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, + GFP_KERNEL); if (!serial) return -ENOMEM; - spin_lock_init(&serial->serial_lock); - serial->serial_rb = RB_ROOT_CACHED; - init_waitqueue_head(&serial->serial_io_wait); + for (i = 0; i < serial_nums; i++) { + struct serial_in_rdev *serial_tmp = &serial[i]; + + spin_lock_init(&serial_tmp->serial_lock); + serial_tmp->serial_rb = RB_ROOT_CACHED; + init_waitqueue_head(&serial_tmp->serial_io_wait); + } + rdev->serial = serial; set_bit(CollisionCheck, &rdev->flags); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5c6a03747448..48d553d7989a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -62,7 +62,8 @@ static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) unsigned long flags; int ret = 0; struct mddev *mddev = rdev->mddev; - struct
serial_in_rdev *serial = rdev->serial; + int idx = sector_to_idx(lo); + struct serial_in_rdev *serial = &rdev->serial[idx]; si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); @@ -87,7 +88,8 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; - struct serial_in_rdev *serial = rdev->serial; + int idx = sector_to_idx(lo); + struct serial_in_rdev *serial = &rdev->serial[idx]; spin_lock_irqsave(&serial->serial_lock, flags); for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); @@ -1486,7 +1488,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; struct md_rdev *rdev = conf->mirrors[i].rdev; - struct serial_in_rdev *serial = rdev->serial; + int idx = sector_to_idx(lo); + struct serial_in_rdev *serial = &rdev->serial[idx]; if (!r1_bio->bios[i]) continue; -- cgit From d0d2d8ba0494655a01b97542c083e51b29cf8637 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:49:02 +0100 Subject: md/raid1: introduce wait_for_serialization Previously, we called check_and_add_serial when serialization was enabled for a write IO, but it could allocate and free memory back and forth. Now, let's just get an element from the memory pool with the new function, then insert the node into the rb tree if no collision happens. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid1.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 48d553d7989a..cd810e195086 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -56,32 +56,43 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, START, LAST, static inline, raid1_rb); -static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) +static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, + struct serial_info *si, int idx) { - struct serial_info *si; unsigned long flags; int ret = 0; - struct mddev *mddev = rdev->mddev; - int idx = sector_to_idx(lo); + sector_t lo = r1_bio->sector; + sector_t hi = lo + r1_bio->sectors; struct serial_in_rdev *serial = &rdev->serial[idx]; - si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - spin_lock_irqsave(&serial->serial_lock, flags); /* collision happened */ if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) ret = -EBUSY; - if (!ret) { + else { si->start = lo; si->last = hi; raid1_rb_insert(si, &serial->serial_rb); - } else - mempool_free(si, mddev->serial_info_pool); + } spin_unlock_irqrestore(&serial->serial_lock, flags); return ret; } +static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + struct mddev *mddev = rdev->mddev; + struct serial_info *si; + int idx = sector_to_idx(r1_bio->sector); + struct serial_in_rdev *serial = &rdev->serial[idx]; + + if (WARN_ON(!mddev->serial_info_pool)) + return; + si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); + wait_event(serial->serial_io_wait, + check_and_add_serial(rdev, r1_bio, si, idx) == 0); +} + static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { struct serial_info *si; @@ -1342,7 +1353,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct raid1_plug_cb *plug = NULL; int first_clone; int max_sectors; - sector_t lo, hi; if (mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1370,8 +1380,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; - lo = r1_bio->sector; - hi = r1_bio->sector + r1_bio->sectors; if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); @@ -1488,8 +1496,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; struct md_rdev *rdev = conf->mirrors[i].rdev; - int idx = sector_to_idx(lo); - struct serial_in_rdev *serial = &rdev->serial[idx]; if (!r1_bio->bios[i]) continue; @@ -1518,14 +1524,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (r1_bio->behind_master_bio) { if (test_bit(CollisionCheck, &rdev->flags)) - wait_event(serial->serial_io_wait, - check_and_add_serial(rdev, lo, hi) - == 0); + wait_for_serialization(rdev, r1_bio); if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } else if (mddev->serialize_policy) - wait_event(serial->serial_io_wait, - check_and_add_serial(rdev, lo, hi) == 0); + wait_for_serialization(rdev, r1_bio); r1_bio->bios[i] = mbio; -- cgit From e8547d42095e58bee658f00fef8e33d2a185c927 Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Fri, 24 Jan 2020 01:01:26 +0800 Subject: bcache: cached_dev_free needs to put the sb page Same as cache device, the buffer page needs to be put while freeing cached_dev. Otherwise a page would be leaked every time a cached_dev is stopped. Signed-off-by: Liang Chen Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 77e9869345e7..a573ce1d85aa 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1275,6 +1275,9 @@ static void cached_dev_free(struct closure *cl) mutex_unlock(&bch_register_lock); + if (dc->sb_bio.bi_inline_vecs[0].bv_page) + put_page(bio_first_page_all(&dc->sb_bio)); + if (!IS_ERR_OR_NULL(dc->bdev)) blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -- cgit From a702a692cd7559053ea573f4e2c84828f0e62824 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:27 +0800 Subject: bcache: use a separate data structure for the on-disk super block Split out an on-disk version struct cache_sb with the proper endianness annotations. This fixes a fair chunk of sparse warnings, but there are some left due to the way the checksum is defined. 
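The pattern the split enables is the usual one: the on-disk struct keeps __le fields, and readers convert to host endianness exactly once, as read_super() does in the hunk below:

/* from the read_super() hunk below; s is a struct cache_sb_disk * */
sb->offset = le64_to_cpu(s->offset);
sb->version = le64_to_cpu(s->version);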
Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 6 +++--- include/uapi/linux/bcache.h | 51 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a573ce1d85aa..3045f27e0d67 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -63,14 +63,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, struct page **res) { const char *err; - struct cache_sb *s; + struct cache_sb_disk *s; struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); unsigned int i; if (!bh) return "IO error"; - s = (struct cache_sb *) bh->b_data; + s = (struct cache_sb_disk *)bh->b_data; sb->offset = le64_to_cpu(s->offset); sb->version = le64_to_cpu(s->version); @@ -209,7 +209,7 @@ static void write_bdev_super_endio(struct bio *bio) static void __write_super(struct cache_sb *sb, struct bio *bio) { - struct cache_sb *out = page_address(bio_first_page_all(bio)); + struct cache_sb_disk *out = page_address(bio_first_page_all(bio)); unsigned int i; bio->bi_iter.bi_sector = SB_SECTOR; diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 5d4f58e059fd..1d8b3a9fc080 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -156,6 +156,57 @@ static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) #define BDEV_DATA_START_DEFAULT 16 /* sectors */ +struct cache_sb_disk { + __le64 csum; + __le64 offset; /* sector where this sb was written */ + __le64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __le64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __le64 flags; + __le64 seq; + __le64 pad[8]; + + union { + struct { + /* Cache devices */ + __le64 nbuckets; /* device size */ + + __le16 block_size; /* sectors */ + __le16 bucket_size; /* sectors */ + + __le16 nr_in_set; + __le16 nr_this_dev; + }; + struct { + /* Backing devices */ + __le64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + __le32 last_mount; /* time overflow in y2106 */ + + __le16 first_bucket; + union { + __le16 njournal_buckets; + __le16 keys; + }; + __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + struct cache_sb { __u64 csum; __u64 offset; /* sector where this sb was written */ -- cgit From 50246693f81fe887f4db78bf7089051d7f1894cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:28 +0800 Subject: bcache: rework error unwinding in register_bcache Split the successful and error return path, and use one goto label for each resource to unwind. This also fixes some small errors like leaking the module reference count in the reboot case (which seems entirely harmless) or printing the wrong warning messages for early failures. 
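The shape being adopted is the kernel's one-label-per-resource unwinding idiom; a generic standalone sketch (illustrative only, not the driver code):

#include <stdlib.h>

/* Each acquisition gets an unwind label that releases exactly the
 * resources obtained before it, so every failure path frees just
 * what it must and nothing else. */
static int setup_two_resources(void **a, void **b)
{
        *a = malloc(64);
        if (!*a)
                goto out;
        *b = malloc(64);
        if (!*b)
                goto out_free_a;
        return 0;

out_free_a:
        free(*a);
        *a = NULL;
out:
        return -1;
}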
Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 75 ++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3045f27e0d67..e8013e1b0a14 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2375,29 +2375,33 @@ static bool bch_is_open(struct block_device *bdev) static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { - ssize_t ret = -EINVAL; - const char *err = "cannot allocate memory"; - char *path = NULL; - struct cache_sb *sb = NULL; + const char *err; + char *path; + struct cache_sb *sb; struct block_device *bdev = NULL; - struct page *sb_page = NULL; + struct page *sb_page; + ssize_t ret; + ret = -EBUSY; if (!try_module_get(THIS_MODULE)) - return -EBUSY; + goto out; /* For latest state of bcache_is_reboot */ smp_mb(); if (bcache_is_reboot) - return -EBUSY; + goto out_module_put; + ret = -ENOMEM; + err = "cannot allocate memory"; path = kstrndup(buffer, size, GFP_KERNEL); if (!path) - goto err; + goto out_module_put; sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL); if (!sb) - goto err; + goto out_free_path; + ret = -EINVAL; err = "failed to open device"; bdev = blkdev_get_by_path(strim(path), FMODE_READ|FMODE_WRITE|FMODE_EXCL, @@ -2414,57 +2418,68 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!IS_ERR(bdev)) bdput(bdev); if (attr == &ksysfs_register_quiet) - goto quiet_out; + goto done; } - goto err; + goto out_free_sb; } err = "failed to set blocksize"; if (set_blocksize(bdev, 4096)) - goto err_close; + goto out_blkdev_put; err = read_super(sb, bdev, &sb_page); if (err) - goto err_close; + goto out_blkdev_put; err = "failed to register device"; if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); if (!dc) - goto err_close; + goto out_put_sb_page; mutex_lock(&bch_register_lock); ret = register_bdev(sb, sb_page, bdev, dc); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ - if (ret < 0) - goto err; + if (ret < 0) { + bdev = NULL; + goto out_put_sb_page; + } } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) - goto err_close; + goto out_put_sb_page; /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_page, bdev, ca) != 0) - goto err; + if (register_cache(sb, sb_page, bdev, ca) != 0) { + bdev = NULL; + goto out_put_sb_page; + } } -quiet_out: - ret = size; -out: - if (sb_page) - put_page(sb_page); + + put_page(sb_page); +done: kfree(sb); kfree(path); module_put(THIS_MODULE); - return ret; - -err_close: - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -err: + return size; + +out_put_sb_page: + put_page(sb_page); +out_blkdev_put: + if (bdev) + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +out_free_sb: + kfree(sb); +out_free_path: + kfree(path); +out_module_put: + module_put(THIS_MODULE); +out: pr_info("error %s: %s", path, err); - goto out; + return ret; } -- cgit From 29cda393bcaad160c4bf3676ddd99855adafc72f Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:29 +0800 Subject: bcache: properly initialize 'path' and 'err' in register_bcache() The patch "bcache: rework error unwinding in register_bcache" from Christoph Hellwig leaves the local variables 'path' and 'err' in an undefined initial state.
If the code in register_bcache() jumps to the label 'out:' or 'out_module_put:' by goto, these two variables might be referenced with an undefined value by the following line, out_module_put: module_put(THIS_MODULE); out: pr_info("error %s: %s", path, err); return ret; Therefore this patch initializes these two local variables properly in register_bcache() to avoid this issue. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e8013e1b0a14..ee7c87f38d0d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2376,18 +2376,20 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { const char *err; - char *path; + char *path = NULL; struct cache_sb *sb; struct block_device *bdev = NULL; struct page *sb_page; ssize_t ret; ret = -EBUSY; + err = "failed to reference bcache module"; if (!try_module_get(THIS_MODULE)) goto out; /* For latest state of bcache_is_reboot */ smp_mb(); + err = "bcache is in reboot"; if (bcache_is_reboot) goto out_module_put; -- cgit From ae3cd299919af6eb670d5af0bc9d7ba14086bd8e Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:30 +0800 Subject: bcache: fix use-after-free in register_bcache() The patch "bcache: rework error unwinding in register_bcache" introduces a use-after-free regression in register_bcache(). Here is the current code, 2510 out_free_path: 2511 kfree(path); 2512 out_module_put: 2513 module_put(THIS_MODULE); 2514 out: 2515 pr_info("error %s: %s", path, err); 2516 return ret; If an error happens and the above code path is executed, path is released at line 2511 but then referenced at line 2515. KASAN then reports a use-after-free error. This patch changes line 2515 in the following way to fix the problem, 2515 pr_info("error %s: %s", path?path:"", err); Signed-off-by: Coly Li Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index ee7c87f38d0d..fad9c6cbee5e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2477,10 +2477,11 @@ out_free_sb: kfree(sb); out_free_path: kfree(path); + path = NULL; out_module_put: module_put(THIS_MODULE); out: - pr_info("error %s: %s", path, err); + pr_info("error %s: %s", path?path:"", err); return ret; } -- cgit From fc8f19cc5dce183cde4fecb9a1da07d81fa6ea8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:31 +0800 Subject: bcache: transfer the sb_page reference to register_{bdev,cache} Avoid an extra reference count roundtrip by transferring the sb_page ownership to the lower level register helpers.
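In sketch form, with a simplified helper rather than the real register_bdev()/register_cache() signatures: before the change the helper took its own reference on top of the caller's, and both sides eventually called put_page(); after it, the helper simply consumes the reference the caller already holds.

/* before: extra refcount roundtrip */
static void helper_store_sb_before(struct example_dev *d, struct page *sb_page)
{
	get_page(sb_page);	/* helper takes its own reference */
	d->sb_page = sb_page;	/* caller still does put_page() later */
}

/* after: ownership transferred to the helper */
static void helper_store_sb_after(struct example_dev *d, struct page *sb_page)
{
	d->sb_page = sb_page;	/* consumes the caller's reference;
				 * teardown calls put_page() once */
}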
Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fad9c6cbee5e..cf2cafef381f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1368,8 +1368,6 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page; - get_page(sb_page); - if (cached_dev_init(dc, sb->block_size << 9)) goto err; @@ -2275,7 +2273,6 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page; - get_page(sb_page); if (blk_queue_discard(bdev_get_queue(bdev))) ca->discard = CACHE_DISCARD(&ca->sb); @@ -2378,7 +2375,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *err; char *path = NULL; struct cache_sb *sb; - struct block_device *bdev = NULL; + struct block_device *bdev; struct page *sb_page; ssize_t ret; @@ -2444,10 +2441,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ret = register_bdev(sb, sb_page, bdev, dc); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ - if (ret < 0) { - bdev = NULL; - goto out_put_sb_page; - } + if (ret < 0) + goto out_free_sb; } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -2455,13 +2450,10 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_put_sb_page; /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_page, bdev, ca) != 0) { - bdev = NULL; - goto out_put_sb_page; - } + if (register_cache(sb, sb_page, bdev, ca) != 0) + goto out_free_sb; } - put_page(sb_page); done: kfree(sb); kfree(path); @@ -2471,8 +2463,7 @@ done: out_put_sb_page: put_page(sb_page); out_blkdev_put: - if (bdev) - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); out_free_sb: kfree(sb); out_free_path: -- cgit From cfa0c56db9c087227988f6af2c7b9888d24a3b16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:32 +0800 Subject: bcache: return a pointer to the on-disk sb from read_super Returning the properly typed actual data structure instead of the containing struct page will save the callers some work going forward.
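A sketch of the resulting calling convention, simplified from the diff below: the caller receives a typed pointer into the superblock page and, when it owns the reference, releases it through virt_to_page() instead of tracking a separate struct page.

struct cache_sb_disk *sb_disk;
const char *err;

err = read_super(sb, bdev, &sb_disk);
if (err)
	goto out_blkdev_put;

/* on-disk fields are used with their annotated types, e.g. sb_disk->csum */

put_page(virt_to_page(sb_disk));	/* drops the reference read_super() took */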
Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index cf2cafef381f..b30e4c6011d2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -60,7 +60,7 @@ struct workqueue_struct *bch_journal_wq; /* Superblock */ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, - struct page **res) + struct cache_sb_disk **res) { const char *err; struct cache_sb_disk *s; @@ -191,7 +191,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, err = NULL; get_page(bh->b_page); - *res = bh->b_page; + *res = s; err: put_bh(bh); return err; @@ -1353,7 +1353,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) /* Cached device - bcache superblock */ -static int register_bdev(struct cache_sb *sb, struct page *sb_page, +static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, struct block_device *bdev, struct cached_dev *dc) { @@ -1367,7 +1367,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, dc->bdev->bd_holder = dc; bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page; + bio_first_bvec_all(&dc->sb_bio)->bv_page = virt_to_page(sb_disk); if (cached_dev_init(dc, sb->block_size << 9)) goto err; @@ -2260,7 +2260,7 @@ err_free: return ret; } -static int register_cache(struct cache_sb *sb, struct page *sb_page, +static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, struct block_device *bdev, struct cache *ca) { const char *err = NULL; /* must be set for any error case */ @@ -2272,7 +2272,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, ca->bdev->bd_holder = ca; bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page; + bio_first_bvec_all(&ca->sb_bio)->bv_page = virt_to_page(sb_disk); if (blk_queue_discard(bdev_get_queue(bdev))) ca->discard = CACHE_DISCARD(&ca->sb); @@ -2375,8 +2375,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *err; char *path = NULL; struct cache_sb *sb; + struct cache_sb_disk *sb_disk; struct block_device *bdev; - struct page *sb_page; ssize_t ret; ret = -EBUSY; @@ -2426,7 +2426,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (set_blocksize(bdev, 4096)) goto out_blkdev_put; - err = read_super(sb, bdev, &sb_page); + err = read_super(sb, bdev, &sb_disk); if (err) goto out_blkdev_put; @@ -2438,7 +2438,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_put_sb_page; mutex_lock(&bch_register_lock); - ret = register_bdev(sb, sb_page, bdev, dc); + ret = register_bdev(sb, sb_disk, bdev, dc); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ if (ret < 0) @@ -2450,7 +2450,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_put_sb_page; /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_page, bdev, ca) != 0) + if (register_cache(sb, sb_disk, bdev, ca) != 0) goto out_free_sb; } @@ -2461,7 +2461,7 @@ done: return size; out_put_sb_page: - put_page(sb_page); + put_page(virt_to_page(sb_disk)); out_blkdev_put: blkdev_put(bdev, FMODE_READ | FMODE_WRITE | 
FMODE_EXCL); out_free_sb: -- cgit From 475389ae5c08656f8bb5cc3adc88a531c7c4877f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:33 +0800 Subject: bcache: store a pointer to the on-disk sb in the cache and cached_dev structures This allows the superblock bio, including the offset in the page, to be built properly using the normal bio helpers. This fixes writing the superblock for page sizes larger than 4k, where the sb write bio would need an offset in the bio_vec. Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 ++ drivers/md/bcache/super.c | 34 +++++++++++++++------------------- 2 files changed, 17 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 9198c1b480d9..adf26a21fcd1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -301,6 +301,7 @@ struct cached_dev { struct block_device *bdev; struct cache_sb sb; + struct cache_sb_disk *sb_disk; struct bio sb_bio; struct bio_vec sb_bv[1]; struct closure sb_write; @@ -403,6 +404,7 @@ enum alloc_reserve { struct cache { struct cache_set *set; struct cache_sb sb; + struct cache_sb_disk *sb_disk; struct bio sb_bio; struct bio_vec sb_bv[1]; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b30e4c6011d2..5797c03f993e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -207,15 +207,15 @@ static void write_bdev_super_endio(struct bio *bio) closure_put(&dc->sb_write); } -static void __write_super(struct cache_sb *sb, struct bio *bio) +static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, + struct bio *bio) { - struct cache_sb_disk *out = page_address(bio_first_page_all(bio)); unsigned int i; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META; bio->bi_iter.bi_sector = SB_SECTOR; - bio->bi_iter.bi_size = SB_SIZE; - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); - bch_bio_map(bio, NULL); + __bio_add_page(bio, virt_to_page(out), SB_SIZE, + offset_in_page(out)); out->offset = cpu_to_le64(sb->offset); out->version = cpu_to_le64(sb->version); @@ -257,14 +257,14 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) down(&dc->sb_write_mutex); closure_init(cl, parent); - bio_reset(bio); + bio_init(bio, dc->sb_bv, 1); bio_set_dev(bio, dc->bdev); bio->bi_end_io = write_bdev_super_endio; bio->bi_private = dc; closure_get(cl); /* I/O request sent to backing device */ - __write_super(&dc->sb, bio); + __write_super(&dc->sb, dc->sb_disk, bio); closure_return_with_destructor(cl, bch_write_bdev_super_unlock); } @@ -306,13 +306,13 @@ void bcache_write_super(struct cache_set *c) SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); - bio_reset(bio); + bio_init(bio, ca->sb_bv, 1); bio_set_dev(bio, ca->bdev); bio->bi_end_io = write_super_endio; bio->bi_private = ca; closure_get(cl); - __write_super(&ca->sb, bio); + __write_super(&ca->sb, ca->sb_disk, bio); } closure_return_with_destructor(cl, bcache_write_super_unlock); @@ -1275,8 +1275,8 @@ static void cached_dev_free(struct closure *cl) mutex_unlock(&bch_register_lock); - if (dc->sb_bio.bi_inline_vecs[0].bv_page) - put_page(bio_first_page_all(&dc->sb_bio)); + if (dc->sb_disk) + put_page(virt_to_page(dc->sb_disk)); if (!IS_ERR_OR_NULL(dc->bdev)) blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); @@ -1365,9 +1365,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, memcpy(&dc->sb, sb, sizeof(struct cache_sb));
dc->bdev = bdev; dc->bdev->bd_holder = dc; - - bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&dc->sb_bio)->bv_page = virt_to_page(sb_disk); + dc->sb_disk = sb_disk; if (cached_dev_init(dc, sb->block_size << 9)) goto err; @@ -2137,8 +2135,8 @@ void bch_cache_release(struct kobject *kobj) for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); - if (ca->sb_bio.bi_inline_vecs[0].bv_page) - put_page(bio_first_page_all(&ca->sb_bio)); + if (ca->sb_disk) + put_page(virt_to_page(ca->sb_disk)); if (!IS_ERR_OR_NULL(ca->bdev)) blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); @@ -2270,9 +2268,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; ca->bdev->bd_holder = ca; - - bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&ca->sb_bio)->bv_page = virt_to_page(sb_disk); + ca->sb_disk = sb_disk; if (blk_queue_discard(bdev_get_queue(bdev))) ca->discard = CACHE_DISCARD(&ca->sb); -- cgit From 6321bef028de43724c47cfa7f9dee69ecb783796 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:34 +0800 Subject: bcache: use read_cache_page_gfp to read the superblock Avoid a pointless dependency on buffer heads in bcache by simply open-coding the read of a single page. Also add a SB_OFFSET define for the byte offset of the superblock instead of using magic numbers. Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 16 +++++++--------- include/uapi/linux/bcache.h | 1 + 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 5797c03f993e..3dea1d5acd5c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -15,7 +15,6 @@ #include "writeback.h" #include -#include #include #include #include @@ -64,13 +63,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, { const char *err; struct cache_sb_disk *s; - struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); + struct page *page; unsigned int i; - if (!bh) + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); + if (IS_ERR(page)) return "IO error"; - - s = (struct cache_sb_disk *)bh->b_data; + s = page_address(page) + offset_in_page(SB_OFFSET); sb->offset = le64_to_cpu(s->offset); sb->version = le64_to_cpu(s->version); @@ -188,12 +188,10 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, } sb->last_mount = (u32)ktime_get_real_seconds(); - err = NULL; - - get_page(bh->b_page); *res = s; + return NULL; err: - put_bh(bh); + put_page(page); return err; } diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 1d8b3a9fc080..9a1965c6c3d0 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -148,6 +148,7 @@ static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) #define BCACHE_SB_MAX_VERSION 4 #define SB_SECTOR 8 +#define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT) #define SB_SIZE 4096 #define SB_LABEL_SIZE 32 #define SB_JOURNAL_BUCKETS 256U -- cgit From 7a0bc2a8966040d54289b64842d55e2cf4343ad9 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:36 +0800 Subject: bcache: add code comments for state->pool in __btree_sort() Explain that pages allocated from the mempool state->pool can be swapped in __btree_sort(), because state->pool is a page pool, which allocates pages
by alloc_pages(). Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index cffcdc9feefb..4385303836d8 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1257,6 +1257,11 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, * Our temporary buffer is the same size as the btree node's * buffer, we can just swap buffers instead of doing a big * memcpy() + * + * Don't worry even if 'out' is allocated from the mempool, it + * can still be swapped here, because state->pool is a page + * mempool created by mempool_init_page_pool(), which allocates + * pages with alloc_pages(). */ out->magic = b->set->data->magic; -- cgit From 2aa8c529387c25606fdc1484154b92f8bfbc5746 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:37 +0800 Subject: bcache: avoid unnecessary btree nodes flushing in btree_flush_write() Commit 91be66e1318f ("bcache: performance improvement for btree_flush_write()") was an effort to flush the btree node holding the oldest journal entry faster, in the following ways, - Only iterate dirty btree nodes in c->btree_cache, avoiding scanning a lot of clean btree nodes. - Take c->btree_cache as a LRU-like list, aggressively flushing all dirty nodes from the tail of c->btree_cache until the btree node with the oldest journal entry is flushed. This is to reduce the time c->bucket_lock is held. Guoju Fang and Shuang Li reported that they observed unexpected extra write I/Os on the cache device after applying the above patch. Guoju Fang provided more detailed diagnostic information showing that the aggressive btree node flushing may cause 10x more btree nodes to flush in his workload. He pointed out that when system memory is large enough to hold all btree nodes in memory, c->btree_cache is not a LRU-like list any more. Then the btree node with the oldest journal entry is very probably not close to the tail of the c->btree_cache list. In such a situation many more dirty btree nodes will be aggressively flushed before the target node is flushed. When a slow SATA SSD is used as the cache device, such over-aggressive flushing behavior will cause a performance regression. After spending a lot of time on debugging and diagnosis, I found the real situation is more complicated, and aggressively flushing dirty btree nodes from the tail of the c->btree_cache list is not a good solution. - When all btree nodes are cached in memory, c->btree_cache is not a LRU-like list, and the btree node with the oldest journal entry won't be close to the tail of the list. - There can be hundreds of dirty btree nodes referencing the oldest journal entry; before all of them are flushed, the oldest journal entry cannot be reclaimed. When the above two conditions are mixed together, simply flushing from the tail of the c->btree_cache list is really NOT a good idea. Fortunately there is still a chance to make btree_flush_write() work better. Here is how this patch avoids unnecessary btree node flushing, - Only acquire c->journal.lock when getting the oldest journal entry of the fifo c->journal.pin. In the remaining locations, check the journal entries locklessly, so their values can change on other cores in parallel. - In the list_for_each_entry_safe_reverse() loop, check the latest front pointer of the fifo c->journal.pin. If it is different from the original pointer, which we fetched while holding c->journal.lock, the oldest journal entry has been reclaimed on another core.
At this moment, all the selected dirty nodes recorded in the array btree_nodes[] have been flushed and are clean on other CPU cores, so it is unnecessary to iterate c->btree_cache any longer. Just quit the list_for_each_entry_safe_reverse() loop; the following for-loop will skip all the selected clean nodes. - Find a proper time to quit the list_for_each_entry_safe_reverse() loop. Check the refcount of the original fifo front pointer: if the value is larger than the number of nodes selected into btree_nodes[], more matching btree nodes should be scanned. Otherwise there are no more matching btree nodes in the rest of the c->btree_cache list and the loop can quit. If the original oldest journal entry is reclaimed and the fifo front pointer is updated, the refcount of the original fifo front pointer will be 0, and the loop quits too. - Do not hold c->bucket_lock for too long. c->bucket_lock is also required for space allocation for cached data, and holding it too long will block regular I/O requests. When iterating the list c->btree_cache, even if there are many matching btree nodes, in order not to hold c->bucket_lock for too long, only BTREE_FLUSH_NR nodes at most are selected to flush in the following for-loop. With this patch, only btree nodes referencing the oldest journal entry are flushed to the cache device; there is no more aggressive flushing of unnecessary btree nodes. And to avoid blocking regular I/O requests, each time btree_flush_write() is called, at most BTREE_FLUSH_NR btree nodes are selected to flush, even if there are more matching btree nodes in the list c->btree_cache. At last, one more thing to explain: why is it safe to read the front pointer of c->journal.pin without holding c->journal.lock inside the list_for_each_entry_safe_reverse() loop? Here is my answer: when reading the front pointer of the fifo c->journal.pin, we don't need to know its exact value; we just want to check whether it differs from the original front pointer (which is an accurate value because we fetched it while c->journal.lock was held). For this purpose, it works as expected without holding c->journal.lock. Even if the front pointer is changed on another CPU core and not yet visible to the local core, and the btree node currently being iterated holds a journal entry identical to the originally fetched fifo front pointer, it is still safe: after taking the mutex b->write_lock (with its memory barrier), this btree node will be found clean and skipped, and the loop will quit later when it iterates the next node of the list c->btree_cache.
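Distilled to its core, the pattern described above looks like this (simplified from the full btree_flush_write() in the diff below): snapshot the fifo front pointer once under c->journal.lock, then compare against it locklessly inside the scan loop and stop early as soon as it moves or enough nodes have been recorded.

spin_lock(&c->journal.lock);
fifo_front_p = &fifo_front(&c->journal.pin);
ref_nr = atomic_read(fifo_front_p);
spin_unlock(&c->journal.lock);

mutex_lock(&c->bucket_lock);
list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
	/* lockless: only a change of the pointer matters */
	if (&fifo_front(&c->journal.pin) != fifo_front_p)
		break;		/* oldest entry reclaimed elsewhere */
	if (nr >= atomic_read(fifo_front_p))
		break;		/* all matching nodes already recorded */
	/* ... select the node into btree_nodes[nr++] ... */
}
mutex_unlock(&c->bucket_lock);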
Fixes: 91be66e1318f ("bcache: performance improvement for btree_flush_write()") Reported-by: Guoju Fang Reported-by: Shuang Li Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 80 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index be2a2a201603..33ddc5269e8d 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -417,10 +417,14 @@ err: /* Journalling */ +#define nr_to_fifo_front(p, front_p, mask) (((p) - (front_p)) & (mask)) + static void btree_flush_write(struct cache_set *c) { struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; - unsigned int i, n; + unsigned int i, nr, ref_nr; + atomic_t *fifo_front_p, *now_fifo_front_p; + size_t mask; if (c->journal.btree_flushing) return; @@ -433,12 +437,50 @@ static void btree_flush_write(struct cache_set *c) c->journal.btree_flushing = true; spin_unlock(&c->journal.flush_write_lock); + /* get the oldest journal entry and check its refcount */ + spin_lock(&c->journal.lock); + fifo_front_p = &fifo_front(&c->journal.pin); + ref_nr = atomic_read(fifo_front_p); + if (ref_nr <= 0) { + /* + * do nothing if no btree node references + * the oldest journal entry + */ + spin_unlock(&c->journal.lock); + goto out; + } + spin_unlock(&c->journal.lock); + + mask = c->journal.pin.mask; + nr = 0; atomic_long_inc(&c->flush_write); memset(btree_nodes, 0, sizeof(btree_nodes)); - n = 0; mutex_lock(&c->bucket_lock); list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + /* + * It is safe to get now_fifo_front_p without holding + * c->journal.lock here, because we don't need to know + * the exact value, just check whether the front + * pointer of c->journal.pin has changed. + */ + now_fifo_front_p = &fifo_front(&c->journal.pin); + /* + * If the oldest journal entry is reclaimed and front + * pointer of c->journal.pin changes, it is unnecessary + * to scan c->btree_cache anymore, just quit the loop and + * flush out what we have already. + */ + if (now_fifo_front_p != fifo_front_p) + break; + /* + * quit this loop if all matching btree nodes are + * scanned and recorded in btree_nodes[] already. + */ + ref_nr = atomic_read(fifo_front_p); + if (nr >= ref_nr) + break; + if (btree_node_journal_flush(b)) pr_err("BUG: flush_write bit should not be set here!"); @@ -454,17 +496,44 @@ static void btree_flush_write(struct cache_set *c) continue; } + /* + * Only select the btree node which exactly references + * the oldest journal entry. + * + * If the journal entry pointed to by fifo_front_p is + * reclaimed in parallel, don't worry: + * - the list_for_each_xxx loop will quit when checking + * next now_fifo_front_p. + * - If there are matched nodes recorded in btree_nodes[], + * they are clean now (this is why and how the oldest + * journal entry can be reclaimed). These selected nodes + * will be ignored and skipped in the following for-loop. + */ + if (nr_to_fifo_front(btree_current_write(b)->journal, + fifo_front_p, + mask) != 0) { + mutex_unlock(&b->write_lock); + continue; + } + set_btree_node_journal_flush(b); mutex_unlock(&b->write_lock); - btree_nodes[n++] = b; + btree_nodes[nr++] = b; + /* + * To avoid holding c->bucket_lock for too long, + * only scan for BTREE_FLUSH_NR matched btree nodes + * at most.
If there are more btree nodes referencing + * the oldest journal entry, try to flush them next + * time when btree_flush_write() is called. + */ + if (nr == BTREE_FLUSH_NR) break; } mutex_unlock(&c->bucket_lock); - for (i = 0; i < n; i++) { + for (i = 0; i < nr; i++) { b = btree_nodes[i]; if (!b) { pr_err("BUG: btree_nodes[%d] is NULL", i); @@ -497,6 +566,7 @@ static void btree_flush_write(struct cache_set *c) mutex_unlock(&b->write_lock); } +out: spin_lock(&c->journal.flush_write_lock); c->journal.btree_flushing = false; spin_unlock(&c->journal.flush_write_lock); -- cgit From 125d98edd11464c8e0ec9eaaba7d682d0f832686 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:40 +0800 Subject: bcache: remove member accessed from struct btree The member 'accessed' of struct btree is used in bch_mca_scan() when shrinking btree node caches. The original idea is, if b->accessed is set, clear it and look at the next btree node cache in the c->btree_cache list, and only shrink the caches whose b->accessed is cleared. Then only cold btree node caches will be shrunk. But when I/O pressure is high, it is very probable that b->accessed of a btree node cache will be set again in bch_btree_node_get() before bch_mca_scan() selects it again. Then there is no chance for bch_mca_scan() to shrink enough memory back to the slub or slab system. This patch removes the member accessed from struct btree; once a btree node cache is selected, it is immediately shrunk. With this change, bch_mca_scan() may release btree node caches more efficiently. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 8 ++------ drivers/md/bcache/btree.h | 2 -- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 14d6c33b0957..357535a5c89c 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -754,14 +754,12 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, b = list_first_entry(&c->btree_cache, struct btree, list); list_rotate_left(&c->btree_cache); - if (!b->accessed && - !mca_reap(b, 0, false)) { + if (!mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); freed++; - } else - b->accessed = 0; + } } out: mutex_unlock(&c->bucket_lock); @@ -1069,7 +1067,6 @@ retry: BUG_ON(!b->written); b->parent = parent; - b->accessed = 1; for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { prefetch(b->keys.set[i].tree); @@ -1160,7 +1157,6 @@ retry: goto retry; } - b->accessed = 1; b->parent = parent; bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 76cfd121a486..f4dcca449391 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -121,8 +121,6 @@ struct btree { /* Key/pointer for this btree node */ BKEY_PADDED(key); - /* Single bit - set when accessed, cleared by shrinker */ - unsigned long accessed; unsigned long seq; struct rw_semaphore lock; struct cache_set *c; -- cgit From d5c9c470b01177e4d90cdbf178b8c7f37f5b8795 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:41 +0800 Subject: bcache: reap c->btree_cache_freeable from the tail in bch_mca_scan() In order to skip the most recently freed btree node caches, bch_mca_scan() currently skips the first 3 caches in the c->btree_cache_freeable list when shrinking bcache node caches.
The related code in bch_mca_scan() is, 737 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { 738 if (nr <= 0) 739 goto out; 740 741 if (++i > 3 && 742 !mca_reap(b, 0, false)) { lines free cache memory 746 } 747 nr--; 748 } The problem is that if the virtual memory code calls bch_mca_scan() and the calculated 'nr' is 1 or 2, then in the above loop nothing will be shrunk. In such a case, if the slub/slab manager calls bch_mca_scan() many times with a small scan number, it does not help to shrink cache memory and just wastes CPU cycles. This patch just selects btree node caches from the tail of the c->btree_cache_freeable list, so the newly freed host cache can still be allocated by mca_alloc(), and at least 1 node can be shrunk. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 357535a5c89c..c3a314deb09d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -734,17 +734,17 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, i = 0; btree_cache_used = c->btree_cache_used; - list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { + list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) { if (nr <= 0) goto out; - if (++i > 3 && - !mca_reap(b, 0, false)) { + if (!mca_reap(b, 0, false)) { mca_data_free(b); rw_unlock(true, b); freed++; } nr--; + i++; } for (; (nr--) && i < btree_cache_used; i++) { -- cgit From e3de04469a49ee09c89e80bf821508df458ccee6 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:42 +0800 Subject: bcache: reap from tail of c->btree_cache in bch_mca_scan() When shrinking btree node caches from c->btree_cache in bch_mca_scan(), the selected node is rotated from the head to the tail of the c->btree_cache list whether or not it is reaped. But in the bcache journal code, when flushing the btree nodes with the oldest journal entry, btree nodes are iterated and selected from the tail of the c->btree_cache list in btree_flush_write(). The list_rotate_left() in bch_mca_scan() will make btree_flush_write() iterate more nodes in the c->btree_cache list in reverse order. This patch just reaps the selected btree node cache and does not move it from the head to the tail of the c->btree_cache list. Then bch_mca_scan() will not mess up the c->btree_cache list for btree_flush_write(). Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index c3a314deb09d..fa872df4e770 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -747,19 +747,19 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, i++; } - for (; (nr--) && i < btree_cache_used; i++) { - if (list_empty(&c->btree_cache)) + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + if (nr <= 0 || i >= btree_cache_used) goto out; - b = list_first_entry(&c->btree_cache, struct btree, list); - list_rotate_left(&c->btree_cache); - if (!mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); freed++; } + + nr--; + i++; } out: mutex_unlock(&c->bucket_lock); -- cgit