From 55180498dfd5f3c7e2d2c0e470f7cede1acee248 Mon Sep 17 00:00:00 2001 From: Zhiqiang Liu Date: Sat, 7 Dec 2019 11:00:08 +0800 Subject: md-bitmap: small cleanups In md_bitmap_unplug, bitmap->storage.filemap is double checked. In md_bitmap_daemon_work, bitmap->storage.filemap should be checked before reference. Signed-off-by: Zhiqiang Liu Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 3ad18246fcb3..9860062bdc1e 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1019,8 +1019,6 @@ void md_bitmap_unplug(struct bitmap *bitmap) /* look at each page to see if there are any set bits that need to be * flushed out to disk */ for (i = 0; i < bitmap->storage.file_pages; i++) { - if (!bitmap->storage.filemap) - return; dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); need_write = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); @@ -1338,7 +1336,8 @@ void md_bitmap_daemon_work(struct mddev *mddev) BITMAP_PAGE_DIRTY)) /* bitmap_unplug will handle the rest */ break; - if (test_and_clear_page_attr(bitmap, j, + if (bitmap->storage.filemap && + test_and_clear_page_attr(bitmap, j, BITMAP_PAGE_NEEDWRITE)) { write_page(bitmap, bitmap->storage.filemap[j], 0); } -- cgit From d2c9ad41249ac862d3a3a4d5d56e6b1cd79d8a17 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 20 Dec 2019 15:46:29 +0100 Subject: raid5: remove worker_cnt_per_group argument from alloc_thread_groups We can use "cnt" directly to update conf->worker_cnt_per_group if alloc_thread_groups returns 0. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d4d3b67ffbba..ba00e9877f02 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6598,7 +6598,6 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, - int *worker_cnt_per_group, struct r5worker_group **worker_groups); static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) @@ -6607,7 +6606,7 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) unsigned int new; int err; struct r5worker_group *new_groups, *old_groups; - int group_cnt, worker_cnt_per_group; + int group_cnt; if (len >= PAGE_SIZE) return -EINVAL; @@ -6630,13 +6629,11 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (old_groups) flush_workqueue(raid5_wq); - err = alloc_thread_groups(conf, new, - &group_cnt, &worker_cnt_per_group, - &new_groups); + err = alloc_thread_groups(conf, new, &group_cnt, &new_groups); if (!err) { spin_lock_irq(&conf->device_lock); conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = new; conf->worker_groups = new_groups; spin_unlock_irq(&conf->device_lock); @@ -6672,16 +6669,13 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int alloc_thread_groups(struct r5conf *conf, int cnt, - int *group_cnt, - int *worker_cnt_per_group, +static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, struct r5worker_group **worker_groups) { int i, j, k; ssize_t size; struct r5worker *workers; - *worker_cnt_per_group = cnt; if (cnt == 0) { 
*group_cnt = 0; *worker_groups = NULL; @@ -6882,7 +6876,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) struct disk_info *disk; char pers_name[6]; int i; - int group_cnt, worker_cnt_per_group; + int group_cnt; struct r5worker_group *new_group; int ret; @@ -6928,10 +6922,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) for (i = 0; i < PENDING_IO_MAX; i++) list_add(&conf->pending_data[i].sibling, &conf->free_list); /* Don't enable multi-threading by default*/ - if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, - &new_group)) { + if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) { conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = 0; conf->worker_groups = new_group; } else goto abort; -- cgit From 404659cf1e2570dad3cd117fa3bd71f06ecfd142 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:53 +0100 Subject: md: rename wb stuff Previously, the wb_info_pool and wb_list machinery was introduced to address a potential data inconsistency issue for write-behind devices. Now rename it with serialization-related names, since the same mechanism will be used to address the reordered overlapping write issue for raid1. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 20 +++++++-------- drivers/md/md.c | 70 ++++++++++++++++++++++++++------------------------ drivers/md/md.h | 24 ++++++++--------- drivers/md/raid1.c | 43 ++++++++++++++++--------------- 4 files changed, 80 insertions(+), 77 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 9860062bdc1e..212e75dfebb7 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1789,8 +1789,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1907,7 +1907,7 @@ int md_bitmap_load(struct mddev *mddev) goto out; rdev_for_each(rdev, mddev) - mddev_create_wb_pool(mddev, rdev, true); + mddev_create_serial_pool(mddev, rdev, true); if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); @@ -2474,16 +2474,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; mddev->bitmap_info.max_write_behind = backlog; - if (!backlog && mddev->wb_info_pool) { - /* wb_info_pool is not needed if backlog is zero */ - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; - } else if (backlog && !mddev->wb_info_pool) { - /* wb_info_pool is needed since backlog is not zero */ + if (!backlog && mddev->serial_info_pool) { + /* serial_info_pool is not needed if backlog is zero */ + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } else if (backlog && !mddev->serial_info_pool) { + /* serial_info_pool is needed since backlog is not zero */ struct md_rdev *rdev; rdev_for_each(rdev, mddev) - mddev_create_wb_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, false); } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); diff --git a/drivers/md/md.c b/drivers/md/md.c index 4e7c9f398bc6..ea37bfacb6fb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -125,72 +125,74 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } -static int
rdev_init_wb(struct md_rdev *rdev) +static int rdev_init_serial(struct md_rdev *rdev) { if (rdev->bdev->bd_queue->nr_hw_queues == 1) return 0; - spin_lock_init(&rdev->wb_list_lock); - INIT_LIST_HEAD(&rdev->wb_list); - init_waitqueue_head(&rdev->wb_io_wait); - set_bit(WBCollisionCheck, &rdev->flags); + spin_lock_init(&rdev->serial_list_lock); + INIT_LIST_HEAD(&rdev->serial_list); + init_waitqueue_head(&rdev->serial_io_wait); + set_bit(CollisionCheck, &rdev->flags); return 1; } /* - * Create wb_info_pool if rdev is the first multi-queue device flaged + * Create serial_info_pool if rdev is the first multi-queue device flaged * with writemostly, also write-behind mode is enabled. */ -void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, +void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) { if (mddev->bitmap_info.max_write_behind == 0) return; - if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev)) + if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_serial(rdev)) return; - if (mddev->wb_info_pool == NULL) { + if (mddev->serial_info_pool == NULL) { unsigned int noio_flag; if (!is_suspend) mddev_suspend(mddev); noio_flag = memalloc_noio_save(); - mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS, - sizeof(struct wb_info)); + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); memalloc_noio_restore(noio_flag); - if (!mddev->wb_info_pool) - pr_err("can't alloc memory pool for writemostly\n"); + if (!mddev->serial_info_pool) + pr_err("can't alloc memory pool for serialization\n"); if (!is_suspend) mddev_resume(mddev); } } -EXPORT_SYMBOL_GPL(mddev_create_wb_pool); +EXPORT_SYMBOL_GPL(mddev_create_serial_pool); /* - * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck. + * Destroy serial_info_pool if rdev is the last device flaged with + * CollisionCheck. */ -static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev) +static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { - if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags)) + if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) return; - if (mddev->wb_info_pool) { + if (mddev->serial_info_pool) { struct md_rdev *temp; int num = 0; /* - * Check if other rdevs need wb_info_pool. + * Check if other rdevs need serial_info_pool. 
*/ rdev_for_each(temp, mddev) if (temp != rdev && - test_bit(WBCollisionCheck, &temp->flags)) + test_bit(CollisionCheck, &temp->flags)) num++; if (!num) { mddev_suspend(rdev->mddev); - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; mddev_resume(rdev->mddev); } } @@ -2337,7 +2339,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) - mddev_create_wb_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, false); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2375,7 +2377,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); - mddev_destroy_wb_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2888,10 +2890,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); - mddev_create_wb_pool(rdev->mddev, rdev, false); + mddev_create_serial_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_wb_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { @@ -5773,14 +5775,14 @@ int md_run(struct mddev *mddev) rdev_for_each(rdev, mddev) { if (test_bit(WriteMostly, &rdev->flags) && - rdev_init_wb(rdev)) + rdev_init_serial(rdev)) creat_pool = true; } - if (creat_pool && mddev->wb_info_pool == NULL) { - mddev->wb_info_pool = - mempool_create_kmalloc_pool(NR_WB_INFOS, - sizeof(struct wb_info)); - if (!mddev->wb_info_pool) { + if (creat_pool && mddev->serial_info_pool == NULL) { + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); + if (!mddev->serial_info_pool) { err = -ENOMEM; goto bitmap_abort; } @@ -6025,8 +6027,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index 5f86f8adb0a4..7b811645cec7 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -111,11 +111,11 @@ struct md_rdev { */ /* - * The members for check collision of write behind IOs. + * The members for check collision of write IOs. */ - struct list_head wb_list; - spinlock_t wb_list_lock; - wait_queue_head_t wb_io_wait; + struct list_head serial_list; + spinlock_t serial_list_lock; + wait_queue_head_t serial_io_wait; struct work_struct del_work; /* used for delayed sysfs removal */ @@ -201,9 +201,9 @@ enum flag_bits { * it didn't fail, so don't use FailFast * any more for metadata */ - WBCollisionCheck, /* - * multiqueue device should check if there - * is collision between write behind bios. + CollisionCheck, /* + * check if there is collision between raid1 + * serial bios. 
*/ }; @@ -263,9 +263,9 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; -#define NR_WB_INFOS 8 -/* record current range of write behind IOs */ -struct wb_info { +#define NR_SERIAL_INFOS 8 +/* record current range of serialize IOs */ +struct serial_info { sector_t lo; sector_t hi; struct list_head list; @@ -487,7 +487,7 @@ struct mddev { */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ - mempool_t *wb_info_pool; + mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ @@ -737,7 +737,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); -extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, +extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 201fd8aec59a..0439f674ab14 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -50,17 +50,17 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" -static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct wb_info *wi, *temp_wi; + struct serial_info *wi, *temp_wi; unsigned long flags; int ret = 0; struct mddev *mddev = rdev->mddev; - wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO); + wi = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - spin_lock_irqsave(&rdev->wb_list_lock, flags); - list_for_each_entry(temp_wi, &rdev->wb_list, list) { + spin_lock_irqsave(&rdev->serial_list_lock, flags); + list_for_each_entry(temp_wi, &rdev->serial_list, list) { /* collision happened */ if (hi > temp_wi->lo && lo < temp_wi->hi) { ret = -EBUSY; @@ -71,34 +71,34 @@ static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) if (!ret) { wi->lo = lo; wi->hi = hi; - list_add(&wi->list, &rdev->wb_list); + list_add(&wi->list, &rdev->serial_list); } else - mempool_free(wi, mddev->wb_info_pool); - spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + mempool_free(wi, mddev->serial_info_pool); + spin_unlock_irqrestore(&rdev->serial_list_lock, flags); return ret; } -static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct wb_info *wi; + struct serial_info *wi; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; - spin_lock_irqsave(&rdev->wb_list_lock, flags); - list_for_each_entry(wi, &rdev->wb_list, list) + spin_lock_irqsave(&rdev->serial_list_lock, flags); + list_for_each_entry(wi, &rdev->serial_list, list) if (hi == wi->hi && lo == wi->lo) { list_del(&wi->list); - mempool_free(wi, mddev->wb_info_pool); + mempool_free(wi, mddev->serial_info_pool); found = 1; break; } if (!found) - WARN(1, "The write behind IO is not recorded\n"); - spin_unlock_irqrestore(&rdev->wb_list_lock, flags); - wake_up(&rdev->wb_io_wait); + WARN(1, "The write IO is not recorded for serialization\n"); + 
spin_unlock_irqrestore(&rdev->serial_list_lock, flags); + wake_up(&rdev->serial_io_wait); } /* @@ -499,11 +499,11 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - if (test_bit(WBCollisionCheck, &rdev->flags)) { + if (test_bit(CollisionCheck, &rdev->flags)) { sector_t lo = r1_bio->sector; sector_t hi = r1_bio->sector + r1_bio->sectors; - remove_wb(rdev, lo, hi); + remove_serial(rdev, lo, hi); } if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -1508,12 +1508,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (r1_bio->behind_master_bio) { struct md_rdev *rdev = conf->mirrors[i].rdev; - if (test_bit(WBCollisionCheck, &rdev->flags)) { + if (test_bit(CollisionCheck, &rdev->flags)) { sector_t lo = r1_bio->sector; sector_t hi = r1_bio->sector + r1_bio->sectors; - wait_event(rdev->wb_io_wait, - check_and_add_wb(rdev, lo, hi) == 0); + wait_event(rdev->serial_io_wait, + check_and_add_serial(rdev, lo, hi) + == 0); } if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); -- cgit From 3e173ab55b990d2b4ceb90bf55a88a96eb88598e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:54 +0100 Subject: md: fix a typo s/creat/create It actually means create here, so fix the typo. Reported-by: Song Liu Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index ea37bfacb6fb..8f5def0cb60a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5771,14 +5771,14 @@ int md_run(struct mddev *mddev) goto bitmap_abort; if (mddev->bitmap_info.max_write_behind > 0) { - bool creat_pool = false; + bool create_pool = false; rdev_for_each(rdev, mddev) { if (test_bit(WriteMostly, &rdev->flags) && rdev_init_serial(rdev)) - creat_pool = true; + create_pool = true; } - if (creat_pool && mddev->serial_info_pool == NULL) { + if (create_pool && mddev->serial_info_pool == NULL) { mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); -- cgit From 11d3a9f65018c9fb3d4f2032aec76af2ba98431c Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:55 +0100 Subject: md: prepare for enabling raid1 io serialization 1. The related resources (spin_lock, list and waitqueue) are needed to address the raid1 reordered-overlap issue too; in this case, rdev is set to NULL for mddev_create/destroy_serial_pool, which implies that all rdevs need to handle these resources. Also add "is_suspend" to mddev_destroy_serial_pool, since it will be called in a suspended situation; this also makes create and destroy pool take the same arguments. 2. Introduce rdevs_init_serial, which is called if raid1 io serialization is enabled, since all rdevs need to init the related resources. 3. rdev_init_serial and clear_bit(CollisionCheck, &rdev->flags) should be called between suspend and resume. There is no need to export mddev_create_serial_pool since it is only called in the md-mod module.
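As an aside, a minimal sketch of how the NULL-rdev convention is expected to be used by callers (hypothetical call sites for illustration; the serialize_policy patch later in this series uses exactly this shape):

/* Enable serialization for a single write-mostly rdev; the array is
 * not currently suspended, so the helper suspends/resumes itself. */
mddev_create_serial_pool(mddev, rdev, false);

/* Enable serialization for every rdev in the array; the caller has
 * already suspended the array, hence is_suspend == true. */
mddev_create_serial_pool(mddev, NULL, true);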
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 65 +++++++++++++++++++++++++++++++++++++++------------------ drivers/md/md.h | 2 +- 2 files changed, 46 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8f5def0cb60a..b9b041b7e196 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -127,9 +127,6 @@ static inline int speed_max(struct mddev *mddev) static int rdev_init_serial(struct md_rdev *rdev) { - if (rdev->bdev->bd_queue->nr_hw_queues == 1) - return 0; - spin_lock_init(&rdev->serial_list_lock); INIT_LIST_HEAD(&rdev->serial_list); init_waitqueue_head(&rdev->serial_io_wait); @@ -138,17 +135,29 @@ static int rdev_init_serial(struct md_rdev *rdev) return 1; } +static void rdevs_init_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + if (test_bit(CollisionCheck, &rdev->flags)) + continue; + rdev_init_serial(rdev); + } +} + /* - * Create serial_info_pool if rdev is the first multi-queue device flaged - * with writemostly, also write-behind mode is enabled. + * Create serial_info_pool for raid1 under conditions: + * 1. rdev is the first multi-queue device flaged with writemostly, + * also write-behind mode is enabled. + * 2. rdev is NULL, means want to enable serialization for all rdevs. */ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) + bool is_suspend) { - if (mddev->bitmap_info.max_write_behind == 0) - return; - - if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_serial(rdev)) + if (rdev && (mddev->bitmap_info.max_write_behind == 0 || + rdev->bdev->bd_queue->nr_hw_queues == 1 || + !test_bit(WriteMostly, &rdev->flags))) return; if (mddev->serial_info_pool == NULL) { @@ -156,6 +165,10 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, if (!is_suspend) mddev_suspend(mddev); + if (!rdev) + rdevs_init_serial(mddev); + else + rdev_init_serial(rdev); noio_flag = memalloc_noio_save(); mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, @@ -167,15 +180,16 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mddev_resume(mddev); } } -EXPORT_SYMBOL_GPL(mddev_create_serial_pool); /* * Destroy serial_info_pool if rdev is the last device flaged with - * CollisionCheck. + * CollisionCheck, or rdev is NULL when we disable serialization + * for normal raid1. */ -static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) +static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) { - if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) + if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; if (mddev->serial_info_pool) { @@ -185,16 +199,27 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) /* * Check if other rdevs need serial_info_pool. 
*/ - rdev_for_each(temp, mddev) + if (!is_suspend) + mddev_suspend(mddev); + rdev_for_each(temp, mddev) { + if (!rdev) { + clear_bit(CollisionCheck, &temp->flags); + continue; + } + if (temp != rdev && test_bit(CollisionCheck, &temp->flags)) num++; - if (!num) { - mddev_suspend(rdev->mddev); + } + + if (rdev) + clear_bit(CollisionCheck, &rdev->flags); + if (!rdev || !num) { mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; - mddev_resume(rdev->mddev); } + if (!is_suspend) + mddev_resume(mddev); } } @@ -2377,7 +2402,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); - mddev_destroy_serial_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev, false); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2893,7 +2918,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) mddev_create_serial_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_serial_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev, false); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 7b811645cec7..de04a8d3a67a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -738,7 +738,7 @@ extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend); + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); -- cgit From 3938f5fb82aedbf39792ffee448c61c819e6ab38 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:56 +0100 Subject: md: add serialize_policy sysfs node for raid1 With the new sysfs node, we can use it to control if raid1 array wants io serialization or not. So mddev_create_serial_pool and mddev_destroy_serial_pool are called in serialize_policy_store to enable or disable the serialization. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/md.h | 1 + 2 files changed, 53 insertions(+) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index b9b041b7e196..796cf70e1c9f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5304,6 +5304,57 @@ static struct md_sysfs_entry md_fail_last_dev = __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, fail_last_dev_store); +static ssize_t serialize_policy_show(struct mddev *mddev, char *page) +{ + if (mddev->pers == NULL || (mddev->pers->level != 1)) + return sprintf(page, "n/a\n"); + else + return sprintf(page, "%d\n", mddev->serialize_policy); +} + +/* + * Setting serialize_policy to true to enforce write IO is not reordered + * for raid1. 
+ */ +static ssize_t +serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return err; + + if (value == mddev->serialize_policy) + return len; + + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->pers == NULL || (mddev->pers->level != 1)) { + pr_err("md: serialize_policy is only effective for raid1\n"); + err = -EINVAL; + goto unlock; + } + + mddev_suspend(mddev); + if (value) + mddev_create_serial_pool(mddev, NULL, true); + else + mddev_destroy_serial_pool(mddev, NULL, true); + mddev->serialize_policy = value; + mddev_resume(mddev); +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_serialize_policy = +__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, + serialize_policy_store); + + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, @@ -5321,6 +5372,7 @@ static struct attribute *md_default_attrs[] = { &max_corr_read_errors.attr, &md_consistency_policy.attr, &md_fail_last_dev.attr, + &md_serialize_policy.attr, NULL, }; diff --git a/drivers/md/md.h b/drivers/md/md.h index de04a8d3a67a..f51a3afaee1b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -494,6 +494,7 @@ struct mddev { bool has_superblocks:1; bool fail_last_dev:1; + bool serialize_policy:1; }; enum recovery_flags { -- cgit From de31ee949739aba9ce7dbb8b10e72c6fce0e76c7 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:57 +0100 Subject: md: reorganize mddev_create/destroy_serial_pool So far, IO serialization is used in two scenarios: 1. raid1 which enables write-behind mode and has an rdev in the array which is a multi-queue device flagged with writemostly. 2. IO serialization is enabled or disabled by changing serialize_policy. So introduce rdev_need_serial to check for the first scenario. For 1, IO serialization is enabled automatically, while 2 is controlled manually. It is also possible that both scenarios are true, so when creating the serial pool, rdev/rdevs_init_serial should be separated from the check of whether the pool already exists. Then, when destroying the pool, we need to check whether the pool is still needed by other rdevs due to the first scenario. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 71 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 29 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 796cf70e1c9f..788559f42d43 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -147,28 +147,40 @@ static void rdevs_init_serial(struct mddev *mddev) } /* - * Create serial_info_pool for raid1 under conditions: - * 1. rdev is the first multi-queue device flaged with writemostly, - * also write-behind mode is enabled. - * 2. rdev is NULL, means want to enable serialization for all rdevs. + * rdev needs to enable serial stuffs if it meets the conditions: + * 1. it is multi-queue device flaged with writemostly. + * 2. the write-behind mode is enabled. + */ +static int rdev_need_serial(struct md_rdev *rdev) +{ + return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && + rdev->bdev->bd_queue->nr_hw_queues != 1 && + test_bit(WriteMostly, &rdev->flags)); +} + +/* + * Init resource for rdev(s), then create serial_info_pool if: + * 1. rdev is the first device which return true from rdev_enable_serial. + * 2. rdev is NULL, means we want to enable serialization for all rdevs.
*/ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) { - if (rdev && (mddev->bitmap_info.max_write_behind == 0 || - rdev->bdev->bd_queue->nr_hw_queues == 1 || - !test_bit(WriteMostly, &rdev->flags))) + if (rdev && !rdev_need_serial(rdev) && + !test_bit(CollisionCheck, &rdev->flags)) return; + if (!is_suspend) + mddev_suspend(mddev); + + if (!rdev) + rdevs_init_serial(mddev); + else + rdev_init_serial(rdev); + if (mddev->serial_info_pool == NULL) { unsigned int noio_flag; - if (!is_suspend) - mddev_suspend(mddev); - if (!rdev) - rdevs_init_serial(mddev); - else - rdev_init_serial(rdev); noio_flag = memalloc_noio_save(); mddev->serial_info_pool = mempool_create_kmalloc_pool(NR_SERIAL_INFOS, @@ -176,15 +188,16 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, memalloc_noio_restore(noio_flag); if (!mddev->serial_info_pool) pr_err("can't alloc memory pool for serialization\n"); - if (!is_suspend) - mddev_resume(mddev); } + if (!is_suspend) + mddev_resume(mddev); } /* - * Destroy serial_info_pool if rdev is the last device flaged with - * CollisionCheck, or rdev is NULL when we disable serialization - * for normal raid1. + * Free resource from rdev(s), and destroy serial_info_pool under conditions: + * 1. rdev is the last device flaged with CollisionCheck. + * 2. when bitmap is destroyed while policy is not enabled. + * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) @@ -194,27 +207,27 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, if (mddev->serial_info_pool) { struct md_rdev *temp; - int num = 0; + int num = 0; /* used to track if other rdevs need the pool */ - /* - * Check if other rdevs need serial_info_pool. - */ if (!is_suspend) mddev_suspend(mddev); rdev_for_each(temp, mddev) { if (!rdev) { - clear_bit(CollisionCheck, &temp->flags); - continue; - } - - if (temp != rdev && - test_bit(CollisionCheck, &temp->flags)) + if (!rdev_need_serial(temp)) + clear_bit(CollisionCheck, &temp->flags); + else + num++; + } else if (temp != rdev && + test_bit(CollisionCheck, &temp->flags)) num++; } if (rdev) clear_bit(CollisionCheck, &rdev->flags); - if (!rdev || !num) { + + if (num) + pr_info("The mempool could be used by other devices\n"); + else { mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; } -- cgit From 69df9cfc70421fb7949e8f7a19bfc36600b5522b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:58 +0100 Subject: raid1: serialize the overlap write Before dispatching a write bio, a raid1 array which enables serialize_policy needs to check whether an overlap exists between this bio and previous in-flight bios. If there is an overlap, it has to wait until the collision has disappeared.
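For reference, the collision test used by check_and_add_serial() is the standard half-open interval intersection check. A standalone C model (illustrative only; sector_t is typedef'd here just to keep the sketch self-contained):

#include <stdbool.h>

typedef unsigned long long sector_t;

/* Two in-flight write ranges [lo1, hi1) and [lo2, hi2) collide iff
 * each range starts before the other one ends. */
static bool write_ranges_collide(sector_t lo1, sector_t hi1,
                                 sector_t lo2, sector_t hi2)
{
        return hi1 > lo2 && lo1 < hi2;
}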
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid1.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0439f674ab14..3ad2f5a59d08 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -430,6 +430,8 @@ static void raid1_end_write_request(struct bio *bio) int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; bool discard_error; + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; @@ -499,12 +501,8 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - if (test_bit(CollisionCheck, &rdev->flags)) { - sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; - + if (test_bit(CollisionCheck, &rdev->flags)) remove_serial(rdev, lo, hi); - } if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -527,7 +525,8 @@ static void raid1_end_write_request(struct bio *bio) call_bio_endio(r1_bio); } } - } + } else if (rdev->mddev->serialize_policy) + remove_serial(rdev, lo, hi); if (r1_bio->bios[mirror] == NULL) rdev_dec_pending(rdev, conf->mddev); @@ -1337,6 +1336,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct raid1_plug_cb *plug = NULL; int first_clone; int max_sectors; + sector_t lo, hi; if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1364,6 +1364,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; + lo = r1_bio->sector; + hi = r1_bio->sector + r1_bio->sectors; if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); @@ -1479,6 +1481,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; + struct md_rdev *rdev = conf->mirrors[i].rdev; if (!r1_bio->bios[i]) continue; @@ -1506,19 +1509,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (r1_bio->behind_master_bio) { - struct md_rdev *rdev = conf->mirrors[i].rdev; - - if (test_bit(CollisionCheck, &rdev->flags)) { - sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; - + if (test_bit(CollisionCheck, &rdev->flags)) wait_event(rdev->serial_io_wait, check_and_add_serial(rdev, lo, hi) == 0); - } if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); - } + } else if (mddev->serialize_policy) + wait_event(rdev->serial_io_wait, + check_and_add_serial(rdev, lo, hi) == 0); r1_bio->bios[i] = mbio; -- cgit From 4d26d32fe4dafd29e168addb7c11949a36e7e5f8 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:48:59 +0100 Subject: md: don't destroy serial_info_pool if serialize_policy is true The serial_info_pool is needed if array sets serialize_policy to true, so don't destroy it. 
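Put differently, after this change the pool has two independent users; a compact model of the resulting lifetime rule (illustrative C, not the kernel code):

#include <stdbool.h>

/* The serial_info_pool may be torn down only when neither user
 * (write-behind backlog, serialize_policy) still needs it. */
static bool pool_still_needed(bool backlog_nonzero, bool serialize_policy)
{
        return backlog_nonzero || serialize_policy;
}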
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 212e75dfebb7..92f0d45946e8 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1789,8 +1789,10 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; + if (!mddev->serialize_policy) { + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -2476,8 +2478,10 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_info.max_write_behind = backlog; if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; + if (!mddev->serialize_policy) { + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ struct md_rdev *rdev; -- cgit From 69b00b5bb23552d43e8bbed73ef6624604bb94a2 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:49:00 +0100 Subject: md: introduce a new struct for IO serialization Obviously, IO serialization can degrade performance a lot. To reduce the degradation, an rb interval tree is added in raid1 to speed up the collision check. So an rb root is needed in md_rdev; abstract all the serialization-related members into a new struct (serial_in_rdev) and embed it into md_rdev. Of course, we need to free the struct if it is not needed anymore, so rdev/rdevs_uninit_serial are added accordingly. They should be called when the memory pool is destroyed or when memory can't be allocated. We also need to consider calling mddev_destroy_serial_pool in case serialize_policy/write-behind is disabled, the bitmap is destroyed, or in __md_stop_writes.
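The per-rdev state introduced here bundles the members that were previously loose in md_rdev; its shape, as defined in the md.h hunk below (comments added for exposition):

struct serial_in_rdev {
        struct rb_root_cached serial_rb;        /* interval tree of in-flight writes */
        spinlock_t serial_lock;                 /* protects serial_rb */
        wait_queue_head_t serial_io_wait;       /* writers sleep here on collision */
};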
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 12 +++----- drivers/md/md.c | 80 ++++++++++++++++++++++++++++++++++++++------------ drivers/md/md.h | 26 ++++++++++------ drivers/md/raid1.c | 61 ++++++++++++++++++++------------------ 4 files changed, 116 insertions(+), 63 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 92f0d45946e8..e230052c2107 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1789,10 +1789,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); - if (!mddev->serialize_policy) { - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; - } + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, true); mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -2478,10 +2476,8 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_info.max_write_behind = backlog; if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ - if (!mddev->serialize_policy) { - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; - } + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, false); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ struct md_rdev *rdev; diff --git a/drivers/md/md.c b/drivers/md/md.c index 788559f42d43..9c4e61c988ac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -125,25 +125,59 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } +static void rdev_uninit_serial(struct md_rdev *rdev) +{ + if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) + return; + + kfree(rdev->serial); + rdev->serial = NULL; +} + +static void rdevs_uninit_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + rdev_uninit_serial(rdev); +} + static int rdev_init_serial(struct md_rdev *rdev) { - spin_lock_init(&rdev->serial_list_lock); - INIT_LIST_HEAD(&rdev->serial_list); - init_waitqueue_head(&rdev->serial_io_wait); + struct serial_in_rdev *serial = NULL; + + if (test_bit(CollisionCheck, &rdev->flags)) + return 0; + + serial = kmalloc(sizeof(struct serial_in_rdev), GFP_KERNEL); + if (!serial) + return -ENOMEM; + + spin_lock_init(&serial->serial_lock); + serial->serial_rb = RB_ROOT_CACHED; + init_waitqueue_head(&serial->serial_io_wait); + rdev->serial = serial; set_bit(CollisionCheck, &rdev->flags); - return 1; + return 0; } -static void rdevs_init_serial(struct mddev *mddev) +static int rdevs_init_serial(struct mddev *mddev) { struct md_rdev *rdev; + int ret = 0; rdev_for_each(rdev, mddev) { - if (test_bit(CollisionCheck, &rdev->flags)) - continue; - rdev_init_serial(rdev); + ret = rdev_init_serial(rdev); + if (ret) + break; } + + /* Free all resources if pool is not existed */ + if (ret && !mddev->serial_info_pool) + rdevs_uninit_serial(mddev); + + return ret; } /* @@ -166,6 +200,8 @@ static int rdev_need_serial(struct md_rdev *rdev) void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend) { + int ret = 0; + if (rdev && !rdev_need_serial(rdev) && !test_bit(CollisionCheck, &rdev->flags)) return; @@ -174,9 +210,11 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mddev_suspend(mddev); if (!rdev) - rdevs_init_serial(mddev); + ret = rdevs_init_serial(mddev); else - rdev_init_serial(rdev); + ret = 
rdev_init_serial(rdev); + if (ret) + goto abort; if (mddev->serial_info_pool == NULL) { unsigned int noio_flag; @@ -186,9 +224,13 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mempool_create_kmalloc_pool(NR_SERIAL_INFOS, sizeof(struct serial_info)); memalloc_noio_restore(noio_flag); - if (!mddev->serial_info_pool) + if (!mddev->serial_info_pool) { + rdevs_uninit_serial(mddev); pr_err("can't alloc memory pool for serialization\n"); + } } + +abort: if (!is_suspend) mddev_resume(mddev); } @@ -199,8 +241,8 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ -static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) { if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; @@ -213,8 +255,9 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mddev_suspend(mddev); rdev_for_each(temp, mddev) { if (!rdev) { - if (!rdev_need_serial(temp)) - clear_bit(CollisionCheck, &temp->flags); + if (!mddev->serialize_policy || + !rdev_need_serial(temp)) + rdev_uninit_serial(temp); else num++; } else if (temp != rdev && @@ -223,7 +266,7 @@ static void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, } if (rdev) - clear_bit(CollisionCheck, &rdev->flags); + rdev_uninit_serial(rdev); if (num) pr_info("The mempool could be used by other devices\n"); @@ -6117,8 +6160,9 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } - mempool_destroy(mddev->serial_info_pool); - mddev->serial_info_pool = NULL; + /* disable policy to guarantee rdevs free resources for serialization */ + mddev->serialize_policy = 0; + mddev_destroy_serial_pool(mddev, NULL, true); } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index f51a3afaee1b..acd681939112 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -32,6 +32,16 @@ * be retried. */ #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) + +/* + * The struct embedded in rdev is used to serialize IO. + */ +struct serial_in_rdev { + struct rb_root_cached serial_rb; + spinlock_t serial_lock; + wait_queue_head_t serial_io_wait; +}; + /* * MD's 'extended' device */ @@ -110,12 +120,7 @@ struct md_rdev { * in superblock. */ - /* - * The members for check collision of write IOs. 
- */ - struct list_head serial_list; - spinlock_t serial_list_lock; - wait_queue_head_t serial_io_wait; + struct serial_in_rdev *serial; /* used for raid1 io serialization */ struct work_struct del_work; /* used for delayed sysfs removal */ @@ -266,9 +271,10 @@ enum mddev_sb_flags { #define NR_SERIAL_INFOS 8 /* record current range of serialize IOs */ struct serial_info { - sector_t lo; - sector_t hi; - struct list_head list; + struct rb_node node; + sector_t start; /* start sector of rb node */ + sector_t last; /* end sector of rb node */ + sector_t _subtree_last; /* highest sector in subtree of rb node */ }; struct mddev { @@ -740,6 +746,8 @@ extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, bool is_suspend); +extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3ad2f5a59d08..5c6a03747448 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -50,55 +51,58 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) +INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, + START, LAST, static inline, raid1_rb); + static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct serial_info *wi, *temp_wi; + struct serial_info *si; unsigned long flags; int ret = 0; struct mddev *mddev = rdev->mddev; + struct serial_in_rdev *serial = rdev->serial; - wi = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - - spin_lock_irqsave(&rdev->serial_list_lock, flags); - list_for_each_entry(temp_wi, &rdev->serial_list, list) { - /* collision happened */ - if (hi > temp_wi->lo && lo < temp_wi->hi) { - ret = -EBUSY; - break; - } - } + si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); + spin_lock_irqsave(&serial->serial_lock, flags); + /* collision happened */ + if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) + ret = -EBUSY; if (!ret) { - wi->lo = lo; - wi->hi = hi; - list_add(&wi->list, &rdev->serial_list); + si->start = lo; + si->last = hi; + raid1_rb_insert(si, &serial->serial_rb); } else - mempool_free(wi, mddev->serial_info_pool); - spin_unlock_irqrestore(&rdev->serial_list_lock, flags); + mempool_free(si, mddev->serial_info_pool); + spin_unlock_irqrestore(&serial->serial_lock, flags); return ret; } static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct serial_info *wi; + struct serial_info *si; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; - - spin_lock_irqsave(&rdev->serial_list_lock, flags); - list_for_each_entry(wi, &rdev->serial_list, list) - if (hi == wi->hi && lo == wi->lo) { - list_del(&wi->list); - mempool_free(wi, mddev->serial_info_pool); + struct serial_in_rdev *serial = rdev->serial; + + spin_lock_irqsave(&serial->serial_lock, flags); + for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); + si; si = raid1_rb_iter_next(si, lo, hi)) { + if (si->start == lo && si->last == hi) { + raid1_rb_remove(si, &serial->serial_rb); + mempool_free(si, mddev->serial_info_pool); found = 1; break; } - + } if (!found) WARN(1, "The write IO is not 
recorded for serialization\n"); - spin_unlock_irqrestore(&rdev->serial_list_lock, flags); - wake_up(&rdev->serial_io_wait); + spin_unlock_irqrestore(&serial->serial_lock, flags); + wake_up(&serial->serial_io_wait); } /* @@ -1482,6 +1486,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; struct md_rdev *rdev = conf->mirrors[i].rdev; + struct serial_in_rdev *serial = rdev->serial; if (!r1_bio->bios[i]) continue; @@ -1510,13 +1515,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (r1_bio->behind_master_bio) { if (test_bit(CollisionCheck, &rdev->flags)) - wait_event(rdev->serial_io_wait, + wait_event(serial->serial_io_wait, check_and_add_serial(rdev, lo, hi) == 0); if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } else if (mddev->serialize_policy) - wait_event(rdev->serial_io_wait, + wait_event(serial->serial_io_wait, check_and_add_serial(rdev, lo, hi) == 0); r1_bio->bios[i] = mbio; -- cgit From 025471f9f50fede6527c70336484becbcb2aff28 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:49:01 +0100 Subject: md/raid1: use bucket based mechanism for IO serialization Since raid1 already uses a bucket-based mechanism to reduce the conflict between write IO and resync IO, IO serialization performance can be improved by reusing the same mechanism. To align with the barrier bucket mechanism, we created arrays (with the same number of entries, BARRIER_BUCKETS_NR) for the spinlock, rb tree and waitqueue. Then we can reduce lock competition with multiple spinlocks, boost search performance with multiple rb trees, and also reduce the thundering herd problem with multiple waitqueues. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 18 +++++++++++++----- drivers/md/raid1.c | 9 ++++++--- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9c4e61c988ac..4824d50526fa 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -130,7 +130,7 @@ static void rdev_uninit_serial(struct md_rdev *rdev) if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) return; - kfree(rdev->serial); + kvfree(rdev->serial); rdev->serial = NULL; } @@ -144,18 +144,26 @@ static void rdevs_uninit_serial(struct mddev *mddev) static int rdev_init_serial(struct md_rdev *rdev) { + /* serial_nums equals with BARRIER_BUCKETS_NR */ + int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); struct serial_in_rdev *serial = NULL; if (test_bit(CollisionCheck, &rdev->flags)) return 0; - serial = kmalloc(sizeof(struct serial_in_rdev), GFP_KERNEL); + serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, + GFP_KERNEL); if (!serial) return -ENOMEM; - spin_lock_init(&serial->serial_lock); - serial->serial_rb = RB_ROOT_CACHED; - init_waitqueue_head(&serial->serial_io_wait); + for (i = 0; i < serial_nums; i++) { + struct serial_in_rdev *serial_tmp = &serial[i]; + + spin_lock_init(&serial_tmp->serial_lock); + serial_tmp->serial_rb = RB_ROOT_CACHED; + init_waitqueue_head(&serial_tmp->serial_io_wait); + } + rdev->serial = serial; set_bit(CollisionCheck, &rdev->flags); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5c6a03747448..48d553d7989a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -62,7 +62,8 @@ static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) unsigned long flags; int ret = 0; struct mddev *mddev = rdev->mddev; - struct
serial_in_rdev *serial = rdev->serial; + int idx = sector_to_idx(lo); + struct serial_in_rdev *serial = &rdev->serial[idx]; si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); @@ -87,7 +88,8 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; - struct serial_in_rdev *serial = rdev->serial; + int idx = sector_to_idx(lo); + struct serial_in_rdev *serial = &rdev->serial[idx]; spin_lock_irqsave(&serial->serial_lock, flags); for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); @@ -1486,7 +1488,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; struct md_rdev *rdev = conf->mirrors[i].rdev; - struct serial_in_rdev *serial = rdev->serial; + int idx = sector_to_idx(lo); + struct serial_in_rdev *serial = &rdev->serial[idx]; if (!r1_bio->bios[i]) continue; -- cgit From d0d2d8ba0494655a01b97542c083e51b29cf8637 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 23 Dec 2019 10:49:02 +0100 Subject: md/raid1: introduce wait_for_serialization Previously, we called check_and_add_serial when serialization was enabled for a write IO, but it could allocate and free memory back and forth. Now, let's just get an element from the memory pool with the new function, then insert the node into the rb tree if no collision happens. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid1.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 48d553d7989a..cd810e195086 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -56,32 +56,43 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, START, LAST, static inline, raid1_rb); -static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) +static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, + struct serial_info *si, int idx) { - struct serial_info *si; unsigned long flags; int ret = 0; - struct mddev *mddev = rdev->mddev; - int idx = sector_to_idx(lo); + sector_t lo = r1_bio->sector; + sector_t hi = lo + r1_bio->sectors; struct serial_in_rdev *serial = &rdev->serial[idx]; - si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - spin_lock_irqsave(&serial->serial_lock, flags); /* collision happened */ if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) ret = -EBUSY; - if (!ret) { + else { si->start = lo; si->last = hi; raid1_rb_insert(si, &serial->serial_rb); - } else - mempool_free(si, mddev->serial_info_pool); + } spin_unlock_irqrestore(&serial->serial_lock, flags); return ret; } +static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + struct mddev *mddev = rdev->mddev; + struct serial_info *si; + int idx = sector_to_idx(r1_bio->sector); + struct serial_in_rdev *serial = &rdev->serial[idx]; + + if (WARN_ON(!mddev->serial_info_pool)) + return; + si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); + wait_event(serial->serial_io_wait, + check_and_add_serial(rdev, r1_bio, si, idx) == 0); +} + static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { struct serial_info *si; @@ -1342,7 +1353,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct raid1_plug_cb *plug = NULL; int first_clone; int max_sectors; - sector_t lo, hi; if (mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1370,8 +1380,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; - lo = r1_bio->sector; - hi = r1_bio->sector + r1_bio->sectors; if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); @@ -1488,8 +1496,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; struct md_rdev *rdev = conf->mirrors[i].rdev; - int idx = sector_to_idx(lo); - struct serial_in_rdev *serial = &rdev->serial[idx]; if (!r1_bio->bios[i]) continue; @@ -1518,14 +1524,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (r1_bio->behind_master_bio) { if (test_bit(CollisionCheck, &rdev->flags)) - wait_event(serial->serial_io_wait, - check_and_add_serial(rdev, lo, hi) - == 0); + wait_for_serialization(rdev, r1_bio); if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } else if (mddev->serialize_policy) - wait_event(serial->serial_io_wait, - check_and_add_serial(rdev, lo, hi) == 0); + wait_for_serialization(rdev, r1_bio); r1_bio->bios[i] = mbio; -- cgit From e8547d42095e58bee658f00fef8e33d2a185c927 Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Fri, 24 Jan 2020 01:01:26 +0800 Subject: bcache: cached_dev_free needs to put the sb page Same as cache device, the buffer page needs to be put while freeing cached_dev. Otherwise a page would be leaked every time a cached_dev is stopped. Signed-off-by: Liang Chen Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 77e9869345e7..a573ce1d85aa 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1275,6 +1275,9 @@ static void cached_dev_free(struct closure *cl) mutex_unlock(&bch_register_lock); + if (dc->sb_bio.bi_inline_vecs[0].bv_page) + put_page(bio_first_page_all(&dc->sb_bio)); + if (!IS_ERR_OR_NULL(dc->bdev)) blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -- cgit From a702a692cd7559053ea573f4e2c84828f0e62824 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:27 +0800 Subject: bcache: use a separate data structure for the on-disk super block Split out an on-disk version struct cache_sb with the proper endianness annotations. This fixes a fair chunk of sparse warnings, but there are some left due to the way the checksum is defined. 
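The pattern the split enables is the usual one: the on-disk struct keeps __le fields, and readers convert to host endianness exactly once, as read_super() does in the hunk below:

/* from the read_super() hunk below; s is a struct cache_sb_disk * */
sb->offset = le64_to_cpu(s->offset);
sb->version = le64_to_cpu(s->version);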
Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 6 +++--- include/uapi/linux/bcache.h | 51 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a573ce1d85aa..3045f27e0d67 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -63,14 +63,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, struct page **res) { const char *err; - struct cache_sb *s; + struct cache_sb_disk *s; struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); unsigned int i; if (!bh) return "IO error"; - s = (struct cache_sb *) bh->b_data; + s = (struct cache_sb_disk *)bh->b_data; sb->offset = le64_to_cpu(s->offset); sb->version = le64_to_cpu(s->version); @@ -209,7 +209,7 @@ static void write_bdev_super_endio(struct bio *bio) static void __write_super(struct cache_sb *sb, struct bio *bio) { - struct cache_sb *out = page_address(bio_first_page_all(bio)); + struct cache_sb_disk *out = page_address(bio_first_page_all(bio)); unsigned int i; bio->bi_iter.bi_sector = SB_SECTOR; diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 5d4f58e059fd..1d8b3a9fc080 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -156,6 +156,57 @@ static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) #define BDEV_DATA_START_DEFAULT 16 /* sectors */ +struct cache_sb_disk { + __le64 csum; + __le64 offset; /* sector where this sb was written */ + __le64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __le64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __le64 flags; + __le64 seq; + __le64 pad[8]; + + union { + struct { + /* Cache devices */ + __le64 nbuckets; /* device size */ + + __le16 block_size; /* sectors */ + __le16 bucket_size; /* sectors */ + + __le16 nr_in_set; + __le16 nr_this_dev; + }; + struct { + /* Backing devices */ + __le64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + __le32 last_mount; /* time overflow in y2106 */ + + __le16 first_bucket; + union { + __le16 njournal_buckets; + __le16 keys; + }; + __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + struct cache_sb { __u64 csum; __u64 offset; /* sector where this sb was written */ -- cgit From 50246693f81fe887f4db78bf7089051d7f1894cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:28 +0800 Subject: bcache: rework error unwinding in register_bcache Split the successful and error return path, and use one goto label for each resource to unwind. This also fixes some small errors like leaking the module reference count in the reboot case (which seems entirely harmless) or printing the wrong warning messages for early failures. 
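The shape being adopted is the kernel's one-label-per-resource unwinding idiom; a generic standalone sketch (illustrative only, not the driver code):

#include <stdlib.h>

/* Each acquisition gets an unwind label that releases exactly the
 * resources obtained before it, so every failure path frees just
 * what it must and nothing else. */
static int setup_two_resources(void **a, void **b)
{
        *a = malloc(64);
        if (!*a)
                goto out;
        *b = malloc(64);
        if (!*b)
                goto out_free_a;
        return 0;

out_free_a:
        free(*a);
        *a = NULL;
out:
        return -1;
}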
Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 75 ++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3045f27e0d67..e8013e1b0a14 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2375,29 +2375,33 @@ static bool bch_is_open(struct block_device *bdev) static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { - ssize_t ret = -EINVAL; - const char *err = "cannot allocate memory"; - char *path = NULL; - struct cache_sb *sb = NULL; + const char *err; + char *path; + struct cache_sb *sb; struct block_device *bdev = NULL; - struct page *sb_page = NULL; + struct page *sb_page; + ssize_t ret; + ret = -EBUSY; if (!try_module_get(THIS_MODULE)) - return -EBUSY; + goto out; /* For latest state of bcache_is_reboot */ smp_mb(); if (bcache_is_reboot) - return -EBUSY; + goto out_module_put; + ret = -ENOMEM; + err = "cannot allocate memory"; path = kstrndup(buffer, size, GFP_KERNEL); if (!path) - goto err; + goto out_module_put; sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL); if (!sb) - goto err; + goto out_free_path; + ret = -EINVAL; err = "failed to open device"; bdev = blkdev_get_by_path(strim(path), FMODE_READ|FMODE_WRITE|FMODE_EXCL, @@ -2414,57 +2418,68 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!IS_ERR(bdev)) bdput(bdev); if (attr == &ksysfs_register_quiet) - goto quiet_out; + goto done; } - goto err; + goto out_free_sb; } err = "failed to set blocksize"; if (set_blocksize(bdev, 4096)) - goto err_close; + goto out_blkdev_put; err = read_super(sb, bdev, &sb_page); if (err) - goto err_close; + goto out_blkdev_put; err = "failed to register device"; if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); if (!dc) - goto err_close; + goto out_put_sb_page; mutex_lock(&bch_register_lock); ret = register_bdev(sb, sb_page, bdev, dc); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ - if (ret < 0) - goto err; + if (ret < 0) { + bdev = NULL; + goto out_put_sb_page; + } } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) - goto err_close; + goto out_put_sb_page; /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_page, bdev, ca) != 0) - goto err; + if (register_cache(sb, sb_page, bdev, ca) != 0) { + bdev = NULL; + goto out_put_sb_page; + } } -quiet_out: - ret = size; -out: - if (sb_page) - put_page(sb_page); + + put_page(sb_page); +done: kfree(sb); kfree(path); module_put(THIS_MODULE); - return ret; - -err_close: - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -err: + return size; + +out_put_sb_page: + put_page(sb_page); +out_blkdev_put: + if (bdev) + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +out_free_sb: + kfree(sb); +out_free_path: + kfree(path); +out_module_put: + module_put(THIS_MODULE); +out: pr_info("error %s: %s", path, err); - goto out; + return ret; } -- cgit From 29cda393bcaad160c4bf3676ddd99855adafc72f Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:29 +0800 Subject: bcache: properly initialize 'path' and 'err' in register_bcache() The patch "bcache: rework error unwinding in register_bcache" from Christoph Hellwig leaves the local variables 'path' and 'err' in an undefined initial state.
If the code in register_bcache() jumps to the label 'out:' or 'out_module_put:' by goto, these two variables might be referenced with an undefined value by the following line, out_module_put: module_put(THIS_MODULE); out: pr_info("error %s: %s", path, err); return ret; Therefore this patch initializes these two local variables properly in register_bcache() to avoid this issue. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e8013e1b0a14..ee7c87f38d0d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2376,18 +2376,20 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { const char *err; - char *path; + char *path = NULL; struct cache_sb *sb; struct block_device *bdev = NULL; struct page *sb_page; ssize_t ret; ret = -EBUSY; + err = "failed to reference bcache module"; if (!try_module_get(THIS_MODULE)) goto out; /* For latest state of bcache_is_reboot */ smp_mb(); + err = "bcache is in reboot"; if (bcache_is_reboot) goto out_module_put; -- cgit From ae3cd299919af6eb670d5af0bc9d7ba14086bd8e Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:30 +0800 Subject: bcache: fix use-after-free in register_bcache() The patch "bcache: rework error unwinding in register_bcache" introduces a use-after-free regression in register_bcache(). Here is the current code, 2510 out_free_path: 2511 kfree(path); 2512 out_module_put: 2513 module_put(THIS_MODULE); 2514 out: 2515 pr_info("error %s: %s", path, err); 2516 return ret; If an error happens and the above code path is executed, path is released at line 2511 but then referenced at line 2515. KASAN then reports a use-after-free error. This patch changes line 2515 in the following way to fix the problem, 2515 pr_info("error %s: %s", path?path:"", err); Signed-off-by: Coly Li Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index ee7c87f38d0d..fad9c6cbee5e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2477,10 +2477,11 @@ out_free_sb: kfree(sb); out_free_path: kfree(path); + path = NULL; out_module_put: module_put(THIS_MODULE); out: - pr_info("error %s: %s", path, err); + pr_info("error %s: %s", path?path:"", err); return ret; } -- cgit From fc8f19cc5dce183cde4fecb9a1da07d81fa6ea8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:31 +0800 Subject: bcache: transfer the sb_page reference to register_{bdev,cache} Avoid an extra reference count roundtrip by transferring the sb_page ownership to the lower level register helpers.
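In sketch form, with a simplified helper rather than the real register_bdev()/register_cache() signatures: before the change the helper took its own reference on top of the caller's, and both sides eventually called put_page(); after it, the helper simply consumes the reference the caller already holds.

/* before: extra refcount roundtrip */
static void helper_store_sb_before(struct example_dev *d, struct page *sb_page)
{
	get_page(sb_page);	/* helper takes its own reference */
	d->sb_page = sb_page;	/* caller still does put_page() later */
}

/* after: ownership transferred to the helper */
static void helper_store_sb_after(struct example_dev *d, struct page *sb_page)
{
	d->sb_page = sb_page;	/* consumes the caller's reference;
				 * teardown calls put_page() once */
}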
Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fad9c6cbee5e..cf2cafef381f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1368,8 +1368,6 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page; - get_page(sb_page); - if (cached_dev_init(dc, sb->block_size << 9)) goto err; @@ -2275,7 +2273,6 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page; - get_page(sb_page); if (blk_queue_discard(bdev_get_queue(bdev))) ca->discard = CACHE_DISCARD(&ca->sb); @@ -2378,7 +2375,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *err; char *path = NULL; struct cache_sb *sb; - struct block_device *bdev = NULL; + struct block_device *bdev; struct page *sb_page; ssize_t ret; @@ -2444,10 +2441,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ret = register_bdev(sb, sb_page, bdev, dc); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ - if (ret < 0) { - bdev = NULL; - goto out_put_sb_page; - } + if (ret < 0) + goto out_free_sb; } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -2455,13 +2450,10 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_put_sb_page; /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_page, bdev, ca) != 0) { - bdev = NULL; - goto out_put_sb_page; - } + if (register_cache(sb, sb_page, bdev, ca) != 0) + goto out_free_sb; } - put_page(sb_page); done: kfree(sb); kfree(path); @@ -2471,8 +2463,7 @@ done: out_put_sb_page: put_page(sb_page); out_blkdev_put: - if (bdev) - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); out_free_sb: kfree(sb); out_free_path: -- cgit From cfa0c56db9c087227988f6af2c7b9888d24a3b16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:32 +0800 Subject: bcache: return a pointer to the on-disk sb from read_super Returning the properly typed actual data structure instead of the containing struct page will save the callers some work going forward.
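A sketch of the resulting calling convention, simplified from the diff below: the caller receives a typed pointer into the superblock page and, when it owns the reference, releases it through virt_to_page() instead of tracking a separate struct page.

struct cache_sb_disk *sb_disk;
const char *err;

err = read_super(sb, bdev, &sb_disk);
if (err)
	goto out_blkdev_put;

/* on-disk fields are used with their annotated types, e.g. sb_disk->csum */

put_page(virt_to_page(sb_disk));	/* drops the reference read_super() took */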
Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index cf2cafef381f..b30e4c6011d2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -60,7 +60,7 @@ struct workqueue_struct *bch_journal_wq; /* Superblock */ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, - struct page **res) + struct cache_sb_disk **res) { const char *err; struct cache_sb_disk *s; @@ -191,7 +191,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, err = NULL; get_page(bh->b_page); - *res = bh->b_page; + *res = s; err: put_bh(bh); return err; @@ -1353,7 +1353,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) /* Cached device - bcache superblock */ -static int register_bdev(struct cache_sb *sb, struct page *sb_page, +static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, struct block_device *bdev, struct cached_dev *dc) { @@ -1367,7 +1367,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, dc->bdev->bd_holder = dc; bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page; + bio_first_bvec_all(&dc->sb_bio)->bv_page = virt_to_page(sb_disk); if (cached_dev_init(dc, sb->block_size << 9)) goto err; @@ -2260,7 +2260,7 @@ err_free: return ret; } -static int register_cache(struct cache_sb *sb, struct page *sb_page, +static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, struct block_device *bdev, struct cache *ca) { const char *err = NULL; /* must be set for any error case */ @@ -2272,7 +2272,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, ca->bdev->bd_holder = ca; bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page; + bio_first_bvec_all(&ca->sb_bio)->bv_page = virt_to_page(sb_disk); if (blk_queue_discard(bdev_get_queue(bdev))) ca->discard = CACHE_DISCARD(&ca->sb); @@ -2375,8 +2375,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *err; char *path = NULL; struct cache_sb *sb; + struct cache_sb_disk *sb_disk; struct block_device *bdev; - struct page *sb_page; ssize_t ret; ret = -EBUSY; @@ -2426,7 +2426,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (set_blocksize(bdev, 4096)) goto out_blkdev_put; - err = read_super(sb, bdev, &sb_page); + err = read_super(sb, bdev, &sb_disk); if (err) goto out_blkdev_put; @@ -2438,7 +2438,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_put_sb_page; mutex_lock(&bch_register_lock); - ret = register_bdev(sb, sb_page, bdev, dc); + ret = register_bdev(sb, sb_disk, bdev, dc); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ if (ret < 0) @@ -2450,7 +2450,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_put_sb_page; /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_page, bdev, ca) != 0) + if (register_cache(sb, sb_disk, bdev, ca) != 0) goto out_free_sb; } @@ -2461,7 +2461,7 @@ done: return size; out_put_sb_page: - put_page(sb_page); + put_page(virt_to_page(sb_disk)); out_blkdev_put: blkdev_put(bdev, FMODE_READ | FMODE_WRITE | 
FMODE_EXCL); out_free_sb: -- cgit From 475389ae5c08656f8bb5cc3adc88a531c7c4877f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:33 +0800 Subject: bcache: store a pointer to the on-disk sb in the cache and cached_dev structures This allows the superblock bio, including the offset in the page, to be built properly using the normal bio helpers. This fixes writing the superblock for page sizes larger than 4k, where the sb write bio would need an offset in the bio_vec. Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 ++ drivers/md/bcache/super.c | 34 +++++++++++++++------------------- 2 files changed, 17 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 9198c1b480d9..adf26a21fcd1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -301,6 +301,7 @@ struct cached_dev { struct block_device *bdev; struct cache_sb sb; + struct cache_sb_disk *sb_disk; struct bio sb_bio; struct bio_vec sb_bv[1]; struct closure sb_write; @@ -403,6 +404,7 @@ enum alloc_reserve { struct cache { struct cache_set *set; struct cache_sb sb; + struct cache_sb_disk *sb_disk; struct bio sb_bio; struct bio_vec sb_bv[1]; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b30e4c6011d2..5797c03f993e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -207,15 +207,15 @@ static void write_bdev_super_endio(struct bio *bio) closure_put(&dc->sb_write); } -static void __write_super(struct cache_sb *sb, struct bio *bio) +static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, + struct bio *bio) { - struct cache_sb_disk *out = page_address(bio_first_page_all(bio)); unsigned int i; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META; bio->bi_iter.bi_sector = SB_SECTOR; - bio->bi_iter.bi_size = SB_SIZE; - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); - bch_bio_map(bio, NULL); + __bio_add_page(bio, virt_to_page(out), SB_SIZE, + offset_in_page(out)); out->offset = cpu_to_le64(sb->offset); out->version = cpu_to_le64(sb->version); @@ -257,14 +257,14 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) down(&dc->sb_write_mutex); closure_init(cl, parent); - bio_reset(bio); + bio_init(bio, dc->sb_bv, 1); bio_set_dev(bio, dc->bdev); bio->bi_end_io = write_bdev_super_endio; bio->bi_private = dc; closure_get(cl); /* I/O request sent to backing device */ - __write_super(&dc->sb, bio); + __write_super(&dc->sb, dc->sb_disk, bio); closure_return_with_destructor(cl, bch_write_bdev_super_unlock); } @@ -306,13 +306,13 @@ void bcache_write_super(struct cache_set *c) SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); - bio_reset(bio); + bio_init(bio, ca->sb_bv, 1); bio_set_dev(bio, ca->bdev); bio->bi_end_io = write_super_endio; bio->bi_private = ca; closure_get(cl); - __write_super(&ca->sb, bio); + __write_super(&ca->sb, ca->sb_disk, bio); } closure_return_with_destructor(cl, bcache_write_super_unlock); @@ -1275,8 +1275,8 @@ static void cached_dev_free(struct closure *cl) mutex_unlock(&bch_register_lock); - if (dc->sb_bio.bi_inline_vecs[0].bv_page) - put_page(bio_first_page_all(&dc->sb_bio)); + if (dc->sb_disk) + put_page(virt_to_page(dc->sb_disk)); if (!IS_ERR_OR_NULL(dc->bdev)) blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); @@ -1365,9 +1365,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, memcpy(&dc->sb, sb, sizeof(struct cache_sb));
dc->bdev = bdev; dc->bdev->bd_holder = dc; - - bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&dc->sb_bio)->bv_page = virt_to_page(sb_disk); + dc->sb_disk = sb_disk; if (cached_dev_init(dc, sb->block_size << 9)) goto err; @@ -2137,8 +2135,8 @@ void bch_cache_release(struct kobject *kobj) for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); - if (ca->sb_bio.bi_inline_vecs[0].bv_page) - put_page(bio_first_page_all(&ca->sb_bio)); + if (ca->sb_disk) + put_page(virt_to_page(ca->sb_disk)); if (!IS_ERR_OR_NULL(ca->bdev)) blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); @@ -2270,9 +2268,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; ca->bdev->bd_holder = ca; - - bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&ca->sb_bio)->bv_page = virt_to_page(sb_disk); + ca->sb_disk = sb_disk; if (blk_queue_discard(bdev_get_queue(bdev))) ca->discard = CACHE_DISCARD(&ca->sb); -- cgit From 6321bef028de43724c47cfa7f9dee69ecb783796 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jan 2020 01:01:34 +0800 Subject: bcache: use read_cache_page_gfp to read the superblock Avoid a pointless dependency on buffer heads in bcache by simply open-coding the read of a single page. Also add a SB_OFFSET define for the byte offset of the superblock instead of using magic numbers. Signed-off-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 16 +++++++--------- include/uapi/linux/bcache.h | 1 + 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 5797c03f993e..3dea1d5acd5c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -15,7 +15,6 @@ #include "writeback.h" #include -#include #include #include #include @@ -64,13 +63,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, { const char *err; struct cache_sb_disk *s; - struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); + struct page *page; unsigned int i; - if (!bh) + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); + if (IS_ERR(page)) return "IO error"; - - s = (struct cache_sb_disk *)bh->b_data; + s = page_address(page) + offset_in_page(SB_OFFSET); sb->offset = le64_to_cpu(s->offset); sb->version = le64_to_cpu(s->version); @@ -188,12 +188,10 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, } sb->last_mount = (u32)ktime_get_real_seconds(); - err = NULL; - - get_page(bh->b_page); *res = s; + return NULL; err: - put_bh(bh); + put_page(page); return err; } diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 1d8b3a9fc080..9a1965c6c3d0 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -148,6 +148,7 @@ static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) #define BCACHE_SB_MAX_VERSION 4 #define SB_SECTOR 8 +#define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT) #define SB_SIZE 4096 #define SB_LABEL_SIZE 32 #define SB_JOURNAL_BUCKETS 256U -- cgit From 7a0bc2a8966040d54289b64842d55e2cf4343ad9 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:36 +0800 Subject: bcache: add code comments for state->pool in __btree_sort() Explain that pages allocated from the mempool state->pool can be swapped in __btree_sort(), because state->pool is a page pool, which allocates pages
by alloc_pages(). Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index cffcdc9feefb..4385303836d8 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1257,6 +1257,11 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, * Our temporary buffer is the same size as the btree node's * buffer, we can just swap buffers instead of doing a big * memcpy() + * + * Don't worry even if 'out' is allocated from the mempool, it + * can still be swapped here, because state->pool is a page + * mempool created by mempool_init_page_pool(), which allocates + * pages with alloc_pages(). */ out->magic = b->set->data->magic; -- cgit From 2aa8c529387c25606fdc1484154b92f8bfbc5746 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:37 +0800 Subject: bcache: avoid unnecessary btree nodes flushing in btree_flush_write() Commit 91be66e1318f ("bcache: performance improvement for btree_flush_write()") was an effort to flush the btree node holding the oldest journal entry faster, in the following ways, - Only iterate dirty btree nodes in c->btree_cache, avoiding scanning a lot of clean btree nodes. - Take c->btree_cache as a LRU-like list, aggressively flushing all dirty nodes from the tail of c->btree_cache until the btree node with the oldest journal entry is flushed. This is to reduce the time c->bucket_lock is held. Guoju Fang and Shuang Li reported that they observed unexpected extra write I/Os on the cache device after applying the above patch. Guoju Fang provided more detailed diagnostic information showing that the aggressive btree node flushing may cause 10x more btree nodes to flush in his workload. He pointed out that when system memory is large enough to hold all btree nodes in memory, c->btree_cache is not a LRU-like list any more. Then the btree node with the oldest journal entry is very probably not close to the tail of the c->btree_cache list. In such a situation many more dirty btree nodes will be aggressively flushed before the target node is flushed. When a slow SATA SSD is used as the cache device, such over-aggressive flushing behavior will cause a performance regression. After spending a lot of time on debugging and diagnosis, I found the real situation is more complicated, and aggressively flushing dirty btree nodes from the tail of the c->btree_cache list is not a good solution. - When all btree nodes are cached in memory, c->btree_cache is not a LRU-like list, and the btree node with the oldest journal entry won't be close to the tail of the list. - There can be hundreds of dirty btree nodes referencing the oldest journal entry; before all of them are flushed, the oldest journal entry cannot be reclaimed. When the above two conditions are mixed together, simply flushing from the tail of the c->btree_cache list is really NOT a good idea. Fortunately there is still a chance to make btree_flush_write() work better. Here is how this patch avoids unnecessary btree node flushing, - Only acquire c->journal.lock when getting the oldest journal entry of the fifo c->journal.pin. In the remaining locations, check the journal entries locklessly, so their values can change on other cores in parallel. - In the list_for_each_entry_safe_reverse() loop, check the latest front pointer of the fifo c->journal.pin. If it is different from the original pointer, which we fetched while holding c->journal.lock, the oldest journal entry has been reclaimed on another core.
At this moment, all the selected dirty nodes recorded in the array btree_nodes[] have been flushed and are clean on other CPU cores, so it is unnecessary to iterate c->btree_cache any longer. Just quit the list_for_each_entry_safe_reverse() loop; the following for-loop will skip all the selected clean nodes. - Find a proper time to quit the list_for_each_entry_safe_reverse() loop. Check the refcount of the original fifo front pointer: if the value is larger than the number of nodes selected into btree_nodes[], more matching btree nodes should be scanned. Otherwise there are no more matching btree nodes in the rest of the c->btree_cache list and the loop can quit. If the original oldest journal entry is reclaimed and the fifo front pointer is updated, the refcount of the original fifo front pointer will be 0, and the loop quits too. - Do not hold c->bucket_lock for too long. c->bucket_lock is also required for space allocation for cached data, and holding it too long will block regular I/O requests. When iterating the list c->btree_cache, even if there are many matching btree nodes, in order not to hold c->bucket_lock for too long, only BTREE_FLUSH_NR nodes at most are selected to flush in the following for-loop. With this patch, only btree nodes referencing the oldest journal entry are flushed to the cache device; there is no more aggressive flushing of unnecessary btree nodes. And to avoid blocking regular I/O requests, each time btree_flush_write() is called, at most BTREE_FLUSH_NR btree nodes are selected to flush, even if there are more matching btree nodes in the list c->btree_cache. At last, one more thing to explain: why is it safe to read the front pointer of c->journal.pin without holding c->journal.lock inside the list_for_each_entry_safe_reverse() loop? Here is my answer: when reading the front pointer of the fifo c->journal.pin, we don't need to know its exact value; we just want to check whether it differs from the original front pointer (which is an accurate value because we fetched it while c->journal.lock was held). For this purpose, it works as expected without holding c->journal.lock. Even if the front pointer is changed on another CPU core and not yet visible to the local core, and the btree node currently being iterated holds a journal entry identical to the originally fetched fifo front pointer, it is still safe: after taking the mutex b->write_lock (with its memory barrier), this btree node will be found clean and skipped, and the loop will quit later when it iterates the next node of the list c->btree_cache.
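Distilled to its core, the pattern described above looks like this (simplified from the full btree_flush_write() in the diff below): snapshot the fifo front pointer once under c->journal.lock, then compare against it locklessly inside the scan loop and stop early as soon as it moves or enough nodes have been recorded.

spin_lock(&c->journal.lock);
fifo_front_p = &fifo_front(&c->journal.pin);
ref_nr = atomic_read(fifo_front_p);
spin_unlock(&c->journal.lock);

mutex_lock(&c->bucket_lock);
list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
	/* lockless: only a change of the pointer matters */
	if (&fifo_front(&c->journal.pin) != fifo_front_p)
		break;		/* oldest entry reclaimed elsewhere */
	if (nr >= atomic_read(fifo_front_p))
		break;		/* all matching nodes already recorded */
	/* ... select the node into btree_nodes[nr++] ... */
}
mutex_unlock(&c->bucket_lock);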
Fixes: 91be66e1318f ("bcache: performance improvement for btree_flush_write()") Reported-by: Guoju Fang Reported-by: Shuang Li Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 80 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index be2a2a201603..33ddc5269e8d 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -417,10 +417,14 @@ err: /* Journalling */ +#define nr_to_fifo_front(p, front_p, mask) (((p) - (front_p)) & (mask)) + static void btree_flush_write(struct cache_set *c) { struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; - unsigned int i, n; + unsigned int i, nr, ref_nr; + atomic_t *fifo_front_p, *now_fifo_front_p; + size_t mask; if (c->journal.btree_flushing) return; @@ -433,12 +437,50 @@ static void btree_flush_write(struct cache_set *c) c->journal.btree_flushing = true; spin_unlock(&c->journal.flush_write_lock); + /* get the oldest journal entry and check its refcount */ + spin_lock(&c->journal.lock); + fifo_front_p = &fifo_front(&c->journal.pin); + ref_nr = atomic_read(fifo_front_p); + if (ref_nr <= 0) { + /* + * do nothing if no btree node references + * the oldest journal entry + */ + spin_unlock(&c->journal.lock); + goto out; + } + spin_unlock(&c->journal.lock); + + mask = c->journal.pin.mask; + nr = 0; atomic_long_inc(&c->flush_write); memset(btree_nodes, 0, sizeof(btree_nodes)); - n = 0; mutex_lock(&c->bucket_lock); list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + /* + * It is safe to get now_fifo_front_p without holding + * c->journal.lock here, because we don't need to know + * the exact value, just check whether the front + * pointer of c->journal.pin has changed. + */ + now_fifo_front_p = &fifo_front(&c->journal.pin); + /* + * If the oldest journal entry is reclaimed and front + * pointer of c->journal.pin changes, it is unnecessary + * to scan c->btree_cache anymore, just quit the loop and + * flush out what we have already. + */ + if (now_fifo_front_p != fifo_front_p) + break; + /* + * quit this loop if all matching btree nodes are + * scanned and recorded in btree_nodes[] already. + */ + ref_nr = atomic_read(fifo_front_p); + if (nr >= ref_nr) + break; + if (btree_node_journal_flush(b)) pr_err("BUG: flush_write bit should not be set here!"); @@ -454,17 +496,44 @@ static void btree_flush_write(struct cache_set *c) continue; } + /* + * Only select the btree node which exactly references + * the oldest journal entry. + * + * If the journal entry pointed to by fifo_front_p is + * reclaimed in parallel, don't worry: + * - the list_for_each_xxx loop will quit when checking + * next now_fifo_front_p. + * - If there are matched nodes recorded in btree_nodes[], + * they are clean now (this is why and how the oldest + * journal entry can be reclaimed). These selected nodes + * will be ignored and skipped in the following for-loop. + */ + if (nr_to_fifo_front(btree_current_write(b)->journal, + fifo_front_p, + mask) != 0) { + mutex_unlock(&b->write_lock); + continue; + } + set_btree_node_journal_flush(b); mutex_unlock(&b->write_lock); - btree_nodes[n++] = b; + btree_nodes[nr++] = b; + /* + * To avoid holding c->bucket_lock for too long, + * only scan for BTREE_FLUSH_NR matched btree nodes + * at most.
If there are more btree nodes referencing + * the oldest journal entry, try to flush them next + * time when btree_flush_write() is called. + */ + if (nr == BTREE_FLUSH_NR) break; } mutex_unlock(&c->bucket_lock); - for (i = 0; i < n; i++) { + for (i = 0; i < nr; i++) { b = btree_nodes[i]; if (!b) { pr_err("BUG: btree_nodes[%d] is NULL", i); @@ -497,6 +566,7 @@ static void btree_flush_write(struct cache_set *c) mutex_unlock(&b->write_lock); } +out: spin_lock(&c->journal.flush_write_lock); c->journal.btree_flushing = false; spin_unlock(&c->journal.flush_write_lock); -- cgit From 125d98edd11464c8e0ec9eaaba7d682d0f832686 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:40 +0800 Subject: bcache: remove member accessed from struct btree The member 'accessed' of struct btree is used in bch_mca_scan() when shrinking btree node caches. The original idea is, if b->accessed is set, clear it and look at the next btree node cache in the c->btree_cache list, and only shrink the caches whose b->accessed is cleared. Then only cold btree node caches will be shrunk. But when I/O pressure is high, it is very probable that b->accessed of a btree node cache will be set again in bch_btree_node_get() before bch_mca_scan() selects it again. Then there is no chance for bch_mca_scan() to shrink enough memory back to the slub or slab system. This patch removes the member accessed from struct btree; once a btree node cache is selected, it is immediately shrunk. With this change, bch_mca_scan() may release btree node caches more efficiently. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 8 ++------ drivers/md/bcache/btree.h | 2 -- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 14d6c33b0957..357535a5c89c 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -754,14 +754,12 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, b = list_first_entry(&c->btree_cache, struct btree, list); list_rotate_left(&c->btree_cache); - if (!b->accessed && - !mca_reap(b, 0, false)) { + if (!mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); freed++; - } else - b->accessed = 0; + } } out: mutex_unlock(&c->bucket_lock); @@ -1069,7 +1067,6 @@ retry: BUG_ON(!b->written); b->parent = parent; - b->accessed = 1; for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { prefetch(b->keys.set[i].tree); @@ -1160,7 +1157,6 @@ retry: goto retry; } - b->accessed = 1; b->parent = parent; bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 76cfd121a486..f4dcca449391 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -121,8 +121,6 @@ struct btree { /* Key/pointer for this btree node */ BKEY_PADDED(key); - /* Single bit - set when accessed, cleared by shrinker */ - unsigned long accessed; unsigned long seq; struct rw_semaphore lock; struct cache_set *c; -- cgit From d5c9c470b01177e4d90cdbf178b8c7f37f5b8795 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:41 +0800 Subject: bcache: reap c->btree_cache_freeable from the tail in bch_mca_scan() In order to skip the most recently freed btree node caches, bch_mca_scan() currently skips the first 3 caches in the c->btree_cache_freeable list when shrinking bcache node caches.
The related code in bch_mca_scan() is, 737 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { 738 if (nr <= 0) 739 goto out; 740 741 if (++i > 3 && 742 !mca_reap(b, 0, false)) { lines free cache memory 746 } 747 nr--; 748 } The problem is that if the virtual memory code calls bch_mca_scan() and the calculated 'nr' is 1 or 2, then in the above loop nothing will be shrunk. In such a case, if the slub/slab manager calls bch_mca_scan() many times with a small scan number, it does not help to shrink cache memory and just wastes CPU cycles. This patch just selects btree node caches from the tail of the c->btree_cache_freeable list, so the newly freed host cache can still be allocated by mca_alloc(), and at least 1 node can be shrunk. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 357535a5c89c..c3a314deb09d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -734,17 +734,17 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, i = 0; btree_cache_used = c->btree_cache_used; - list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { + list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) { if (nr <= 0) goto out; - if (++i > 3 && - !mca_reap(b, 0, false)) { + if (!mca_reap(b, 0, false)) { mca_data_free(b); rw_unlock(true, b); freed++; } nr--; + i++; } for (; (nr--) && i < btree_cache_used; i++) { -- cgit From e3de04469a49ee09c89e80bf821508df458ccee6 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 24 Jan 2020 01:01:42 +0800 Subject: bcache: reap from tail of c->btree_cache in bch_mca_scan() When shrinking btree node caches from c->btree_cache in bch_mca_scan(), the selected node is rotated from the head to the tail of the c->btree_cache list whether or not it is reaped. But in the bcache journal code, when flushing the btree nodes with the oldest journal entry, btree nodes are iterated and selected from the tail of the c->btree_cache list in btree_flush_write(). The list_rotate_left() in bch_mca_scan() will make btree_flush_write() iterate more nodes in the c->btree_cache list in reverse order. This patch just reaps the selected btree node cache and does not move it from the head to the tail of the c->btree_cache list. Then bch_mca_scan() will not mess up the c->btree_cache list for btree_flush_write(). Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index c3a314deb09d..fa872df4e770 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -747,19 +747,19 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, i++; } - for (; (nr--) && i < btree_cache_used; i++) { - if (list_empty(&c->btree_cache)) + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + if (nr <= 0 || i >= btree_cache_used) goto out; - b = list_first_entry(&c->btree_cache, struct btree, list); - list_rotate_left(&c->btree_cache); - if (!mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); freed++; } + + nr--; + i++; } out: mutex_unlock(&c->bucket_lock); -- cgit