From 3d8d32873c7b6d9cec5b40c2ddb8c7c55961694f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 27 Sep 2023 14:12:40 +0800 Subject: md: factor out a helper from mddev_put() There are no functional changes, prepare to simplify md_seq_ops in next patch. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230927061241.1552837-2-yukuai1@huaweicloud.com --- drivers/md/md.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 76e2cf609883..aad91cd72a06 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -616,23 +616,28 @@ static inline struct mddev *mddev_get(struct mddev *mddev) static void mddev_delayed_delete(struct work_struct *ws); +static void __mddev_put(struct mddev *mddev) +{ + if (mddev->raid_disks || !list_empty(&mddev->disks) || + mddev->ctime || mddev->hold_active) + return; + + /* Array is not configured at all, and not held active, so destroy it */ + set_bit(MD_DELETED, &mddev->flags); + + /* + * Call queue_work inside the spinlock so that flush_workqueue() after + * mddev_find will succeed in waiting for the work to be done. + */ + queue_work(md_misc_wq, &mddev->del_work); +} + void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; - if (!mddev->raid_disks && list_empty(&mddev->disks) && - mddev->ctime == 0 && !mddev->hold_active) { - /* Array is not configured at all, and not held active, - * so destroy it */ - set_bit(MD_DELETED, &mddev->flags); - /* - * Call queue_work inside the spinlock so that - * flush_workqueue() after mddev_find will succeed in waiting - * for the work to be done. - */ - queue_work(md_misc_wq, &mddev->del_work); - } + __mddev_put(mddev); spin_unlock(&all_mddevs_lock); } -- cgit From cf1b6d4441fffd0ba8ae4ced6a12f578c95ca049 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 27 Sep 2023 14:12:41 +0800 Subject: md: simplify md_seq_ops Before this patch, the implementation is hacky and hard to understand: 1) md_seq_start set pos to 1; 2) md_seq_show found pos is 1, then print Personalities; 3) md_seq_next found pos is 1, then it update pos to the first mddev; 4) md_seq_show found pos is not 1 or 2, show mddev; 5) md_seq_next found pos is not 1 or 2, update pos to next mddev; 6) loop 4-5 until the last mddev, then md_seq_next update pos to 2; 7) md_seq_show found pos is 2, then print unused devices; 8) md_seq_next found pos is 2, stop; This patch remove the magic value and use seq_list_start/next/stop() directly, and move printing "Personalities" to md_seq_start(), "unsed devices" to md_seq_stop(): 1) md_seq_start print Personalities, and then set pos to first mddev; 2) md_seq_show show mddev; 3) md_seq_next update pos to next mddev; 4) loop 2-3 until the last mddev; 5) md_seq_stop print unsed devices; Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230927061241.1552837-3-yukuai1@huaweicloud.com --- drivers/md/md.c | 100 +++++++++++++------------------------------------------- 1 file changed, 22 insertions(+), 78 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index aad91cd72a06..38a6767c65b1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8213,105 +8213,46 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) } static void *md_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(&all_mddevs_lock) { - struct list_head *tmp; - loff_t l = *pos; - struct mddev *mddev; + struct md_personality *pers; - if (l == 0x10000) { - ++*pos; - return (void *)2; - } - if (l > 0x10000) - return NULL; - if (!l--) - /* header */ - return (void*)1; + seq_puts(seq, "Personalities : "); + spin_lock(&pers_lock); + list_for_each_entry(pers, &pers_list, list) + seq_printf(seq, "[%s] ", pers->name); + + spin_unlock(&pers_lock); + seq_puts(seq, "\n"); + seq->poll_event = atomic_read(&md_event_count); spin_lock(&all_mddevs_lock); - list_for_each(tmp,&all_mddevs) - if (!l--) { - mddev = list_entry(tmp, struct mddev, all_mddevs); - if (!mddev_get(mddev)) - continue; - spin_unlock(&all_mddevs_lock); - return mddev; - } - spin_unlock(&all_mddevs_lock); - if (!l--) - return (void*)2;/* tail */ - return NULL; + + return seq_list_start(&all_mddevs, *pos); } static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct list_head *tmp; - struct mddev *next_mddev, *mddev = v; - struct mddev *to_put = NULL; - - ++*pos; - if (v == (void*)2) - return NULL; - - spin_lock(&all_mddevs_lock); - if (v == (void*)1) { - tmp = all_mddevs.next; - } else { - to_put = mddev; - tmp = mddev->all_mddevs.next; - } - - for (;;) { - if (tmp == &all_mddevs) { - next_mddev = (void*)2; - *pos = 0x10000; - break; - } - next_mddev = list_entry(tmp, struct mddev, all_mddevs); - if (mddev_get(next_mddev)) - break; - mddev = next_mddev; - tmp = mddev->all_mddevs.next; - } - spin_unlock(&all_mddevs_lock); - - if (to_put) - mddev_put(to_put); - return next_mddev; - + return seq_list_next(v, &all_mddevs, pos); } static void md_seq_stop(struct seq_file *seq, void *v) + __releases(&all_mddevs_lock) { - struct mddev *mddev = v; - - if (mddev && v != (void*)1 && v != (void*)2) - mddev_put(mddev); + status_unused(seq); + spin_unlock(&all_mddevs_lock); } static int md_seq_show(struct seq_file *seq, void *v) { - struct mddev *mddev = v; + struct mddev *mddev = list_entry(v, struct mddev, all_mddevs); sector_t sectors; struct md_rdev *rdev; - if (v == (void*)1) { - struct md_personality *pers; - seq_printf(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - - spin_unlock(&pers_lock); - seq_printf(seq, "\n"); - seq->poll_event = atomic_read(&md_event_count); + if (!mddev_get(mddev)) return 0; - } - if (v == (void*)2) { - status_unused(seq); - return 0; - } + spin_unlock(&all_mddevs_lock); spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), @@ -8382,6 +8323,9 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } spin_unlock(&mddev->lock); + spin_lock(&all_mddevs_lock); + if (atomic_dec_and_test(&mddev->active)) + __mddev_put(mddev); return 0; } -- cgit From 09f894affcf2daac5aa841ffff43d1242215fd80 Mon Sep 17 00:00:00 2001 From: Mariusz Tkaczyk Date: Thu, 28 Sep 2023 14:55:17 +0200 Subject: md: do not require mddev_lock() for all options in array_state_store() We don't need to lock device to reject not supported request in array_state_store(). No functional changes intended. There are differences between ioctl and sysfs handling during stopping. With this change, it will be easier to add additional steps which needs to be completed before mddev_lock() is taken. Reviewed-by: Yu Kuai Signed-off-by: Mariusz Tkaczyk Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230928125517.12356-1-mariusz.tkaczyk@linux.intel.com --- drivers/md/md.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 38a6767c65b1..78853b05f99c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4415,6 +4415,18 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) int err = 0; enum array_state st = match_word(buf, array_states); + /* No lock dependent actions */ + switch (st) { + case suspended: /* not supported yet */ + case write_pending: /* cannot be set */ + case active_idle: /* cannot be set */ + case broken: /* cannot be set */ + case bad_word: + return -EINVAL; + default: + break; + } + if (mddev->pers && (st == active || st == clean) && mddev->ro != MD_RDONLY) { /* don't take reconfig_mutex when toggling between @@ -4439,23 +4451,16 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = mddev_lock(mddev); if (err) return err; - err = -EINVAL; - switch(st) { - case bad_word: - break; - case clear: - /* stopping an active array */ - err = do_md_stop(mddev, 0, NULL); - break; + + switch (st) { case inactive: - /* stopping an active array */ + /* stop an active array, return 0 otherwise */ if (mddev->pers) err = do_md_stop(mddev, 2, NULL); - else - err = 0; /* already inactive */ break; - case suspended: - break; /* not supported yet */ + case clear: + err = do_md_stop(mddev, 0, NULL); + break; case readonly: if (mddev->pers) err = md_set_readonly(mddev, NULL); @@ -4506,10 +4511,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = do_md_run(mddev); } break; - case write_pending: - case active_idle: - case broken: - /* these cannot be set */ + default: + err = -EINVAL; break; } -- cgit From 9e55a22fce1384837c213274d1a3b93be16ed9d7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 7 Oct 2023 19:21:05 +0800 Subject: md/raid1: don't split discard io for write behind Currently, discad io is treated the same as normal write io, and for write behind case, io size is limited to: BIO_MAX_VECS * (PAGE_SIZE >> 9) For 0.5KB sector size and 4KB PAGE_SIZE, this is just 1MB. For consequence, if 'WriteMostly' is set to one of the underlying disks, then diskcard io will be splited into 1MB and it will take a long time for the diskcard to finish. Fix this problem by disable write behind for discard io. Reported-by: Roman Mamedov Closes: https://lore.kernel.org/all/6a1165f7-c792-c054-b8f0-1ad4f7b8ae01@ultracoder.org/ Reported-and-tested-by: Kirill Kirilenko Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231007112105.407449-1-yukuai1@huaweicloud.com --- drivers/md/raid1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3a78f79ee6d5..35d12948e0a9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1345,6 +1345,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, int first_clone; int max_sectors; bool write_behind = false; + bool is_discard = (bio_op(bio) == REQ_OP_DISCARD); if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1405,7 +1406,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * write-mostly, which means we could allocate write behind * bio later. */ - if (rdev && test_bit(WriteMostly, &rdev->flags)) + if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags)) write_behind = true; if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { -- cgit From 617787f1386da5e851dad68926a31522134d352b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:40 +0800 Subject: md: use READ_ONCE/WRITE_ONCE for 'suspend_lo' and 'suspend_hi' Protect 'suspend_lo' and 'suspend_hi' with READ_ONCE/WRITE_ONCE to prevent reading abnormal values. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-2-yukuai1@huaweicloud.com --- drivers/md/md.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 78853b05f99c..930bcabeec79 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -359,11 +359,11 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio) return true; if (bio_data_dir(bio) != WRITE) return false; - if (mddev->suspend_lo >= mddev->suspend_hi) + if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) return false; - if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) return false; - if (bio_end_sector(bio) < mddev->suspend_lo) + if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) return false; return true; } @@ -5179,7 +5179,8 @@ __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); static ssize_t suspend_lo_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); + return sprintf(page, "%llu\n", + (unsigned long long)READ_ONCE(mddev->suspend_lo)); } static ssize_t @@ -5199,7 +5200,7 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) return err; mddev_suspend(mddev); - mddev->suspend_lo = new; + WRITE_ONCE(mddev->suspend_lo, new); mddev_resume(mddev); mddev_unlock(mddev); @@ -5211,7 +5212,8 @@ __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); static ssize_t suspend_hi_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); + return sprintf(page, "%llu\n", + (unsigned long long)READ_ONCE(mddev->suspend_hi)); } static ssize_t @@ -5231,7 +5233,7 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) return err; mddev_suspend(mddev); - mddev->suspend_hi = new; + WRITE_ONCE(mddev->suspend_hi, new); mddev_resume(mddev); mddev_unlock(mddev); -- cgit From 06a4d0d8c642b5ea654e832b74dca12965356da0 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:41 +0800 Subject: md/raid5-cache: use READ_ONCE/WRITE_ONCE for 'conf->log' 'conf->log' is set with 'reconfig_mutex' grabbed, however, readers are not procted, hence protect it with READ_ONCE/WRITE_ONCE to prevent reading abnormal values. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-3-yukuai1@huaweicloud.com --- drivers/md/raid5-cache.c | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 518b7cfa78b9..889bba60d6ff 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -327,8 +327,9 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space); void r5c_check_stripe_cache_usage(struct r5conf *conf) { int total_cached; + struct r5l_log *log = READ_ONCE(conf->log); - if (!r5c_is_writeback(conf->log)) + if (!r5c_is_writeback(log)) return; total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + @@ -344,7 +345,7 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf) */ if (total_cached > conf->min_nr_stripes * 1 / 2 || atomic_read(&conf->empty_inactive_list_nr) > 0) - r5l_wake_reclaim(conf->log, 0); + r5l_wake_reclaim(log, 0); } /* @@ -353,7 +354,9 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf) */ void r5c_check_cached_full_stripe(struct r5conf *conf) { - if (!r5c_is_writeback(conf->log)) + struct r5l_log *log = READ_ONCE(conf->log); + + if (!r5c_is_writeback(log)) return; /* @@ -363,7 +366,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) if (atomic_read(&conf->r5c_cached_full_stripes) >= min(R5C_FULL_STRIPE_FLUSH_BATCH(conf), conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf))) - r5l_wake_reclaim(conf->log, 0); + r5l_wake_reclaim(log, 0); } /* @@ -396,7 +399,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) */ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); if (!r5c_is_writeback(log)) return 0; @@ -449,7 +452,7 @@ static inline void r5c_update_log_state(struct r5l_log *log) void r5c_make_stripe_write_out(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); BUG_ON(!r5c_is_writeback(log)); @@ -491,7 +494,7 @@ static void r5c_handle_parity_cached(struct stripe_head *sh) */ static void r5c_finish_cache_stripe(struct stripe_head *sh) { - struct r5l_log *log = sh->raid_conf->log; + struct r5l_log *log = READ_ONCE(sh->raid_conf->log); if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); @@ -692,7 +695,7 @@ static void r5c_disable_writeback_async(struct work_struct *work) /* wait superblock change before suspend */ wait_event(mddev->sb_wait, - conf->log == NULL || + !READ_ONCE(conf->log) || (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && (locked = mddev_trylock(mddev)))); if (locked) { @@ -1151,7 +1154,7 @@ static void r5l_run_no_space_stripes(struct r5l_log *log) static sector_t r5c_calculate_new_cp(struct r5conf *conf) { struct stripe_head *sh; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); sector_t new_cp; unsigned long flags; @@ -1159,12 +1162,12 @@ static sector_t r5c_calculate_new_cp(struct r5conf *conf) return log->next_checkpoint; spin_lock_irqsave(&log->stripe_in_journal_lock, flags); - if (list_empty(&conf->log->stripe_in_journal_list)) { + if (list_empty(&log->stripe_in_journal_list)) { /* all stripes flushed */ spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); return log->next_checkpoint; } - sh = list_first_entry(&conf->log->stripe_in_journal_list, + sh = list_first_entry(&log->stripe_in_journal_list, struct stripe_head, r5c); new_cp = sh->log_start; spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); @@ -1399,7 +1402,7 @@ void r5c_flush_cache(struct r5conf *conf, int num) struct stripe_head *sh, *next; lockdep_assert_held(&conf->device_lock); - if (!conf->log) + if (!READ_ONCE(conf->log)) return; count = 0; @@ -1420,7 +1423,7 @@ void r5c_flush_cache(struct r5conf *conf, int num) static void r5c_do_reclaim(struct r5conf *conf) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); struct stripe_head *sh; int count = 0; unsigned long flags; @@ -1549,7 +1552,7 @@ static void r5l_reclaim_thread(struct md_thread *thread) { struct mddev *mddev = thread->mddev; struct r5conf *conf = mddev->private; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); if (!log) return; @@ -1591,7 +1594,7 @@ void r5l_quiesce(struct r5l_log *log, int quiesce) bool r5l_log_disk_error(struct r5conf *conf) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); /* don't allow write if journal disk is missing */ if (!log) @@ -2635,7 +2638,7 @@ int r5c_try_caching_write(struct r5conf *conf, struct stripe_head_state *s, int disks) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); int i; struct r5dev *dev; int to_cache = 0; @@ -2802,7 +2805,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); int i; int do_wakeup = 0; sector_t tree_index; @@ -2941,7 +2944,7 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) /* check whether this big stripe is in write back cache. */ bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) { - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); sector_t tree_index; void *slot; @@ -3049,14 +3052,14 @@ int r5l_start(struct r5l_log *log) void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) { struct r5conf *conf = mddev->private; - struct r5l_log *log = conf->log; + struct r5l_log *log = READ_ONCE(conf->log); if (!log) return; if ((raid5_calc_degraded(conf) > 0 || test_bit(Journal, &rdev->flags)) && - conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) + log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) schedule_work(&log->disable_writeback_work); } @@ -3145,7 +3148,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->stripe_in_journal_lock); atomic_set(&log->stripe_in_journal_count, 0); - conf->log = log; + WRITE_ONCE(conf->log, log); set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; @@ -3173,7 +3176,7 @@ void r5l_exit_log(struct r5conf *conf) * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to * ensure disable_writeback_work wakes up and exits. */ - conf->log = NULL; + WRITE_ONCE(conf->log, NULL); wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); -- cgit From 2e82248b70f48a98f3720cdaa8ea7a4e7f8bd760 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:42 +0800 Subject: md: replace is_md_suspended() with 'mddev->suspended' in md_check_recovery() Prepare to cleanup pers->prepare_suspend(), which is used to fix a deadlock in raid456 by returning error for io that is waiting for reshape to make progress in mddev_suspend(). This change will allow reshape to make progress while waiting for io to be done in mddev_suspend() in following patches. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-4-yukuai1@huaweicloud.com --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 930bcabeec79..cf929c42cdbe 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9418,7 +9418,7 @@ void md_check_recovery(struct mddev *mddev) wake_up(&mddev->sb_wait); } - if (is_md_suspended(mddev)) + if (READ_ONCE(mddev->suspended)) return; if (mddev->bitmap) -- cgit From 714d20150ed85811193ae07a494d91f9927c590f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:43 +0800 Subject: md: add new helpers to suspend/resume array Advantages for new apis: - reconfig_mutex is not required; - the weird logical that suspend array hold 'reconfig_mutex' for mddev_check_recovery() to update superblock is not needed; - the specail handling, 'pers->prepare_suspend', for raid456 is not needed; - It's safe to be called at any time once mddev is allocated, and it's designed to be used from slow path where array configuration is changed; - the new helpers is designed to be called before mddev_lock(), hence it support to be interrupted by user as well. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-5-yukuai1@huaweicloud.com --- drivers/md/md.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- drivers/md/md.h | 3 ++ 2 files changed, 103 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index cf929c42cdbe..201de29d913c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -443,12 +443,22 @@ void mddev_suspend(struct mddev *mddev) lockdep_is_held(&mddev->reconfig_mutex)); WARN_ON_ONCE(thread && current == thread->tsk); - if (mddev->suspended++) + + /* can't concurrent with __mddev_suspend() and __mddev_resume() */ + mutex_lock(&mddev->suspend_mutex); + if (mddev->suspended++) { + mutex_unlock(&mddev->suspend_mutex); return; + } + wake_up(&mddev->sb_wait); set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); percpu_ref_kill(&mddev->active_io); + /* + * TODO: cleanup 'pers->prepare_suspend after all callers are replaced + * by __mddev_suspend(). + */ if (mddev->pers && mddev->pers->prepare_suspend) mddev->pers->prepare_suspend(mddev); @@ -459,14 +469,21 @@ void mddev_suspend(struct mddev *mddev) del_timer_sync(&mddev->safemode_timer); /* restrict memory reclaim I/O during raid array is suspend */ mddev->noio_flag = memalloc_noio_save(); + + mutex_unlock(&mddev->suspend_mutex); } EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { lockdep_assert_held(&mddev->reconfig_mutex); - if (--mddev->suspended) + + /* can't concurrent with __mddev_suspend() and __mddev_resume() */ + mutex_lock(&mddev->suspend_mutex); + if (--mddev->suspended) { + mutex_unlock(&mddev->suspend_mutex); return; + } /* entred the memalloc scope from mddev_suspend() */ memalloc_noio_restore(mddev->noio_flag); @@ -477,9 +494,89 @@ void mddev_resume(struct mddev *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + + mutex_unlock(&mddev->suspend_mutex); } EXPORT_SYMBOL_GPL(mddev_resume); +int __mddev_suspend(struct mddev *mddev, bool interruptible) +{ + int err = 0; + + /* + * hold reconfig_mutex to wait for normal io will deadlock, because + * other context can't update super_block, and normal io can rely on + * updating super_block. + */ + lockdep_assert_not_held(&mddev->reconfig_mutex); + + if (interruptible) + err = mutex_lock_interruptible(&mddev->suspend_mutex); + else + mutex_lock(&mddev->suspend_mutex); + if (err) + return err; + + if (mddev->suspended) { + WRITE_ONCE(mddev->suspended, mddev->suspended + 1); + mutex_unlock(&mddev->suspend_mutex); + return 0; + } + + percpu_ref_kill(&mddev->active_io); + if (interruptible) + err = wait_event_interruptible(mddev->sb_wait, + percpu_ref_is_zero(&mddev->active_io)); + else + wait_event(mddev->sb_wait, + percpu_ref_is_zero(&mddev->active_io)); + if (err) { + percpu_ref_resurrect(&mddev->active_io); + mutex_unlock(&mddev->suspend_mutex); + return err; + } + + /* + * For raid456, io might be waiting for reshape to make progress, + * allow new reshape to start while waiting for io to be done to + * prevent deadlock. + */ + WRITE_ONCE(mddev->suspended, mddev->suspended + 1); + + del_timer_sync(&mddev->safemode_timer); + /* restrict memory reclaim I/O during raid array is suspend */ + mddev->noio_flag = memalloc_noio_save(); + + mutex_unlock(&mddev->suspend_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(__mddev_suspend); + +void __mddev_resume(struct mddev *mddev) +{ + lockdep_assert_not_held(&mddev->reconfig_mutex); + + mutex_lock(&mddev->suspend_mutex); + WRITE_ONCE(mddev->suspended, mddev->suspended - 1); + if (mddev->suspended) { + mutex_unlock(&mddev->suspend_mutex); + return; + } + + /* entred the memalloc scope from __mddev_suspend() */ + memalloc_noio_restore(mddev->noio_flag); + + percpu_ref_resurrect(&mddev->active_io); + wake_up(&mddev->sb_wait); + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + + mutex_unlock(&mddev->suspend_mutex); +} +EXPORT_SYMBOL_GPL(__mddev_resume); + /* * Generic flush handling for md */ @@ -672,6 +769,7 @@ int mddev_init(struct mddev *mddev) mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->sync_mutex); + mutex_init(&mddev->suspend_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); diff --git a/drivers/md/md.h b/drivers/md/md.h index b628c292506e..b5894dc64615 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -316,6 +316,7 @@ struct mddev { unsigned long sb_flags; int suspended; + struct mutex suspend_mutex; struct percpu_ref active_io; int ro; int sysfs_active; /* set when sysfs deletes @@ -811,6 +812,8 @@ extern void md_rdev_clear(struct md_rdev *rdev); extern void md_handle_request(struct mddev *mddev, struct bio *bio); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); +extern int __mddev_suspend(struct mddev *mddev, bool interruptible); +extern void __mddev_resume(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); -- cgit From f45461e24febab5c2ba8f331e9eb109bb59cbdc1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:44 +0800 Subject: md: add new helpers to suspend/resume and lock/unlock array The new helpers suspend the array first and then lock the array, Prepare to refactor from: mddev_lock/lock_nointr mddev_suspend ... mddev_resuem mddev_lock With: mddev_suspend_and_lock/lock_nointr ... mddev_unlock_and_resume After all the use cases is refactored, mddev_suspend/resume() will be removed. And mddev_suspend_and_lock() will also replace mddev_lock() for the case that the array will be reconfigured, in order to synchronize with io to prevent problems in many corner cases. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-6-yukuai1@huaweicloud.com --- drivers/md/md.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/md/md.h b/drivers/md/md.h index b5894dc64615..5c8f3f045e78 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -858,6 +858,33 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio mddev->queue->limits.max_write_zeroes_sectors = 0; } +static inline int mddev_suspend_and_lock(struct mddev *mddev) +{ + int ret; + + ret = __mddev_suspend(mddev, true); + if (ret) + return ret; + + ret = mddev_lock(mddev); + if (ret) + __mddev_resume(mddev); + + return ret; +} + +static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev) +{ + __mddev_suspend(mddev, false); + mutex_lock(&mddev->reconfig_mutex); +} + +static inline void mddev_unlock_and_resume(struct mddev *mddev) +{ + mddev_unlock(mddev); + __mddev_resume(mddev); +} + struct mdu_array_info_s; struct mdu_disk_info_s; -- cgit From 4eb3327aa28f3a737c2d3f7e35e83575f1d52283 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:45 +0800 Subject: md/dm-raid: use new apis to suspend array Convert to use new apis, the old apis will be removed eventually. These are not hot path, so performance is not concerned. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-7-yukuai1@huaweicloud.com --- drivers/md/dm-raid.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 69805d37e113..05dd6ccf6f48 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3244,7 +3244,7 @@ size_check: set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); /* Has to be held on running the array */ - mddev_lock_nointr(&rs->md); + mddev_suspend_and_lock_nointr(&rs->md); r = md_run(&rs->md); rs->md.in_sync = 0; /* Assume already marked dirty */ if (r) { @@ -3268,7 +3268,6 @@ size_check: } } - mddev_suspend(&rs->md); set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags); /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ @@ -3798,9 +3797,7 @@ static void raid_postsuspend(struct dm_target *ti) if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) md_stop_writes(&rs->md); - mddev_lock_nointr(&rs->md); - mddev_suspend(&rs->md); - mddev_unlock(&rs->md); + __mddev_suspend(&rs->md, false); } } @@ -4012,7 +4009,7 @@ static int raid_preresume(struct dm_target *ti) } /* Check for any resize/reshape on @rs and adjust/initiate */ - /* Be prepared for mddev_resume() in raid_resume() */ + /* Be prepared for __mddev_resume() in raid_resume() */ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -4059,8 +4056,7 @@ static void raid_resume(struct dm_target *ti) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); mddev->ro = 0; mddev->in_sync = 0; - mddev_resume(mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } } -- cgit From 3cddf86a57a58441eb019d21acea147f043dc13c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:46 +0800 Subject: md/md-bitmap: use new apis to suspend array for location_store() Convert to use new apis, the old apis will be removed eventually. This is not hot path, so performance is not concerned. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-8-yukuai1@huaweicloud.com --- drivers/md/md-bitmap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 0c661e5036bb..7d21e2a5b06e 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2348,11 +2348,10 @@ location_store(struct mddev *mddev, const char *buf, size_t len) { int rv; - rv = mddev_lock(mddev); + rv = mddev_suspend_and_lock(mddev); if (rv) return rv; - mddev_suspend(mddev); if (mddev->pers) { if (mddev->recovery || mddev->sync_thread) { rv = -EBUSY; @@ -2429,8 +2428,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } rv = 0; out: - mddev_resume(mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); if (rv) return rv; return len; -- cgit From 1b172e0b11c00e89c5df72c2761b3d4d279fbb4d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:47 +0800 Subject: md/raid5-cache: use new apis to suspend array Convert to use new apis, the old apis will be removed eventually. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-9-yukuai1@huaweicloud.com --- drivers/md/raid5-cache.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 889bba60d6ff..9909110262ee 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -686,7 +686,6 @@ static void r5c_disable_writeback_async(struct work_struct *work) disable_writeback_work); struct mddev *mddev = log->rdev->mddev; struct r5conf *conf = mddev->private; - int locked = 0; if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; @@ -696,13 +695,13 @@ static void r5c_disable_writeback_async(struct work_struct *work) /* wait superblock change before suspend */ wait_event(mddev->sb_wait, !READ_ONCE(conf->log) || - (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && - (locked = mddev_trylock(mddev)))); - if (locked) { - mddev_suspend(mddev); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + + log = READ_ONCE(conf->log); + if (log) { + __mddev_suspend(mddev, false); log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; - mddev_resume(mddev); - mddev_unlock(mddev); + __mddev_resume(mddev); } } @@ -2586,9 +2585,7 @@ int r5c_journal_mode_set(struct mddev *mddev, int mode) mode == R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; - mddev_suspend(mddev); conf->log->r5c_journal_mode = mode; - mddev_resume(mddev); pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", mdname(mddev), mode, r5c_journal_mode_str[mode]); @@ -2613,11 +2610,11 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev, if (strlen(r5c_journal_mode_str[mode]) == len && !strncmp(page, r5c_journal_mode_str[mode], len)) break; - ret = mddev_lock(mddev); + ret = mddev_suspend_and_lock(mddev); if (ret) return ret; ret = r5c_journal_mode_set(mddev, mode); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return ret ?: length; } -- cgit From e28ca92fbb5c6ed8fac5db65e02947bf020c5b60 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:48 +0800 Subject: md/raid5: use new apis to suspend array Convert to use new apis, the old apis will be removed eventually. These are not hot path, so performance is not concerned. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-10-yukuai1@huaweicloud.com --- drivers/md/raid5.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6383723468e5..e6b8c0145648 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7025,7 +7025,7 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) new != roundup_pow_of_two(new)) return -EINVAL; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; @@ -7049,7 +7049,6 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) goto out_unlock; } - mddev_suspend(mddev); mutex_lock(&conf->cache_size_mutex); size = conf->max_nr_stripes; @@ -7064,10 +7063,9 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) err = -ENOMEM; } mutex_unlock(&conf->cache_size_mutex); - mddev_resume(mddev); out_unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err ?: len; } @@ -7153,7 +7151,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) return -EINVAL; new = !!new; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; conf = mddev->private; @@ -7162,15 +7160,13 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) else if (new != conf->skip_copy) { struct request_queue *q = mddev->queue; - mddev_suspend(mddev); conf->skip_copy = new; if (new) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); else blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); - mddev_resume(mddev); } - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err ?: len; } @@ -7225,15 +7221,13 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (new > 8192) return -EINVAL; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; conf = mddev->private; if (!conf) err = -ENODEV; else if (new != conf->worker_cnt_per_group) { - mddev_suspend(mddev); - old_groups = conf->worker_groups; if (old_groups) flush_workqueue(raid5_wq); @@ -7250,9 +7244,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) kfree(old_groups[0].workers); kfree(old_groups); } - mddev_resume(mddev); } - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err ?: len; } @@ -8974,12 +8967,12 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) struct r5conf *conf; int err; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; conf = mddev->private; if (!conf) { - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return -ENODEV; } @@ -8989,19 +8982,14 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) err = log_init(conf, NULL, true); if (!err) { err = resize_stripes(conf, conf->pool_size); - if (err) { - mddev_suspend(mddev); + if (err) log_exit(conf); - mddev_resume(mddev); - } } } else err = -EINVAL; } else if (strncmp(buf, "resync", 6) == 0) { if (raid5_has_ppl(conf)) { - mddev_suspend(mddev); log_exit(conf); - mddev_resume(mddev); err = resize_stripes(conf, conf->pool_size); } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && r5l_log_disk_error(conf)) { @@ -9014,11 +9002,9 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) break; } - if (!journal_dev_exists) { - mddev_suspend(mddev); + if (!journal_dev_exists) clear_bit(MD_HAS_JOURNAL, &mddev->flags); - mddev_resume(mddev); - } else /* need remove journal device first */ + else /* need remove journal device first */ err = -EBUSY; } else err = -EINVAL; @@ -9029,7 +9015,7 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) if (!err) md_update_sb(mddev, 1); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err; } -- cgit From 205669f37770772c1ae8c2dac1eba6da521a8b77 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:49 +0800 Subject: md: use new apis to suspend array for sysfs apis Convert to use new apis in following sysfs apis: - level_store - suspend_lo_store - suspend_hi_store - serialize_policy_store These are not hot path, so performance is not concerned. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-11-yukuai1@huaweicloud.com --- drivers/md/md.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 201de29d913c..aa08b9b78332 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4019,7 +4019,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (slen == 0 || slen >= sizeof(clevel)) return -EINVAL; - rv = mddev_lock(mddev); + rv = mddev_suspend_and_lock(mddev); if (rv) return rv; @@ -4112,7 +4112,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } /* Looks like we have a winner */ - mddev_suspend(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); @@ -4198,14 +4197,13 @@ level_store(struct mddev *mddev, const char *buf, size_t len) blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - mddev_resume(mddev); if (!mddev->thread) md_update_sb(mddev, 1); sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(); rv = len; out_unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return rv; } @@ -5293,15 +5291,13 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (new != (sector_t)new) return -EINVAL; - err = mddev_lock(mddev); + err = __mddev_suspend(mddev, true); if (err) return err; - mddev_suspend(mddev); WRITE_ONCE(mddev->suspend_lo, new); - mddev_resume(mddev); + __mddev_resume(mddev); - mddev_unlock(mddev); return len; } static struct md_sysfs_entry md_suspend_lo = @@ -5326,15 +5322,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (new != (sector_t)new) return -EINVAL; - err = mddev_lock(mddev); + err = __mddev_suspend(mddev, true); if (err) return err; - mddev_suspend(mddev); WRITE_ONCE(mddev->suspend_hi, new); - mddev_resume(mddev); + __mddev_resume(mddev); - mddev_unlock(mddev); return len; } static struct md_sysfs_entry md_suspend_hi = @@ -5582,7 +5576,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) if (value == mddev->serialize_policy) return len; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->pers == NULL || (mddev->pers->level != 1)) { @@ -5591,15 +5585,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; } - mddev_suspend(mddev); if (value) mddev_create_serial_pool(mddev, NULL, true); else mddev_destroy_serial_pool(mddev, NULL, true); mddev->serialize_policy = value; - mddev_resume(mddev); unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return err ?: len; } -- cgit From cfa078c8b80d0daf8f2fd4a2ab8e26fa8c33bca1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:50 +0800 Subject: md: use new apis to suspend array for adding/removing rdev from state_store() User can write 'remove' and 're-add' to trigger array reconfiguration through sysfs, suspend array in this case so that io won't concurrent with array reconfiguration. And now that all the caller of add_bound_rdev() alread suspend the array, remove mddev_suspend/resume() from add_bound_rdev() as well. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-12-yukuai1@huaweicloud.com --- drivers/md/md.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index aa08b9b78332..56523bac5140 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2940,11 +2940,7 @@ static int add_bound_rdev(struct md_rdev *rdev) */ super_types[mddev->major_version]. validate_super(mddev, rdev); - if (add_journal) - mddev_suspend(mddev); err = mddev->pers->hot_add_disk(mddev, rdev); - if (add_journal) - mddev_resume(mddev); if (err) { md_kick_rdev_from_array(rdev); return err; @@ -3697,6 +3693,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); struct kernfs_node *kn = NULL; + bool suspend = false; ssize_t rv; struct mddev *mddev = rdev->mddev; @@ -3704,17 +3701,23 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + if (!mddev) + return -ENODEV; - if (entry->store == state_store && cmd_match(page, "remove")) - kn = sysfs_break_active_protection(kobj, attr); + if (entry->store == state_store) { + if (cmd_match(page, "remove")) + kn = sysfs_break_active_protection(kobj, attr); + if (cmd_match(page, "remove") || cmd_match(page, "re-add")) + suspend = true; + } - rv = mddev ? mddev_lock(mddev) : -ENODEV; + rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); if (!rv) { if (rdev->mddev == NULL) rv = -ENODEV; else rv = entry->store(rdev, page, length); - mddev_unlock(mddev); + suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); } if (kn) -- cgit From 1b0a2d950ee2a54aa04fb31ead32144be0bbf690 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:51 +0800 Subject: md: use new apis to suspend array for ioctls involed array reconfiguration 'reconfig_mutex' will be grabbed before these ioctls, suspend array before holding the lock, so that io won't concurrent with array reconfiguration through ioctls. This is not hot path, so performance is not concerned. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-13-yukuai1@huaweicloud.com --- drivers/md/md.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 56523bac5140..ee20e6a4b998 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7215,7 +7215,6 @@ static int set_bitmap_file(struct mddev *mddev, int fd) struct bitmap *bitmap; bitmap = md_bitmap_create(mddev, -1); - mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = md_bitmap_load(mddev); @@ -7225,11 +7224,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd) md_bitmap_destroy(mddev); fd = -1; } - mddev_resume(mddev); } else if (fd < 0) { - mddev_suspend(mddev); md_bitmap_destroy(mddev); - mddev_resume(mddev); } } if (fd < 0) { @@ -7518,7 +7514,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.space = mddev->bitmap_info.default_space; bitmap = md_bitmap_create(mddev, -1); - mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = md_bitmap_load(mddev); @@ -7526,7 +7521,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = PTR_ERR(bitmap); if (rv) md_bitmap_destroy(mddev); - mddev_resume(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { @@ -7551,9 +7545,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) module_put(md_cluster_mod); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - mddev_suspend(mddev); md_bitmap_destroy(mddev); - mddev_resume(mddev); mddev->bitmap_info.offset = 0; } } @@ -7624,6 +7616,20 @@ static inline bool md_ioctl_valid(unsigned int cmd) } } +static bool md_ioctl_need_suspend(unsigned int cmd) +{ + switch (cmd) { + case ADD_NEW_DISK: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case SET_BITMAP_FILE: + case SET_ARRAY_INFO: + return true; + default: + return false; + } +} + static int __md_set_array_info(struct mddev *mddev, void __user *argp) { mdu_array_info_t info; @@ -7756,7 +7762,8 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, if (!md_is_rdwr(mddev)) flush_work(&mddev->sync_work); - err = mddev_lock(mddev); + err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : + mddev_lock(mddev); if (err) { pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", err, cmd); @@ -7884,7 +7891,10 @@ unlock: if (mddev->hold_active == UNTIL_IOCTL && err != -EINVAL) mddev->hold_active = 0; - mddev_unlock(mddev); + + md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : + mddev_unlock(mddev); + out: if(did_set_md_closing) clear_bit(MD_CLOSING, &mddev->flags); -- cgit From 58226942ad3d1423785f815ede5bcc832574f2ea Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:52 +0800 Subject: md: use new apis to suspend array before mddev_create/destroy_serial_pool mddev_create/destroy_serial_pool() will be called from several places where mddev_suspend() will be called later. Prepare to remove the mddev_suspend() from mddev_create/destroy_serial_pool(). Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-14-yukuai1@huaweicloud.com --- drivers/md/md-autodetect.c | 4 ++-- drivers/md/md-bitmap.c | 8 ++++---- drivers/md/md.c | 22 ++++++++++++---------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 6eaa0eab40f9..4b80165afd23 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -175,7 +175,7 @@ static void __init md_setup_drive(struct md_setup_args *args) return; } - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) { pr_err("md: failed to lock array %s\n", name); goto out_mddev_put; @@ -221,7 +221,7 @@ static void __init md_setup_drive(struct md_setup_args *args) if (err) pr_warn("md: starting %s failed\n", name); out_unlock: - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); out_mddev_put: mddev_put(mddev); } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 7d21e2a5b06e..b3d701c5c461 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2537,7 +2537,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; - rv = mddev_lock(mddev); + rv = mddev_suspend_and_lock(mddev); if (rv) return rv; @@ -2562,16 +2562,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ if (!mddev->serialize_policy) - mddev_destroy_serial_pool(mddev, NULL, false); + mddev_destroy_serial_pool(mddev, NULL, true); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ rdev_for_each(rdev, mddev) - mddev_create_serial_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, true); } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return len; } diff --git a/drivers/md/md.c b/drivers/md/md.c index ee20e6a4b998..85e263376be9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2557,7 +2557,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) - mddev_create_serial_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, true); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -3077,11 +3077,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); - mddev_create_serial_pool(rdev->mddev, rdev, false); + mddev_create_serial_pool(rdev->mddev, rdev, true); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_serial_pool(rdev->mddev, rdev, false); + mddev_destroy_serial_pool(rdev->mddev, rdev, true); clear_bit(WriteMostly, &rdev->flags); need_update_sb = true; err = 0; @@ -3707,7 +3707,9 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, if (entry->store == state_store) { if (cmd_match(page, "remove")) kn = sysfs_break_active_protection(kobj, attr); - if (cmd_match(page, "remove") || cmd_match(page, "re-add")) + if (cmd_match(page, "remove") || cmd_match(page, "re-add") || + cmd_match(page, "writemostly") || + cmd_match(page, "-writemostly")) suspend = true; } @@ -4684,7 +4686,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - err = mddev_lock(mddev); + err = mddev_suspend_and_lock(mddev); if (err) return err; if (mddev->persistent) { @@ -4705,14 +4707,14 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); return PTR_ERR(rdev); } err = bind_rdev_to_array(rdev, mddev); out: if (err) export_rdev(rdev, mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); if (!err) md_new_event(); return err ? err : len; @@ -6649,13 +6651,13 @@ static void autorun_devices(int part) if (IS_ERR(mddev)) break; - if (mddev_lock(mddev)) + if (mddev_suspend_and_lock(mddev)) pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version || !list_empty(&mddev->disks)) { pr_warn("md: %s already running, cannot run %pg\n", mdname(mddev), rdev0->bdev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } else { pr_debug("md: created %s\n", mdname(mddev)); mddev->persistent = 1; @@ -6665,7 +6667,7 @@ static void autorun_devices(int part) export_rdev(rdev, mddev); } autorun_array(mddev); - mddev_unlock(mddev); + mddev_unlock_and_resume(mddev); } /* on success, candidates will be empty, on error * it won't... -- cgit From b4128c00a653dfd08fbe3d26fcf4c8b4970a69ba Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:53 +0800 Subject: md: cleanup mddev_create/destroy_serial_pool() Now that except for stopping the array, all the callers already suspend the array, there is no need to suspend anymore, hence remove the second parameter. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-15-yukuai1@huaweicloud.com --- drivers/md/md-bitmap.c | 8 ++++---- drivers/md/md.c | 33 ++++++++++----------------------- drivers/md/md.h | 7 +++---- 3 files changed, 17 insertions(+), 31 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b3d701c5c461..9672f75c3050 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1861,7 +1861,7 @@ void md_bitmap_destroy(struct mddev *mddev) md_bitmap_wait_behind_writes(mddev); if (!mddev->serialize_policy) - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1977,7 +1977,7 @@ int md_bitmap_load(struct mddev *mddev) goto out; rdev_for_each(rdev, mddev) - mddev_create_serial_pool(mddev, rdev, true); + mddev_create_serial_pool(mddev, rdev); if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); @@ -2562,11 +2562,11 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ if (!mddev->serialize_policy) - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ rdev_for_each(rdev, mddev) - mddev_create_serial_pool(mddev, rdev, true); + mddev_create_serial_pool(mddev, rdev); } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); diff --git a/drivers/md/md.c b/drivers/md/md.c index 85e263376be9..77ba8b265e16 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -206,8 +206,7 @@ static int rdev_need_serial(struct md_rdev *rdev) * 1. rdev is the first device which return true from rdev_enable_serial. * 2. rdev is NULL, means we want to enable serialization for all rdevs. */ -void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { int ret = 0; @@ -215,15 +214,12 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, !test_bit(CollisionCheck, &rdev->flags)) return; - if (!is_suspend) - mddev_suspend(mddev); - if (!rdev) ret = rdevs_init_serial(mddev); else ret = rdev_init_serial(rdev); if (ret) - goto abort; + return; if (mddev->serial_info_pool == NULL) { /* @@ -238,10 +234,6 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, pr_err("can't alloc memory pool for serialization\n"); } } - -abort: - if (!is_suspend) - mddev_resume(mddev); } /* @@ -250,8 +242,7 @@ abort: * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ -void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) { if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; @@ -260,8 +251,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, struct md_rdev *temp; int num = 0; /* used to track if other rdevs need the pool */ - if (!is_suspend) - mddev_suspend(mddev); rdev_for_each(temp, mddev) { if (!rdev) { if (!mddev->serialize_policy || @@ -283,8 +272,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, mempool_destroy(mddev->serial_info_pool); mddev->serial_info_pool = NULL; } - if (!is_suspend) - mddev_resume(mddev); } } @@ -2557,7 +2544,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) - mddev_create_serial_pool(mddev, rdev, true); + mddev_create_serial_pool(mddev, rdev); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2610,7 +2597,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); - mddev_destroy_serial_pool(rdev->mddev, rdev, false); + mddev_destroy_serial_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -3077,11 +3064,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); - mddev_create_serial_pool(rdev->mddev, rdev, true); + mddev_create_serial_pool(rdev->mddev, rdev); need_update_sb = true; err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_serial_pool(rdev->mddev, rdev, true); + mddev_destroy_serial_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); need_update_sb = true; err = 0; @@ -5591,9 +5578,9 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) } if (value) - mddev_create_serial_pool(mddev, NULL, true); + mddev_create_serial_pool(mddev, NULL); else - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); mddev->serialize_policy = value; unlock: mddev_unlock_and_resume(mddev); @@ -6359,7 +6346,7 @@ static void __md_stop_writes(struct mddev *mddev) } /* disable policy to guarantee rdevs free resources for serialization */ mddev->serialize_policy = 0; - mddev_destroy_serial_pool(mddev, NULL, true); + mddev_destroy_serial_pool(mddev, NULL); } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index 5c8f3f045e78..63b4c393b1ee 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -817,10 +817,9 @@ extern void __mddev_resume(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); -extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend); -extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend); +extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev); +extern void mddev_destroy_serial_pool(struct mddev *mddev, + struct md_rdev *rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); -- cgit From 1978c742f3e7ef359b83cb712f8559fcd32032d0 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:54 +0800 Subject: md/md-linear: cleanup linear_add() Now that caller already suspend the array, there is no need to suspend array in liner_add(). Note that mddev_suspend/resume() is not used anymore. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-16-yukuai1@huaweicloud.com --- drivers/md/md-linear.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index ae2826e9645b..8eca7693b793 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -183,7 +183,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) * in linear_congested(), therefore kfree_rcu() is used to free * oldconf until no one uses it anymore. */ - mddev_suspend(mddev); oldconf = rcu_dereference_protected(mddev->private, lockdep_is_held(&mddev->reconfig_mutex)); mddev->raid_disks++; @@ -192,7 +191,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); - mddev_resume(mddev); kfree_rcu(oldconf, rcu); return 0; } -- cgit From b42cd7b3a20db692a8d24c2c756ad5008729976b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:55 +0800 Subject: md/raid5: replace suspend with quiesce() callback raid5 is the only personality to suspend array in check_reshape() and start_reshape() callback, suspend and quiesce() callback can both wait for all normal io to be done, and prevent new io to be dispatched, the difference is that suspend is implemented in common layer, and quiesce() callback is implemented in raid5. In order to cleanup all the usage of mddev_suspend(), the new apis __mddev_suspend() need to be called before 'reconfig_mutex' is held, and it's not good to affect all the personalities in common layer just for raid5. Hence replace suspend with quiesce() callaback, prepare to reomove all the users of mddev_suspend(). Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-17-yukuai1@huaweicloud.com --- drivers/md/raid5.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e6b8c0145648..d6de084a85e5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -70,6 +70,8 @@ MODULE_PARM_DESC(devices_handle_discard_safely, "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); static struct workqueue_struct *raid5_wq; +static void raid5_quiesce(struct mddev *mddev, int quiesce); + static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) { int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK; @@ -2492,15 +2494,12 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) unsigned long cpu; int err = 0; - /* - * Never shrink. And mddev_suspend() could deadlock if this is called - * from raid5d. In that case, scribble_disks and scribble_sectors - * should equal to new_disks and new_sectors - */ + /* Never shrink. */ if (conf->scribble_disks >= new_disks && conf->scribble_sectors >= new_sectors) return 0; - mddev_suspend(conf->mddev); + + raid5_quiesce(conf->mddev, true); cpus_read_lock(); for_each_present_cpu(cpu) { @@ -2514,7 +2513,8 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) } cpus_read_unlock(); - mddev_resume(conf->mddev); + raid5_quiesce(conf->mddev, false); + if (!err) { conf->scribble_disks = new_disks; conf->scribble_sectors = new_sectors; @@ -8551,8 +8551,8 @@ static int raid5_start_reshape(struct mddev *mddev) * the reshape wasn't running - like Discard or Read - have * completed. */ - mddev_suspend(mddev); - mddev_resume(mddev); + raid5_quiesce(mddev, true); + raid5_quiesce(mddev, false); /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. -- cgit From bc08041b32abe6c9824f78735bac22018eabfc06 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:56 +0800 Subject: md: suspend array in md_start_sync() if array need reconfiguration So that io won't concurrent with array reconfiguration, and it's safe to suspend the array directly because normal io won't rely on md_start_sync(). Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-18-yukuai1@huaweicloud.com --- drivers/md/md.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 77ba8b265e16..42f5fbde4e89 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9417,8 +9417,13 @@ static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, sync_work); int spares = 0; + bool suspend = false; - mddev_lock_nointr(mddev); + if (md_spares_need_change(mddev)) + suspend = true; + + suspend ? mddev_suspend_and_lock_nointr(mddev) : + mddev_lock_nointr(mddev); if (!md_is_rdwr(mddev)) { /* @@ -9454,7 +9459,7 @@ static void md_start_sync(struct work_struct *ws) goto not_running; } - mddev_unlock(mddev); + suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); @@ -9466,7 +9471,7 @@ not_running: clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev_unlock(mddev); + suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); wake_up(&resync_wait); if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && -- cgit From 4717c02875221d43d73ffed07f960d881bc00fc8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:57 +0800 Subject: md: remove old apis to suspend the array Now that mddev_suspend() and mddev_resume() is not used anywhere, remove them, and remove 'MD_ALLOW_SB_UPDATE' and 'MD_UPDATING_SB' as well. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-19-yukuai1@huaweicloud.com --- drivers/md/md.c | 82 +++------------------------------------------------------ drivers/md/md.h | 8 ------ 2 files changed, 3 insertions(+), 87 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 42f5fbde4e89..757cd53c0fa1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -418,74 +418,10 @@ static void md_submit_bio(struct bio *bio) md_handle_request(mddev, bio); } -/* mddev_suspend makes sure no new requests are submitted - * to the device, and that any requests that have been submitted - * are completely handled. - * Once mddev_detach() is called and completes, the module will be - * completely unused. +/* + * Make sure no new requests are submitted to the device, and any requests that + * have been submitted are completely handled. */ -void mddev_suspend(struct mddev *mddev) -{ - struct md_thread *thread = rcu_dereference_protected(mddev->thread, - lockdep_is_held(&mddev->reconfig_mutex)); - - WARN_ON_ONCE(thread && current == thread->tsk); - - /* can't concurrent with __mddev_suspend() and __mddev_resume() */ - mutex_lock(&mddev->suspend_mutex); - if (mddev->suspended++) { - mutex_unlock(&mddev->suspend_mutex); - return; - } - - wake_up(&mddev->sb_wait); - set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); - percpu_ref_kill(&mddev->active_io); - - /* - * TODO: cleanup 'pers->prepare_suspend after all callers are replaced - * by __mddev_suspend(). - */ - if (mddev->pers && mddev->pers->prepare_suspend) - mddev->pers->prepare_suspend(mddev); - - wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); - clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); - wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); - - del_timer_sync(&mddev->safemode_timer); - /* restrict memory reclaim I/O during raid array is suspend */ - mddev->noio_flag = memalloc_noio_save(); - - mutex_unlock(&mddev->suspend_mutex); -} -EXPORT_SYMBOL_GPL(mddev_suspend); - -void mddev_resume(struct mddev *mddev) -{ - lockdep_assert_held(&mddev->reconfig_mutex); - - /* can't concurrent with __mddev_suspend() and __mddev_resume() */ - mutex_lock(&mddev->suspend_mutex); - if (--mddev->suspended) { - mutex_unlock(&mddev->suspend_mutex); - return; - } - - /* entred the memalloc scope from mddev_suspend() */ - memalloc_noio_restore(mddev->noio_flag); - - percpu_ref_resurrect(&mddev->active_io); - wake_up(&mddev->sb_wait); - - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); - md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ - - mutex_unlock(&mddev->suspend_mutex); -} -EXPORT_SYMBOL_GPL(mddev_resume); - int __mddev_suspend(struct mddev *mddev, bool interruptible) { int err = 0; @@ -9503,18 +9439,6 @@ not_running: */ void md_check_recovery(struct mddev *mddev) { - if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { - /* Write superblock - thread that called mddev_suspend() - * holds reconfig_mutex for us. - */ - set_bit(MD_UPDATING_SB, &mddev->flags); - smp_mb__after_atomic(); - if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) - md_update_sb(mddev, 0); - clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); - wake_up(&mddev->sb_wait); - } - if (READ_ONCE(mddev->suspended)) return; diff --git a/drivers/md/md.h b/drivers/md/md.h index 63b4c393b1ee..4c5f3f032656 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -248,10 +248,6 @@ struct md_cluster_info; * become failed. * @MD_HAS_PPL: The raid array has PPL feature set. * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set. - * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata - * without taking reconfig_mutex. - * @MD_UPDATING_SB: md_check_recovery is updating the metadata without - * explicitly holding reconfig_mutex. * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that * array is ready yet. * @MD_BROKEN: This is used to stop writes and mark array as failed. @@ -268,8 +264,6 @@ enum mddev_flags { MD_FAILFAST_SUPPORTED, MD_HAS_PPL, MD_HAS_MULTIPLE_PPLS, - MD_ALLOW_SB_UPDATE, - MD_UPDATING_SB, MD_NOT_READY, MD_BROKEN, MD_DELETED, @@ -810,8 +804,6 @@ extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); extern void md_handle_request(struct mddev *mddev, struct bio *bio); -extern void mddev_suspend(struct mddev *mddev); -extern void mddev_resume(struct mddev *mddev); extern int __mddev_suspend(struct mddev *mddev, bool interruptible); extern void __mddev_resume(struct mddev *mddev); -- cgit From 2b16a52549d51937a98d82b07b4d83dce6c43683 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Oct 2023 23:19:58 +0800 Subject: md: rename __mddev_suspend/resume() back to mddev_suspend/resume() Now that the old apis are removed, __mddev_suspend/resume() can be renamed to their original names. This is done by: sed -i "s/__mddev_suspend/mddev_suspend/g" *.[ch] sed -i "s/__mddev_resume/mddev_resume/g" *.[ch] Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231010151958.145896-20-yukuai1@huaweicloud.com --- drivers/md/dm-raid.c | 4 ++-- drivers/md/md.c | 18 +++++++++--------- drivers/md/md.h | 12 ++++++------ drivers/md/raid5-cache.c | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 05dd6ccf6f48..a4692f8f98ee 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3797,7 +3797,7 @@ static void raid_postsuspend(struct dm_target *ti) if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) md_stop_writes(&rs->md); - __mddev_suspend(&rs->md, false); + mddev_suspend(&rs->md, false); } } @@ -4009,7 +4009,7 @@ static int raid_preresume(struct dm_target *ti) } /* Check for any resize/reshape on @rs and adjust/initiate */ - /* Be prepared for __mddev_resume() in raid_resume() */ + /* Be prepared for mddev_resume() in raid_resume() */ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); diff --git a/drivers/md/md.c b/drivers/md/md.c index 757cd53c0fa1..8ee079c4dc1e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -422,7 +422,7 @@ static void md_submit_bio(struct bio *bio) * Make sure no new requests are submitted to the device, and any requests that * have been submitted are completely handled. */ -int __mddev_suspend(struct mddev *mddev, bool interruptible) +int mddev_suspend(struct mddev *mddev, bool interruptible) { int err = 0; @@ -473,9 +473,9 @@ int __mddev_suspend(struct mddev *mddev, bool interruptible) mutex_unlock(&mddev->suspend_mutex); return 0; } -EXPORT_SYMBOL_GPL(__mddev_suspend); +EXPORT_SYMBOL_GPL(mddev_suspend); -void __mddev_resume(struct mddev *mddev) +void mddev_resume(struct mddev *mddev) { lockdep_assert_not_held(&mddev->reconfig_mutex); @@ -486,7 +486,7 @@ void __mddev_resume(struct mddev *mddev) return; } - /* entred the memalloc scope from __mddev_suspend() */ + /* entred the memalloc scope from mddev_suspend() */ memalloc_noio_restore(mddev->noio_flag); percpu_ref_resurrect(&mddev->active_io); @@ -498,7 +498,7 @@ void __mddev_resume(struct mddev *mddev) mutex_unlock(&mddev->suspend_mutex); } -EXPORT_SYMBOL_GPL(__mddev_resume); +EXPORT_SYMBOL_GPL(mddev_resume); /* * Generic flush handling for md @@ -5219,12 +5219,12 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (new != (sector_t)new) return -EINVAL; - err = __mddev_suspend(mddev, true); + err = mddev_suspend(mddev, true); if (err) return err; WRITE_ONCE(mddev->suspend_lo, new); - __mddev_resume(mddev); + mddev_resume(mddev); return len; } @@ -5250,12 +5250,12 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (new != (sector_t)new) return -EINVAL; - err = __mddev_suspend(mddev, true); + err = mddev_suspend(mddev, true); if (err) return err; WRITE_ONCE(mddev->suspend_hi, new); - __mddev_resume(mddev); + mddev_resume(mddev); return len; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 4c5f3f032656..55d01d431418 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -804,8 +804,8 @@ extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); extern void md_handle_request(struct mddev *mddev, struct bio *bio); -extern int __mddev_suspend(struct mddev *mddev, bool interruptible); -extern void __mddev_resume(struct mddev *mddev); +extern int mddev_suspend(struct mddev *mddev, bool interruptible); +extern void mddev_resume(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); @@ -853,27 +853,27 @@ static inline int mddev_suspend_and_lock(struct mddev *mddev) { int ret; - ret = __mddev_suspend(mddev, true); + ret = mddev_suspend(mddev, true); if (ret) return ret; ret = mddev_lock(mddev); if (ret) - __mddev_resume(mddev); + mddev_resume(mddev); return ret; } static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev) { - __mddev_suspend(mddev, false); + mddev_suspend(mddev, false); mutex_lock(&mddev->reconfig_mutex); } static inline void mddev_unlock_and_resume(struct mddev *mddev) { mddev_unlock(mddev); - __mddev_resume(mddev); + mddev_resume(mddev); } struct mdu_array_info_s; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 9909110262ee..6157f5beb9fe 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -699,9 +699,9 @@ static void r5c_disable_writeback_async(struct work_struct *work) log = READ_ONCE(conf->log); if (log) { - __mddev_suspend(mddev, false); + mddev_suspend(mddev, false); log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; - __mddev_resume(mddev); + mddev_resume(mddev); } } -- cgit