aboutsummaryrefslogtreecommitdiff
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c227
1 files changed, 138 insertions, 89 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 59f91e392a2a..39038fa8b1c8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -79,18 +79,21 @@ static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
}
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
+ __acquires(&conf->device_lock)
{
spin_lock_irq(conf->hash_locks + hash);
spin_lock(&conf->device_lock);
}
static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
+ __releases(&conf->device_lock)
{
spin_unlock(&conf->device_lock);
spin_unlock_irq(conf->hash_locks + hash);
}
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+ __acquires(&conf->device_lock)
{
int i;
spin_lock_irq(conf->hash_locks);
@@ -100,6 +103,7 @@ static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
}
static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+ __releases(&conf->device_lock)
{
int i;
spin_unlock(&conf->device_lock);
@@ -164,6 +168,7 @@ static bool stripe_is_lowprio(struct stripe_head *sh)
}
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+ __must_hold(&sh->raid_conf->device_lock)
{
struct r5conf *conf = sh->raid_conf;
struct r5worker_group *group;
@@ -211,6 +216,7 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
int i;
int injournal = 0; /* number of date pages with R5_InJournal */
@@ -296,6 +302,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
if (atomic_dec_and_test(&sh->count))
do_release_stripe(conf, sh, temp_inactive_list);
@@ -350,9 +357,9 @@ static void release_inactive_stripe_list(struct r5conf *conf,
}
}
-/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
struct stripe_head *sh, *t;
int count = 0;
@@ -629,6 +636,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
* This is because some failed devices may only affect one
* of the two sections, and some non-in_sync devices may
* be insync in the section most affected by failed devices.
+ *
+ * Most calls to this function hold &conf->device_lock. Calls
+ * in raid5_run() do not require the lock as no other threads
+ * have been started yet.
*/
int raid5_calc_degraded(struct r5conf *conf)
{
@@ -686,17 +697,17 @@ int raid5_calc_degraded(struct r5conf *conf)
return degraded;
}
-static int has_failed(struct r5conf *conf)
+static bool has_failed(struct r5conf *conf)
{
- int degraded;
+ int degraded = conf->mddev->degraded;
- if (conf->mddev->reshape_position == MaxSector)
- return conf->mddev->degraded > conf->max_degraded;
+ if (test_bit(MD_BROKEN, &conf->mddev->flags))
+ return true;
- degraded = raid5_calc_degraded(conf);
- if (degraded > conf->max_degraded)
- return 1;
- return 0;
+ if (conf->mddev->reshape_position != MaxSector)
+ degraded = raid5_calc_degraded(conf);
+
+ return degraded > conf->max_degraded;
}
struct stripe_head *
@@ -2648,6 +2659,28 @@ static void shrink_stripes(struct r5conf *conf)
conf->slab_cache = NULL;
}
+/*
+ * This helper wraps rcu_dereference_protected() and can be used when
+ * it is known that the nr_pending of the rdev is elevated.
+ */
+static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
+{
+ return rcu_dereference_protected(rdev,
+ atomic_read(&rcu_access_pointer(rdev)->nr_pending));
+}
+
+/*
+ * This helper wraps rcu_dereference_protected() and should be used
+ * when it is known that the mddev_lock() is held. This is safe
+ * seeing raid5_remove_disk() has the same lock held.
+ */
+static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev,
+ struct md_rdev __rcu *rdev)
+{
+ return rcu_dereference_protected(rdev,
+ lockdep_is_held(&mddev->reconfig_mutex));
+}
+
static void raid5_end_read_request(struct bio * bi)
{
struct stripe_head *sh = bi->bi_private;
@@ -2674,9 +2707,9 @@ static void raid5_end_read_request(struct bio * bi)
* In that case it moved down to 'rdev'.
* rdev is not removed until all requests are finished.
*/
- rdev = conf->disks[i].replacement;
+ rdev = rdev_pend_deref(conf->disks[i].replacement);
if (!rdev)
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
if (use_new_offset(conf, sh))
s = sh->sector + rdev->new_data_offset;
@@ -2790,11 +2823,11 @@ static void raid5_end_write_request(struct bio *bi)
for (i = 0 ; i < disks; i++) {
if (bi == &sh->dev[i].req) {
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
break;
}
if (bi == &sh->dev[i].rreq) {
- rdev = conf->disks[i].replacement;
+ rdev = rdev_pend_deref(conf->disks[i].replacement);
if (rdev)
replacement = 1;
else
@@ -2802,7 +2835,7 @@ static void raid5_end_write_request(struct bio *bi)
* replaced it. rdev is not removed
* until all requests are finished.
*/
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
break;
}
}
@@ -2863,34 +2896,31 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
unsigned long flags;
pr_debug("raid456: error called\n");
+ pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n",
+ mdname(mddev), bdevname(rdev->bdev, b));
+
spin_lock_irqsave(&conf->device_lock, flags);
+ set_bit(Faulty, &rdev->flags);
+ clear_bit(In_sync, &rdev->flags);
+ mddev->degraded = raid5_calc_degraded(conf);
- if (test_bit(In_sync, &rdev->flags) &&
- mddev->degraded == conf->max_degraded) {
- /*
- * Don't allow to achieve failed state
- * Don't try to recover this device
- */
+ if (has_failed(conf)) {
+ set_bit(MD_BROKEN, &conf->mddev->flags);
conf->recovery_disabled = mddev->recovery_disabled;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- return;
+
+ pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
+ mdname(mddev), mddev->degraded, conf->raid_disks);
+ } else {
+ pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
+ mdname(mddev), conf->raid_disks - mddev->degraded);
}
- set_bit(Faulty, &rdev->flags);
- clear_bit(In_sync, &rdev->flags);
- mddev->degraded = raid5_calc_degraded(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags);
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
- pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
- "md/raid:%s: Operation continuing on %d devices.\n",
- mdname(mddev),
- bdevname(rdev->bdev, b),
- mdname(mddev),
- conf->raid_disks - mddev->degraded);
r5c_update_on_rdev_error(mddev, rdev);
}
@@ -5213,23 +5243,23 @@ finish:
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
/* We own a safe reference to the rdev */
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
if (!rdev_set_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
- rdev = conf->disks[i].replacement;
+ rdev = rdev_pend_deref(conf->disks[i].replacement);
if (!rdev)
/* rdev have been moved down */
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
@@ -5256,6 +5286,7 @@ finish:
}
static void raid5_activate_delayed(struct r5conf *conf)
+ __must_hold(&conf->device_lock)
{
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
while (!list_empty(&conf->delayed_list)) {
@@ -5273,9 +5304,9 @@ static void raid5_activate_delayed(struct r5conf *conf)
}
static void activate_bit_delay(struct r5conf *conf,
- struct list_head *temp_inactive_list)
+ struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
- /* device_lock is held */
struct list_head head;
list_add(&head, &conf->bitmap_list);
list_del_init(&conf->bitmap_list);
@@ -5500,6 +5531,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
* handle_list.
*/
static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
+ __must_hold(&conf->device_lock)
{
struct stripe_head *sh, *tmp;
struct list_head *handle_list = NULL;
@@ -6288,7 +6320,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
*/
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
+ struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
@@ -6371,8 +6403,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
static int handle_active_stripes(struct r5conf *conf, int group,
struct r5worker *worker,
struct list_head *temp_inactive_list)
- __releases(&conf->device_lock)
- __acquires(&conf->device_lock)
+ __must_hold(&conf->device_lock)
{
struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
int i, batch_size = 0, hash;
@@ -7166,7 +7197,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
int i;
int group_cnt;
struct r5worker_group *new_group;
- int ret;
+ int ret = -ENOMEM;
if (mddev->new_level != 5
&& mddev->new_level != 4
@@ -7225,6 +7256,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
spin_lock_init(&conf->device_lock);
seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
mutex_init(&conf->cache_size_mutex);
+
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
@@ -7302,11 +7334,13 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->level = mddev->new_level;
conf->chunk_sectors = mddev->new_chunk_sectors;
- if (raid5_alloc_percpu(conf) != 0)
+ ret = raid5_alloc_percpu(conf);
+ if (ret)
goto abort;
pr_debug("raid456: run(%s) called.\n", mdname(mddev));
+ ret = -EIO;
rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks
@@ -7317,11 +7351,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (test_bit(Replacement, &rdev->flags)) {
if (disk->replacement)
goto abort;
- disk->replacement = rdev;
+ RCU_INIT_POINTER(disk->replacement, rdev);
} else {
if (disk->rdev)
goto abort;
- disk->rdev = rdev;
+ RCU_INIT_POINTER(disk->rdev, rdev);
}
if (test_bit(In_sync, &rdev->flags)) {
@@ -7370,6 +7404,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (grow_stripes(conf, conf->min_nr_stripes)) {
pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
mdname(mddev), memory);
+ ret = -ENOMEM;
goto abort;
} else
pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
@@ -7383,7 +7418,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->shrinker.count_objects = raid5_cache_count;
conf->shrinker.batch = 128;
conf->shrinker.flags = 0;
- if (register_shrinker(&conf->shrinker)) {
+ ret = register_shrinker(&conf->shrinker);
+ if (ret) {
pr_warn("md/raid:%s: couldn't register shrinker.\n",
mdname(mddev));
goto abort;
@@ -7394,17 +7430,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (!conf->thread) {
pr_warn("md/raid:%s: couldn't allocate thread.\n",
mdname(mddev));
+ ret = -ENOMEM;
goto abort;
}
return conf;
abort:
- if (conf) {
+ if (conf)
free_conf(conf);
- return ERR_PTR(-EIO);
- } else
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
@@ -7621,17 +7656,18 @@ static int raid5_run(struct mddev *mddev)
for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
i++) {
- rdev = conf->disks[i].rdev;
+ rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
if (!rdev && conf->disks[i].replacement) {
/* The replacement is all we have yet */
- rdev = conf->disks[i].replacement;
+ rdev = rdev_mdlock_deref(mddev,
+ conf->disks[i].replacement);
conf->disks[i].replacement = NULL;
clear_bit(Replacement, &rdev->flags);
- conf->disks[i].rdev = rdev;
+ rcu_assign_pointer(conf->disks[i].rdev, rdev);
}
if (!rdev)
continue;
- if (conf->disks[i].replacement &&
+ if (rcu_access_pointer(conf->disks[i].replacement) &&
conf->reshape_progress != MaxSector) {
/* replacements and reshape simply do not mix. */
pr_warn("md: cannot handle concurrent replacement and reshape.\n");
@@ -7749,7 +7785,6 @@ static int raid5_run(struct mddev *mddev)
*/
stripe = stripe * PAGE_SIZE;
stripe = roundup_pow_of_two(stripe);
- mddev->queue->limits.discard_alignment = stripe;
mddev->queue->limits.discard_granularity = stripe;
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
@@ -7828,8 +7863,8 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
static void print_raid5_conf (struct r5conf *conf)
{
+ struct md_rdev *rdev;
int i;
- struct disk_info *tmp;
pr_debug("RAID conf printout:\n");
if (!conf) {
@@ -7840,50 +7875,54 @@ static void print_raid5_conf (struct r5conf *conf)
conf->raid_disks,
conf->raid_disks - conf->mddev->degraded);
+ rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
char b[BDEVNAME_SIZE];
- tmp = conf->disks + i;
- if (tmp->rdev)
+ rdev = rcu_dereference(conf->disks[i].rdev);
+ if (rdev)
pr_debug(" disk %d, o:%d, dev:%s\n",
- i, !test_bit(Faulty, &tmp->rdev->flags),
- bdevname(tmp->rdev->bdev, b));
+ i, !test_bit(Faulty, &rdev->flags),
+ bdevname(rdev->bdev, b));
}
+ rcu_read_unlock();
}
static int raid5_spare_active(struct mddev *mddev)
{
int i;
struct r5conf *conf = mddev->private;
- struct disk_info *tmp;
+ struct md_rdev *rdev, *replacement;
int count = 0;
unsigned long flags;
for (i = 0; i < conf->raid_disks; i++) {
- tmp = conf->disks + i;
- if (tmp->replacement
- && tmp->replacement->recovery_offset == MaxSector
- && !test_bit(Faulty, &tmp->replacement->flags)
- && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+ rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
+ replacement = rdev_mdlock_deref(mddev,
+ conf->disks[i].replacement);
+ if (replacement
+ && replacement->recovery_offset == MaxSector
+ && !test_bit(Faulty, &replacement->flags)
+ && !test_and_set_bit(In_sync, &replacement->flags)) {
/* Replacement has just become active. */
- if (!tmp->rdev
- || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+ if (!rdev
+ || !test_and_clear_bit(In_sync, &rdev->flags))
count++;
- if (tmp->rdev) {
+ if (rdev) {
/* Replaced device not technically faulty,
* but we need to be sure it gets removed
* and never re-added.
*/
- set_bit(Faulty, &tmp->rdev->flags);
+ set_bit(Faulty, &rdev->flags);
sysfs_notify_dirent_safe(
- tmp->rdev->sysfs_state);
+ rdev->sysfs_state);
}
- sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
- } else if (tmp->rdev
- && tmp->rdev->recovery_offset == MaxSector
- && !test_bit(Faulty, &tmp->rdev->flags)
- && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ sysfs_notify_dirent_safe(replacement->sysfs_state);
+ } else if (rdev
+ && rdev->recovery_offset == MaxSector
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_and_set_bit(In_sync, &rdev->flags)) {
count++;
- sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);
@@ -7898,8 +7937,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r5conf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
- struct md_rdev **rdevp;
+ struct md_rdev __rcu **rdevp;
struct disk_info *p = conf->disks + number;
+ struct md_rdev *tmp;
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags) && conf->log) {
@@ -7917,9 +7957,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
log_exit(conf);
return 0;
}
- if (rdev == p->rdev)
+ if (rdev == rcu_access_pointer(p->rdev))
rdevp = &p->rdev;
- else if (rdev == p->replacement)
+ else if (rdev == rcu_access_pointer(p->replacement))
rdevp = &p->replacement;
else
return 0;
@@ -7939,18 +7979,20 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled &&
!has_failed(conf) &&
- (!p->replacement || p->replacement == rdev) &&
+ (!rcu_access_pointer(p->replacement) ||
+ rcu_access_pointer(p->replacement) == rdev) &&
number < conf->raid_disks) {
err = -EBUSY;
goto abort;
}
*rdevp = NULL;
if (!test_bit(RemoveSynchronized, &rdev->flags)) {
+ lockdep_assert_held(&mddev->reconfig_mutex);
synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) {
/* lost the race, try later */
err = -EBUSY;
- *rdevp = rdev;
+ rcu_assign_pointer(*rdevp, rdev);
}
}
if (!err) {
@@ -7958,17 +8000,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err)
goto abort;
}
- if (p->replacement) {
+
+ tmp = rcu_access_pointer(p->replacement);
+ if (tmp) {
/* We must have just cleared 'rdev' */
- p->rdev = p->replacement;
- clear_bit(Replacement, &p->replacement->flags);
+ rcu_assign_pointer(p->rdev, tmp);
+ clear_bit(Replacement, &tmp->flags);
smp_mb(); /* Make sure other CPUs may see both as identical
* but will never see neither - if they are careful
*/
- p->replacement = NULL;
+ rcu_assign_pointer(p->replacement, NULL);
if (!err)
- err = log_modify(conf, p->rdev, true);
+ err = log_modify(conf, tmp, true);
}
clear_bit(WantReplacement, &rdev->flags);
@@ -7984,6 +8028,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int ret, err = -EEXIST;
int disk;
struct disk_info *p;
+ struct md_rdev *tmp;
int first = 0;
int last = conf->raid_disks - 1;
@@ -8041,7 +8086,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
}
for (disk = first; disk <= last; disk++) {
p = conf->disks + disk;
- if (test_bit(WantReplacement, &p->rdev->flags) &&
+ tmp = rdev_mdlock_deref(mddev, p->rdev);
+ if (test_bit(WantReplacement, &tmp->flags) &&
p->replacement == NULL) {
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
@@ -8332,6 +8378,7 @@ static void end_reshape(struct r5conf *conf)
static void raid5_finish_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
+ struct md_rdev *rdev;
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -8343,10 +8390,12 @@ static void raid5_finish_reshape(struct mddev *mddev)
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
d++) {
- struct md_rdev *rdev = conf->disks[d].rdev;
+ rdev = rdev_mdlock_deref(mddev,
+ conf->disks[d].rdev);
if (rdev)
clear_bit(In_sync, &rdev->flags);
- rdev = conf->disks[d].replacement;
+ rdev = rdev_mdlock_deref(mddev,
+ conf->disks[d].replacement);
if (rdev)
clear_bit(In_sync, &rdev->flags);
}