Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--   drivers/md/md.c | 559
1 file changed, 407 insertions(+), 152 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c index 01175dac0db6..8cdca0296749 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -44,6 +44,7 @@ */ +#include <linux/sched/signal.h> #include <linux/kthread.h> #include <linux/blkdev.h> #include <linux/badblocks.h> @@ -64,6 +65,8 @@ #include <linux/raid/md_p.h> #include <linux/raid/md_u.h> #include <linux/slab.h> +#include <linux/percpu-refcount.h> + #include <trace/events/block.h> #include "md.h" #include "bitmap.h" @@ -171,8 +174,18 @@ static const struct block_device_operations md_fops; static int start_readonly; +/* + * The original mechanism for creating an md device is to create + * a device node in /dev and to open it. This causes races with device-close. + * The preferred method is to write to the "new_array" module parameter. + * This can avoid races. + * Setting create_on_open to false disables the original mechanism + * so all the races disappear. + */ +static bool create_on_open = true; + /* bio_clone_mddev - * like bio_clone, but with a local bio set + * like bio_clone_bioset, but with a local bio set */ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, @@ -190,15 +203,13 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, } EXPORT_SYMBOL_GPL(bio_alloc_mddev); -struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev) +static struct bio *md_bio_alloc_sync(struct mddev *mddev) { - if (!mddev || !mddev->bio_set) - return bio_clone(bio, gfp_mask); + if (!mddev || !mddev->sync_set) + return bio_alloc(GFP_NOIO, 1); - return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); + return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set); } -EXPORT_SYMBOL_GPL(bio_clone_mddev); /* * We have a system wide 'event count' that is incremented @@ -262,7 +273,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) unsigned int sectors; int cpu; - blk_queue_split(q, &bio, q->bio_split); + blk_queue_split(q, &bio); if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); @@ -270,11 +281,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) } if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) - bio->bi_error = -EROFS; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return BLK_QC_T_NONE; } - smp_rmb(); /* Ensure implications of 'active' are visible */ +check_suspended: rcu_read_lock(); if (mddev->suspended) { DEFINE_WAIT(__wait); @@ -299,7 +310,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) sectors = bio_sectors(bio); /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; - mddev->pers->make_request(mddev, bio); + if (!mddev->pers->make_request(mddev, bio)) { + atomic_dec(&mddev->active_io); + wake_up(&mddev->sb_wait); + goto check_suspended; + } cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); @@ -324,6 +339,7 @@ void mddev_suspend(struct mddev *mddev) if (mddev->suspended++) return; synchronize_rcu(); + wake_up(&mddev->sb_wait); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); @@ -449,14 +465,6 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -void md_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct mddev *mddev = cb->data; - md_wakeup_thread(mddev->thread); - kfree(cb); -} -EXPORT_SYMBOL(md_unplug); - static inline struct mddev *mddev_get(struct mddev *mddev) { atomic_inc(&mddev->active); @@ -467,7 +475,7 @@ static void 
mddev_delayed_delete(struct work_struct *ws); static void mddev_put(struct mddev *mddev) { - struct bio_set *bs = NULL; + struct bio_set *bs = NULL, *sync_bs = NULL; if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; @@ -477,7 +485,9 @@ static void mddev_put(struct mddev *mddev) * so destroy it */ list_del_init(&mddev->all_mddevs); bs = mddev->bio_set; + sync_bs = mddev->sync_set; mddev->bio_set = NULL; + mddev->sync_set = NULL; if (mddev->gendisk) { /* We did a probe so need to clean up. Call * queue_work inside the spinlock so that @@ -492,6 +502,8 @@ static void mddev_put(struct mddev *mddev) spin_unlock(&all_mddevs_lock); if (bs) bioset_free(bs); + if (sync_bs) + bioset_free(sync_bs); } static void md_safemode_timeout(unsigned long data); @@ -724,8 +736,8 @@ static void super_written(struct bio *bio) struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; - if (bio->bi_error) { - pr_err("md: super_written gets error=%d\n", bio->bi_error); + if (bio->bi_status) { + pr_err("md: super_written gets error=%d\n", bio->bi_status); md_error(mddev, rdev); if (!test_bit(Faulty, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { @@ -756,7 +768,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, if (test_bit(Faulty, &rdev->flags)) return; - bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); + bio = md_bio_alloc_sync(mddev); atomic_inc(&rdev->nr_pending); @@ -770,7 +782,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, test_bit(FailFast, &rdev->flags) && !test_bit(LastDev, &rdev->flags)) ff = MD_FAILFAST; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff; atomic_inc(&mddev->pending_writes); submit_bio(bio); @@ -788,7 +800,7 @@ int md_super_wait(struct mddev *mddev) int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int op, int op_flags, bool metadata_op) { - struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); + struct bio *bio = md_bio_alloc_sync(rdev->mddev); int ret; bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 
@@ -806,7 +818,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, submit_bio_wait(bio); - ret = !bio->bi_error; + ret = !bio->bi_status; bio_put(bio); return ret; } @@ -830,7 +842,7 @@ fail: return -EINVAL; } -static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) +static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) { return sb1->set_uuid0 == sb2->set_uuid0 && sb1->set_uuid1 == sb2->set_uuid1 && @@ -838,7 +850,7 @@ static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) sb1->set_uuid3 == sb2->set_uuid3; } -static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) { int ret; mdp_super_t *tmp1, *tmp2; @@ -1030,12 +1042,12 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor } else { __u64 ev1, ev2; mdp_super_t *refsb = page_address(refdev->sb_page); - if (!uuid_equal(refsb, sb)) { + if (!md_uuid_equal(refsb, sb)) { pr_warn("md: %s has different UUID to %s\n", b, bdevname(refdev->bdev,b2)); goto abort; } - if (!sb_equal(refsb, sb)) { + if (!md_sb_equal(refsb, sb)) { pr_warn("md: %s has same UUID but different superblock to %s\n", b, bdevname(refdev->bdev, b2)); goto abort; @@ -1524,6 +1536,12 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ } else if (sb->bblog_offset != 0) rdev->badblocks.shift = 0; + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); + rdev->ppl.size = le16_to_cpu(sb->ppl.size); + rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; + } + if (!refdev) { ret = 1; } else { @@ -1636,6 +1654,13 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); + + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + if (le32_to_cpu(sb->feature_map) & + (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) + return -EINVAL; + set_bit(MD_HAS_PPL, &mddev->flags); + } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count) */ @@ -1844,11 +1869,17 @@ retry: max_dev = le32_to_cpu(sb->max_dev); for (i=0; i<max_dev;i++) - sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); + if (test_bit(MD_HAS_PPL, &mddev->flags)) { + sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); + sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); + sb->ppl.size = cpu_to_le16(rdev->ppl.size); + } + rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) @@ -1896,7 +1927,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); - sb->super_offset = rdev->sb_start; + sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, @@ -2089,6 +2120,10 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; + if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) && + mddev->pers) + return -EROFS; + /* make sure rdev->sectors exceeds mddev->dev_sectors */ if (!test_bit(Journal, &rdev->flags) && rdev->sectors && @@ -2250,6 +2285,33 @@ static void export_array(struct mddev *mddev) 
mddev->major_version = 0; } +static bool set_in_sync(struct mddev *mddev) +{ + WARN_ON_ONCE(!spin_is_locked(&mddev->lock)); + if (!mddev->in_sync) { + mddev->sync_checkers++; + spin_unlock(&mddev->lock); + percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); + spin_lock(&mddev->lock); + if (!mddev->in_sync && + percpu_ref_is_zero(&mddev->writes_pending)) { + mddev->in_sync = 1; + /* + * Ensure ->in_sync is visible before we clear + * ->sync_checkers. + */ + smp_mb(); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + sysfs_notify_dirent_safe(mddev->sysfs_state); + } + if (--mddev->sync_checkers == 0) + percpu_ref_switch_to_percpu(&mddev->writes_pending); + } + if (mddev->safemode == 1) + mddev->safemode = 0; + return mddev->in_sync; +} + static void sync_sbs(struct mddev *mddev, int nospares) { /* Update each superblock (in-memory image), but @@ -2304,7 +2366,7 @@ static bool does_sb_need_changing(struct mddev *mddev) /* Check if any mddev parameters have changed */ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || - (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->layout != le32_to_cpu(sb->layout)) || (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) return true; @@ -3148,6 +3210,78 @@ static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) static struct rdev_sysfs_entry rdev_unack_bad_blocks = __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); +static ssize_t +ppl_sector_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); +} + +static ssize_t +ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned long long sector; + + if (kstrtoull(buf, 10, &sector) < 0) + return -EINVAL; + if (sector != (sector_t)sector) + return -EINVAL; + + if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && + rdev->raid_disk >= 0) + return -EBUSY; + + if (rdev->mddev->persistent) { + if (rdev->mddev->major_version == 0) + return -EINVAL; + if ((sector > rdev->sb_start && + sector - rdev->sb_start > S16_MAX) || + (sector < rdev->sb_start && + rdev->sb_start - sector > -S16_MIN)) + return -EINVAL; + rdev->ppl.offset = sector - rdev->sb_start; + } else if (!rdev->mddev->external) { + return -EBUSY; + } + rdev->ppl.sector = sector; + return len; +} + +static struct rdev_sysfs_entry rdev_ppl_sector = +__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); + +static ssize_t +ppl_size_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%u\n", rdev->ppl.size); +} + +static ssize_t +ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned int size; + + if (kstrtouint(buf, 10, &size) < 0) + return -EINVAL; + + if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && + rdev->raid_disk >= 0) + return -EBUSY; + + if (rdev->mddev->persistent) { + if (rdev->mddev->major_version == 0) + return -EINVAL; + if (size > U16_MAX) + return -EINVAL; + } else if (!rdev->mddev->external) { + return -EBUSY; + } + rdev->ppl.size = size; + return len; +} + +static struct rdev_sysfs_entry rdev_ppl_size = +__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, @@ -3158,6 +3292,8 @@ static struct attribute *rdev_default_attrs[] = { &rdev_recovery_start.attr, &rdev_bad_blocks.attr,
&rdev_unack_bad_blocks.attr, + &rdev_ppl_sector.attr, + &rdev_ppl_size.attr, NULL, }; static ssize_t @@ -3920,6 +4056,7 @@ array_state_show(struct mddev *mddev, char *page) st = read_auto; break; case 0: + spin_lock(&mddev->lock); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) st = write_pending; else if (mddev->in_sync) @@ -3928,6 +4065,7 @@ array_state_show(struct mddev *mddev, char *page) st = active_idle; else st = active; + spin_unlock(&mddev->lock); } else { if (list_empty(&mddev->disks) && @@ -3948,7 +4086,7 @@ static int restart_array(struct mddev *mddev); static ssize_t array_state_store(struct mddev *mddev, const char *buf, size_t len) { - int err; + int err = 0; enum array_state st = match_word(buf, array_states); if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { @@ -3961,18 +4099,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); - err = 0; } else /* st == clean */ { restart_array(mddev); - if (atomic_read(&mddev->writes_pending) == 0) { - if (mddev->in_sync == 0) { - mddev->in_sync = 1; - if (mddev->safemode == 1) - mddev->safemode = 0; - set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); - } - err = 0; - } else + if (!set_in_sync(mddev)) err = -EBUSY; } if (!err) @@ -4030,15 +4159,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) if (err) break; spin_lock(&mddev->lock); - if (atomic_read(&mddev->writes_pending) == 0) { - if (mddev->in_sync == 0) { - mddev->in_sync = 1; - if (mddev->safemode == 1) - mddev->safemode = 0; - set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); - } - err = 0; - } else + if (!set_in_sync(mddev)) err = -EBUSY; spin_unlock(&mddev->lock); } else @@ -4860,8 +4981,10 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len) return err; /* cluster raid doesn't support change array_sectors */ - if (mddev_is_clustered(mddev)) + if (mddev_is_clustered(mddev)) { + mddev_unlock(mddev); return -EINVAL; + } if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) @@ -4894,6 +5017,52 @@ static struct md_sysfs_entry md_array_size = __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, array_size_store); +static ssize_t +consistency_policy_show(struct mddev *mddev, char *page) +{ + int ret; + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + ret = sprintf(page, "journal\n"); + } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { + ret = sprintf(page, "ppl\n"); + } else if (mddev->bitmap) { + ret = sprintf(page, "bitmap\n"); + } else if (mddev->pers) { + if (mddev->pers->sync_request) + ret = sprintf(page, "resync\n"); + else + ret = sprintf(page, "none\n"); + } else { + ret = sprintf(page, "unknown\n"); + } + + return ret; +} + +static ssize_t +consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err = 0; + + if (mddev->pers) { + if (mddev->pers->change_consistency_policy) + err = mddev->pers->change_consistency_policy(mddev, buf); + else + err = -EBUSY; + } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { + set_bit(MD_HAS_PPL, &mddev->flags); + } else { + err = -EINVAL; + } + + return err ? 
err : len; +} + +static struct md_sysfs_entry md_consistency_policy = +__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, + consistency_policy_store); + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, @@ -4909,6 +5078,7 @@ static struct attribute *md_default_attrs[] = { &md_reshape_direction.attr, &md_array_size.attr, &max_corr_read_errors.attr, + &md_consistency_policy.attr, NULL, }; @@ -4993,6 +5163,7 @@ static void md_free(struct kobject *ko) del_gendisk(mddev->gendisk); put_disk(mddev->gendisk); } + percpu_ref_exit(&mddev->writes_pending); kfree(mddev); } @@ -5018,8 +5189,31 @@ static void mddev_delayed_delete(struct work_struct *ws) kobject_put(&mddev->kobj); } +static void no_op(struct percpu_ref *r) {} + +int mddev_init_writes_pending(struct mddev *mddev) +{ + if (mddev->writes_pending.percpu_count_ptr) + return 0; + if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0) + return -ENOMEM; + /* We want to start with the refcount at zero */ + percpu_ref_put(&mddev->writes_pending); + return 0; +} +EXPORT_SYMBOL_GPL(mddev_init_writes_pending); + static int md_alloc(dev_t dev, char *name) { + /* + * If dev is zero, name is the name of a device to allocate with + * an arbitrary minor number. It will be "md_???" + * If dev is non-zero it must be a device number with a MAJOR of + * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then + * the device is being created by opening a node in /dev. + * If "name" is not NULL, the device is being created by + * writing to /sys/module/md_mod/parameters/new_array. + */ static DEFINE_MUTEX(disks_mutex); struct mddev *mddev = mddev_find(dev); struct gendisk *disk; @@ -5045,7 +5239,7 @@ static int md_alloc(dev_t dev, char *name) if (mddev->gendisk) goto abort; - if (name) { + if (name && !dev) { /* Need to ensure that 'name' is not a duplicate. */ struct mddev *mddev2; @@ -5059,6 +5253,11 @@ static int md_alloc(dev_t dev, char *name) } spin_unlock(&all_mddevs_lock); } + if (name && dev) + /* + * Creating /dev/mdNNN via "newarray", so adjust hold_active. + */ + mddev->hold_active = UNTIL_STOP; error = -ENOMEM; mddev->queue = blk_alloc_queue(GFP_KERNEL); @@ -5125,38 +5324,48 @@ static int md_alloc(dev_t dev, char *name) static struct kobject *md_probe(dev_t dev, int *part, void *data) { - md_alloc(dev, NULL); + if (create_on_open) + md_alloc(dev, NULL); return NULL; } static int add_named_array(const char *val, struct kernel_param *kp) { - /* val must be "md_*" where * is not all digits. - * We allocate an array with a large free minor number, and + /* + * val must be "md_*" or "mdNNN". + * For "md_*" we allocate an array with a large free minor number, and * set the name to val. val must not already be an active name. + * For "mdNNN" we allocate an array with the minor number NNN + * which must not already be in use. 
*/ int len = strlen(val); char buf[DISK_NAME_LEN]; + unsigned long devnum; while (len && val[len-1] == '\n') len--; if (len >= DISK_NAME_LEN) return -E2BIG; strlcpy(buf, val, len+1); - if (strncmp(buf, "md_", 3) != 0) - return -EINVAL; - return md_alloc(0, buf); + if (strncmp(buf, "md_", 3) == 0) + return md_alloc(0, buf); + if (strncmp(buf, "md", 2) == 0 && + isdigit(buf[2]) && + kstrtoul(buf+2, 10, &devnum) == 0 && + devnum <= MINORMASK) + return md_alloc(MKDEV(MD_MAJOR, devnum), NULL); + + return -EINVAL; } static void md_safemode_timeout(unsigned long data) { struct mddev *mddev = (struct mddev *) data; - if (!atomic_read(&mddev->writes_pending)) { - mddev->safemode = 1; - if (mddev->external) - sysfs_notify_dirent_safe(mddev->sysfs_state); - } + mddev->safemode = 1; + if (mddev->external) + sysfs_notify_dirent_safe(mddev->sysfs_state); + md_wakeup_thread(mddev->thread); } @@ -5202,6 +5411,13 @@ int md_run(struct mddev *mddev) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev); + if (mddev->ro != 1 && + (bdev_read_only(rdev->bdev) || + bdev_read_only(rdev->meta_bdev))) { + mddev->ro = 1; + if (mddev->gendisk) + set_disk_ro(mddev->gendisk, 1); + } /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, @@ -5228,8 +5444,16 @@ int md_run(struct mddev *mddev) sysfs_notify_dirent_safe(rdev->sysfs_state); } - if (mddev->bio_set == NULL) - mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); + if (mddev->bio_set == NULL) { + mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (!mddev->bio_set) + return -ENOMEM; + } + if (mddev->sync_set == NULL) { + mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (!mddev->sync_set) + return -ENOMEM; + } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -5346,8 +5570,8 @@ int md_run(struct mddev *mddev) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue); else queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue); - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = md_congested; + mddev->queue->backing_dev_info->congested_data = mddev; + mddev->queue->backing_dev_info->congested_fn = md_congested; } if (pers->sync_request) { if (mddev->kobj.sd && @@ -5358,7 +5582,6 @@ int md_run(struct mddev *mddev) } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; - atomic_set(&mddev->writes_pending,0); atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; @@ -5424,6 +5647,9 @@ out: static int restart_array(struct mddev *mddev) { struct gendisk *disk = mddev->gendisk; + struct md_rdev *rdev; + bool has_journal = false; + bool has_readonly = false; /* Complain if it has no devices */ if (list_empty(&mddev->disks)) @@ -5432,24 +5658,21 @@ static int restart_array(struct mddev *mddev) return -EINVAL; if (!mddev->ro) return -EBUSY; - if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { - struct md_rdev *rdev; - bool has_journal = false; - - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) { - if (test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) { - has_journal = true; - break; - } - } - rcu_read_unlock(); + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + has_journal = true; + if (bdev_read_only(rdev->bdev)) + has_readonly = true; + } + rcu_read_unlock(); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) /* Don't 
restart rw with journal missing/faulty */ - if (!has_journal) return -EINVAL; - } + if (has_readonly) + return -EROFS; mddev->safemode = 0; mddev->ro = 0; @@ -5549,15 +5772,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - struct bitmap *bitmap = mddev->bitmap; - /* wait for behind writes to complete */ - if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - pr_debug("md:%s: behind writes in progress - waiting to stop.\n", - mdname(mddev)); - /* need to kick something here to make sure I/O goes? */ - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } + bitmap_wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -5570,6 +5785,7 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; + bitmap_destroy(mddev); mddev_detach(mddev); /* Ensure ->event_work is done */ flush_workqueue(md_misc_wq); @@ -5590,7 +5806,6 @@ void md_stop(struct mddev *mddev) * This is called from dm-raid */ __md_stop(mddev); - bitmap_destroy(mddev); if (mddev->bio_set) bioset_free(mddev->bio_set); } @@ -5704,7 +5919,7 @@ static int do_md_stop(struct mddev *mddev, int mode, __md_stop_writes(mddev); __md_stop(mddev); - mddev->queue->backing_dev_info.congested_fn = NULL; + mddev->queue->backing_dev_info->congested_fn = NULL; /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -5728,7 +5943,6 @@ static int do_md_stop(struct mddev *mddev, int mode, if (mode == 0) { pr_info("md: %s stopped.\n", mdname(mddev)); - bitmap_destroy(mddev); if (mddev->bitmap_info.file) { struct file *f = mddev->bitmap_info.file; spin_lock(&mddev->lock); @@ -6464,11 +6678,10 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->layout = info->layout; mddev->chunk_sectors = info->chunk_size >> 9; - mddev->max_disks = MD_SB_DISKS; - if (mddev->persistent) { - mddev->flags = 0; - mddev->sb_flags = 0; + mddev->max_disks = MD_SB_DISKS; + mddev->flags = 0; + mddev->sb_flags = 0; } set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -6508,10 +6721,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) struct md_rdev *rdev; int rv; int fit = (num_sectors == 0); - - /* cluster raid doesn't support update size */ - if (mddev_is_clustered(mddev)) - return -EINVAL; + sector_t old_dev_sectors = mddev->dev_sectors; if (mddev->pers->resize == NULL) return -EINVAL; @@ -6539,8 +6749,14 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) - revalidate_disk(mddev->gendisk); + if (!rv) { + if (mddev_is_clustered(mddev)) + md_cluster_ops->update_size(mddev, old_dev_sectors); + else if (mddev->queue) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } + } return rv; } @@ -6787,6 +7003,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, void __user *argp = (void __user *)arg; struct mddev *mddev = NULL; int ro; + bool did_set_md_closing = false; if (!md_ioctl_valid(cmd)) return -ENOTTY; @@ -6876,7 +7093,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -EBUSY; goto out; } + WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags)); set_bit(MD_CLOSING, &mddev->flags); + did_set_md_closing = true; mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); } @@ -7069,6 +7288,8 @@ unlock: mddev->hold_active = 
0; mddev_unlock(mddev); out: + if(did_set_md_closing) + clear_bit(MD_CLOSING, &mddev->flags); return err; } #ifdef CONFIG_COMPAT @@ -7219,8 +7440,8 @@ void md_wakeup_thread(struct md_thread *thread) { if (thread) { pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - set_bit(THREAD_WAKEUP, &thread->flags); - wake_up(&thread->wqueue); + if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags)) + wake_up(&thread->wqueue); } } EXPORT_SYMBOL(md_wakeup_thread); @@ -7751,12 +7972,14 @@ EXPORT_SYMBOL(md_done_sync); * If we need to update some array metadata (e.g. 'active' flag * in superblock) before writing, schedule a superblock update * and wait for it to complete. + * A return value of 'false' means that the write wasn't recorded + * and cannot proceed as the array is being suspend. */ -void md_write_start(struct mddev *mddev, struct bio *bi) +bool md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) - return; + return true; BUG_ON(mddev->ro == 1); if (mddev->ro == 2) { @@ -7767,10 +7990,13 @@ void md_write_start(struct mddev *mddev, struct bio *bi) md_wakeup_thread(mddev->sync_thread); did_change = 1; } - atomic_inc(&mddev->writes_pending); + rcu_read_lock(); + percpu_ref_get(&mddev->writes_pending); + smp_mb(); /* Match smp_mb in set_in_sync() */ if (mddev->safemode == 1) mddev->safemode = 0; - if (mddev->in_sync) { + /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ + if (mddev->in_sync || !mddev->sync_checkers) { spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; @@ -7781,22 +8007,51 @@ void md_write_start(struct mddev *mddev, struct bio *bi) } spin_unlock(&mddev->lock); } + rcu_read_unlock(); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + percpu_ref_put(&mddev->writes_pending); + return false; + } + return true; } EXPORT_SYMBOL(md_write_start); +/* md_write_inc can only be called when md_write_start() has + * already been called at least once of the current request. + * It increments the counter and is useful when a single request + * is split into several parts. Each part causes an increment and + * so needs a matching md_write_end(). + * Unlike md_write_start(), it is safe to call md_write_inc() inside + * a spinlocked region. + */ +void md_write_inc(struct mddev *mddev, struct bio *bi) +{ + if (bio_data_dir(bi) != WRITE) + return; + WARN_ON_ONCE(mddev->in_sync || mddev->ro); + percpu_ref_get(&mddev->writes_pending); +} +EXPORT_SYMBOL(md_write_inc); + void md_write_end(struct mddev *mddev) { - if (atomic_dec_and_test(&mddev->writes_pending)) { - if (mddev->safemode == 2) - md_wakeup_thread(mddev->thread); - else if (mddev->safemode_delay) - mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); - } + percpu_ref_put(&mddev->writes_pending); + + if (mddev->safemode == 2) + md_wakeup_thread(mddev->thread); + else if (mddev->safemode_delay) + /* The roundup() ensures this only performs locking once + * every ->safemode_delay jiffies + */ + mod_timer(&mddev->safemode_timer, + roundup(jiffies, mddev->safemode_delay) + + mddev->safemode_delay); } + EXPORT_SYMBOL(md_write_end); /* md_allow_write(mddev) @@ -7804,18 +8059,15 @@ EXPORT_SYMBOL(md_write_end); * may proceed without blocking. 
It is important to call this before * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. - * - * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock - * is dropped, so return -EAGAIN after notifying userspace. */ -int md_allow_write(struct mddev *mddev) +void md_allow_write(struct mddev *mddev) { if (!mddev->pers) - return 0; + return; if (mddev->ro) - return 0; + return; if (!mddev->pers->sync_request) - return 0; + return; spin_lock(&mddev->lock); if (mddev->in_sync) { @@ -7828,13 +8080,12 @@ int md_allow_write(struct mddev *mddev) spin_unlock(&mddev->lock); md_update_sb(mddev, 0); sysfs_notify_dirent_safe(mddev->sysfs_state); + /* wait for the dirty state to be recorded in the metadata */ + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) && + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); - - if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) - return -EAGAIN; - else - return 0; } EXPORT_SYMBOL_GPL(md_allow_write); @@ -8396,9 +8647,8 @@ void md_check_recovery(struct mddev *mddev) (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || - test_bit(MD_RELOAD_SB, &mddev->flags) || (mddev->external == 0 && mddev->safemode == 1) || - (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) + (mddev->safemode == 2 && !mddev->in_sync && mddev->recovery_cp == MaxSector) )) return; @@ -8445,27 +8695,12 @@ void md_check_recovery(struct mddev *mddev) rdev->raid_disk < 0) md_kick_rdev_from_array(rdev); } - - if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags)) - md_reload_sb(mddev, mddev->good_device_nr); } - if (!mddev->external) { - int did_change = 0; + if (!mddev->external && !mddev->in_sync) { spin_lock(&mddev->lock); - if (mddev->safemode && - !atomic_read(&mddev->writes_pending) && - !mddev->in_sync && - mddev->recovery_cp == MaxSector) { - mddev->in_sync = 1; - did_change = 1; - set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); - } - if (mddev->safemode == 1) - mddev->safemode = 0; + set_in_sync(mddev); spin_unlock(&mddev->lock); - if (did_change) - sysfs_notify_dirent_safe(mddev->sysfs_state); } if (mddev->sb_flags) @@ -8758,6 +8993,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) int role, ret; char b[BDEVNAME_SIZE]; + /* + * If size is changed in another node then we need to + * do resize as well. + */ + if (mddev->dev_sectors != le64_to_cpu(sb->size)) { + ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); + if (ret) + pr_info("md-cluster: resize failed\n"); + else + bitmap_update_sb(mddev->bitmap); + } + /* Check for change of roles in the active devices */ rdev_for_each(rdev2, mddev) { if (test_bit(Faulty, &rdev2->flags)) @@ -8980,7 +9227,14 @@ static __exit void md_exit(void) for_each_mddev(mddev, tmp) { export_array(mddev); + mddev->ctime = 0; mddev->hold_active = 0; + /* + * for_each_mddev() will call mddev_put() at the end of each + * iteration. As the mddev is now fully clear, this will + * schedule the mddev for destruction by a workqueue, and the + * destroy_workqueue() below will wait for that to complete. 
+ */ } destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); @@ -9001,6 +9255,7 @@ static int set_ro(const char *val, struct kernel_param *kp) module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); +module_param(create_on_open, bool, S_IRUSR|S_IWUSR); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); |
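The patch reworks write accounting: md_write_start() now returns bool, md_write_end() pairs with it on completion, and the new md_write_inc() covers split requests. The sketch below (not part of the patch) illustrates the calling convention a personality's ->make_request() is expected to follow after this change; toy_make_request() is hypothetical, while md_write_start(), md_write_inc(), md_write_end() and bio_endio() are the real interfaces from drivers/md/md.h and linux/bio.h.

/*
 * Sketch only, assuming the context of an md personality built against
 * this kernel (include drivers/md/md.h).
 */
static bool toy_make_request(struct mddev *mddev, struct bio *bio)
{
	/*
	 * For WRITE bios this takes a reference on mddev->writes_pending
	 * (now a percpu_ref) and may wait for the superblock to mark the
	 * array dirty.  A false return means the array is suspending: the
	 * bio was not consumed, and md_make_request() retries it after
	 * dropping active_io, as in the check_suspended hunk above.
	 */
	if (!md_write_start(mddev, bio))
		return false;

	/*
	 * A real personality maps the bio to member devices here.  If it
	 * splits the request, each extra fragment takes its own reference
	 * with md_write_inc(mddev, bio) and drops it with a matching
	 * md_write_end(); unlike md_write_start(), md_write_inc() may be
	 * called under a spinlock.  This toy simply completes the request.
	 */
	if (bio_data_dir(bio) == WRITE)
		md_write_end(mddev);	/* normally called from the write-completion path */
	bio_endio(bio);
	return true;
}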