diff options
Diffstat (limited to 'drivers/md/raid5.c')
| -rw-r--r-- | drivers/md/raid5.c | 320 | 
1 files changed, 225 insertions, 95 deletions
| diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d2c0f94fa37d..866d4b5a144c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -201,11 +201,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)  		if (test_bit(STRIPE_HANDLE, &sh->state)) {  			if (test_bit(STRIPE_DELAYED, &sh->state)) {  				list_add_tail(&sh->lru, &conf->delayed_list); -				blk_plug_device(conf->mddev->queue); +				plugger_set_plug(&conf->plug);  			} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&  				   sh->bm_seq - conf->seq_write > 0) {  				list_add_tail(&sh->lru, &conf->bitmap_list); -				blk_plug_device(conf->mddev->queue); +				plugger_set_plug(&conf->plug);  			} else {  				clear_bit(STRIPE_BIT_DELAY, &sh->state);  				list_add_tail(&sh->lru, &conf->handle_list); @@ -277,12 +277,13 @@ out:  	return sh;  } -static void shrink_buffers(struct stripe_head *sh, int num) +static void shrink_buffers(struct stripe_head *sh)  {  	struct page *p;  	int i; +	int num = sh->raid_conf->pool_size; -	for (i=0; i<num ; i++) { +	for (i = 0; i < num ; i++) {  		p = sh->dev[i].page;  		if (!p)  			continue; @@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num)  	}  } -static int grow_buffers(struct stripe_head *sh, int num) +static int grow_buffers(struct stripe_head *sh)  {  	int i; +	int num = sh->raid_conf->pool_size; -	for (i=0; i<num; i++) { +	for (i = 0; i < num; i++) {  		struct page *page;  		if (!(page = alloc_page(GFP_KERNEL))) { @@ -364,8 +366,74 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,  	return NULL;  } +/* + * Need to check if array has failed when deciding whether to: + *  - start an array + *  - remove non-faulty devices + *  - add a spare + *  - allow a reshape + * This determination is simple when no reshape is happening. + * However if there is a reshape, we need to carefully check + * both the before and after sections. + * This is because some failed devices may only affect one + * of the two sections, and some non-in_sync devices may + * be insync in the section most affected by failed devices. + */ +static int has_failed(raid5_conf_t *conf) +{ +	int degraded; +	int i; +	if (conf->mddev->reshape_position == MaxSector) +		return conf->mddev->degraded > conf->max_degraded; + +	rcu_read_lock(); +	degraded = 0; +	for (i = 0; i < conf->previous_raid_disks; i++) { +		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); +		if (!rdev || test_bit(Faulty, &rdev->flags)) +			degraded++; +		else if (test_bit(In_sync, &rdev->flags)) +			; +		else +			/* not in-sync or faulty. +			 * If the reshape increases the number of devices, +			 * this is being recovered by the reshape, so +			 * this 'previous' section is not in_sync. +			 * If the number of devices is being reduced however, +			 * the device can only be part of the array if +			 * we are reverting a reshape, so this section will +			 * be in-sync. +			 */ +			if (conf->raid_disks >= conf->previous_raid_disks) +				degraded++; +	} +	rcu_read_unlock(); +	if (degraded > conf->max_degraded) +		return 1; +	rcu_read_lock(); +	degraded = 0; +	for (i = 0; i < conf->raid_disks; i++) { +		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); +		if (!rdev || test_bit(Faulty, &rdev->flags)) +			degraded++; +		else if (test_bit(In_sync, &rdev->flags)) +			; +		else +			/* not in-sync or faulty. +			 * If reshape increases the number of devices, this +			 * section has already been recovered, else it +			 * almost certainly hasn't. +			 */ +			if (conf->raid_disks <= conf->previous_raid_disks) +				degraded++; +	} +	rcu_read_unlock(); +	if (degraded > conf->max_degraded) +		return 1; +	return 0; +} +  static void unplug_slaves(mddev_t *mddev); -static void raid5_unplug_device(struct request_queue *q);  static struct stripe_head *  get_active_stripe(raid5_conf_t *conf, sector_t sector, @@ -395,7 +463,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,  						     < (conf->max_nr_stripes *3/4)  						     || !conf->inactive_blocked),  						    conf->device_lock, -						    raid5_unplug_device(conf->mddev->queue) +						    md_raid5_unplug_device(conf)  					);  				conf->inactive_blocked = 0;  			} else @@ -1240,19 +1308,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)  static int grow_one_stripe(raid5_conf_t *conf)  {  	struct stripe_head *sh; -	int disks = max(conf->raid_disks, conf->previous_raid_disks);  	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);  	if (!sh)  		return 0; -	memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev)); +	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));  	sh->raid_conf = conf;  	spin_lock_init(&sh->lock);  	#ifdef CONFIG_MULTICORE_RAID456  	init_waitqueue_head(&sh->ops.wait_for_ops);  	#endif -	if (grow_buffers(sh, disks)) { -		shrink_buffers(sh, disks); +	if (grow_buffers(sh)) { +		shrink_buffers(sh);  		kmem_cache_free(conf->slab_cache, sh);  		return 0;  	} @@ -1269,10 +1336,14 @@ static int grow_stripes(raid5_conf_t *conf, int num)  	struct kmem_cache *sc;  	int devs = max(conf->raid_disks, conf->previous_raid_disks); -	sprintf(conf->cache_name[0], -		"raid%d-%s", conf->level, mdname(conf->mddev)); -	sprintf(conf->cache_name[1], -		"raid%d-%s-alt", conf->level, mdname(conf->mddev)); +	if (conf->mddev->gendisk) +		sprintf(conf->cache_name[0], +			"raid%d-%s", conf->level, mdname(conf->mddev)); +	else +		sprintf(conf->cache_name[0], +			"raid%d-%p", conf->level, conf->mddev); +	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); +  	conf->active_name = 0;  	sc = kmem_cache_create(conf->cache_name[conf->active_name],  			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), @@ -1468,7 +1539,7 @@ static int drop_one_stripe(raid5_conf_t *conf)  	if (!sh)  		return 0;  	BUG_ON(atomic_read(&sh->count)); -	shrink_buffers(sh, conf->pool_size); +	shrink_buffers(sh);  	kmem_cache_free(conf->slab_cache, sh);  	atomic_dec(&conf->active_stripes);  	return 1; @@ -2963,7 +3034,6 @@ static void handle_stripe5(struct stripe_head *sh)  		mdk_rdev_t *rdev;  		dev = &sh->dev[i]; -		clear_bit(R5_Insync, &dev->flags);  		pr_debug("check %d: state 0x%lx toread %p read %p write %p "  			"written %p\n",	i, dev->flags, dev->toread, dev->read, @@ -3000,17 +3070,27 @@ static void handle_stripe5(struct stripe_head *sh)  			blocked_rdev = rdev;  			atomic_inc(&rdev->nr_pending);  		} -		if (!rdev || !test_bit(In_sync, &rdev->flags)) { +		clear_bit(R5_Insync, &dev->flags); +		if (!rdev) +			/* Not in-sync */; +		else if (test_bit(In_sync, &rdev->flags)) +			set_bit(R5_Insync, &dev->flags); +		else { +			/* could be in-sync depending on recovery/reshape status */ +			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) +				set_bit(R5_Insync, &dev->flags); +		} +		if (!test_bit(R5_Insync, &dev->flags)) {  			/* The ReadError flag will just be confusing now */  			clear_bit(R5_ReadError, &dev->flags);  			clear_bit(R5_ReWrite, &dev->flags);  		} -		if (!rdev || !test_bit(In_sync, &rdev->flags) -		    || test_bit(R5_ReadError, &dev->flags)) { +		if (test_bit(R5_ReadError, &dev->flags)) +			clear_bit(R5_Insync, &dev->flags); +		if (!test_bit(R5_Insync, &dev->flags)) {  			s.failed++;  			s.failed_num = i; -		} else -			set_bit(R5_Insync, &dev->flags); +		}  	}  	rcu_read_unlock(); @@ -3244,7 +3324,6 @@ static void handle_stripe6(struct stripe_head *sh)  	for (i=disks; i--; ) {  		mdk_rdev_t *rdev;  		dev = &sh->dev[i]; -		clear_bit(R5_Insync, &dev->flags);  		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",  			i, dev->flags, dev->toread, dev->towrite, dev->written); @@ -3282,18 +3361,28 @@ static void handle_stripe6(struct stripe_head *sh)  			blocked_rdev = rdev;  			atomic_inc(&rdev->nr_pending);  		} -		if (!rdev || !test_bit(In_sync, &rdev->flags)) { +		clear_bit(R5_Insync, &dev->flags); +		if (!rdev) +			/* Not in-sync */; +		else if (test_bit(In_sync, &rdev->flags)) +			set_bit(R5_Insync, &dev->flags); +		else { +			/* in sync if before recovery_offset */ +			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) +				set_bit(R5_Insync, &dev->flags); +		} +		if (!test_bit(R5_Insync, &dev->flags)) {  			/* The ReadError flag will just be confusing now */  			clear_bit(R5_ReadError, &dev->flags);  			clear_bit(R5_ReWrite, &dev->flags);  		} -		if (!rdev || !test_bit(In_sync, &rdev->flags) -		    || test_bit(R5_ReadError, &dev->flags)) { +		if (test_bit(R5_ReadError, &dev->flags)) +			clear_bit(R5_Insync, &dev->flags); +		if (!test_bit(R5_Insync, &dev->flags)) {  			if (s.failed < 2)  				r6s.failed_num[s.failed] = i;  			s.failed++; -		} else -			set_bit(R5_Insync, &dev->flags); +		}  	}  	rcu_read_unlock(); @@ -3528,7 +3617,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)  			list_add_tail(&sh->lru, &conf->hold_list);  		}  	} else -		blk_plug_device(conf->mddev->queue); +		plugger_set_plug(&conf->plug);  }  static void activate_bit_delay(raid5_conf_t *conf) @@ -3569,36 +3658,44 @@ static void unplug_slaves(mddev_t *mddev)  	rcu_read_unlock();  } -static void raid5_unplug_device(struct request_queue *q) +void md_raid5_unplug_device(raid5_conf_t *conf)  { -	mddev_t *mddev = q->queuedata; -	raid5_conf_t *conf = mddev->private;  	unsigned long flags;  	spin_lock_irqsave(&conf->device_lock, flags); -	if (blk_remove_plug(q)) { +	if (plugger_remove_plug(&conf->plug)) {  		conf->seq_flush++;  		raid5_activate_delayed(conf);  	} -	md_wakeup_thread(mddev->thread); +	md_wakeup_thread(conf->mddev->thread);  	spin_unlock_irqrestore(&conf->device_lock, flags); -	unplug_slaves(mddev); +	unplug_slaves(conf->mddev);  } +EXPORT_SYMBOL_GPL(md_raid5_unplug_device); -static int raid5_congested(void *data, int bits) +static void raid5_unplug(struct plug_handle *plug) +{ +	raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug); +	md_raid5_unplug_device(conf); +} + +static void raid5_unplug_queue(struct request_queue *q) +{ +	mddev_t *mddev = q->queuedata; +	md_raid5_unplug_device(mddev->private); +} + +int md_raid5_congested(mddev_t *mddev, int bits)  { -	mddev_t *mddev = data;  	raid5_conf_t *conf = mddev->private;  	/* No difference between reads and writes.  Just check  	 * how busy the stripe_cache is  	 */ -	if (mddev_congested(mddev, bits)) -		return 1;  	if (conf->inactive_blocked)  		return 1;  	if (conf->quiesce) @@ -3608,6 +3705,15 @@ static int raid5_congested(void *data, int bits)  	return 0;  } +EXPORT_SYMBOL_GPL(md_raid5_congested); + +static int raid5_congested(void *data, int bits) +{ +	mddev_t *mddev = data; + +	return mddev_congested(mddev, bits) || +		md_raid5_congested(mddev, bits); +}  /* We want read requests to align with chunks where possible,   * but write requests don't need to. @@ -3872,7 +3978,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)  	const int rw = bio_data_dir(bi);  	int remaining; -	if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { +	if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) {  		/* Drain all pending writes.  We only really need  		 * to ensure they have been submitted, but this is  		 * easier. @@ -3989,7 +4095,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)  				 * add failed due to overlap.  Flush everything  				 * and wait a while  				 */ -				raid5_unplug_device(mddev->queue); +				md_raid5_unplug_device(conf);  				release_stripe(sh);  				schedule();  				goto retry; @@ -4480,23 +4586,15 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page)  		return 0;  } -static ssize_t -raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) +int +raid5_set_cache_size(mddev_t *mddev, int size)  {  	raid5_conf_t *conf = mddev->private; -	unsigned long new;  	int err; -	if (len >= PAGE_SIZE) -		return -EINVAL; -	if (!conf) -		return -ENODEV; - -	if (strict_strtoul(page, 10, &new)) +	if (size <= 16 || size > 32768)  		return -EINVAL; -	if (new <= 16 || new > 32768) -		return -EINVAL; -	while (new < conf->max_nr_stripes) { +	while (size < conf->max_nr_stripes) {  		if (drop_one_stripe(conf))  			conf->max_nr_stripes--;  		else @@ -4505,11 +4603,32 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)  	err = md_allow_write(mddev);  	if (err)  		return err; -	while (new > conf->max_nr_stripes) { +	while (size > conf->max_nr_stripes) {  		if (grow_one_stripe(conf))  			conf->max_nr_stripes++;  		else break;  	} +	return 0; +} +EXPORT_SYMBOL(raid5_set_cache_size); + +static ssize_t +raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) +{ +	raid5_conf_t *conf = mddev->private; +	unsigned long new; +	int err; + +	if (len >= PAGE_SIZE) +		return -EINVAL; +	if (!conf) +		return -ENODEV; + +	if (strict_strtoul(page, 10, &new)) +		return -EINVAL; +	err = raid5_set_cache_size(mddev, new); +	if (err) +		return err;  	return len;  } @@ -4872,7 +4991,7 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded  static int run(mddev_t *mddev)  {  	raid5_conf_t *conf; -	int working_disks = 0, chunk_size; +	int working_disks = 0;  	int dirty_parity_disks = 0;  	mdk_rdev_t *rdev;  	sector_t reshape_offset = 0; @@ -4971,8 +5090,10 @@ static int run(mddev_t *mddev)  	list_for_each_entry(rdev, &mddev->disks, same_set) {  		if (rdev->raid_disk < 0)  			continue; -		if (test_bit(In_sync, &rdev->flags)) +		if (test_bit(In_sync, &rdev->flags)) {  			working_disks++; +			continue; +		}  		/* This disc is not fully in-sync.  However if it  		 * just stored parity (beyond the recovery_offset),  		 * when we don't need to be concerned about the @@ -5005,7 +5126,7 @@ static int run(mddev_t *mddev)  	mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)  			   - working_disks); -	if (mddev->degraded > conf->max_degraded) { +	if (has_failed(conf)) {  		printk(KERN_ERR "md/raid:%s: not enough operational devices"  			" (%d/%d failed)\n",  			mdname(mddev), mddev->degraded, conf->raid_disks); @@ -5056,42 +5177,47 @@ static int run(mddev_t *mddev)  							"reshape");  	} -	/* read-ahead size must cover two whole stripes, which is -	 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices -	 */ -	{ -		int data_disks = conf->previous_raid_disks - conf->max_degraded; -		int stripe = data_disks * -			((mddev->chunk_sectors << 9) / PAGE_SIZE); -		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) -			mddev->queue->backing_dev_info.ra_pages = 2 * stripe; -	}  	/* Ok, everything is just fine now */  	if (mddev->to_remove == &raid5_attrs_group)  		mddev->to_remove = NULL; -	else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) +	else if (mddev->kobj.sd && +	    sysfs_create_group(&mddev->kobj, &raid5_attrs_group))  		printk(KERN_WARNING -		       "md/raid:%s: failed to create sysfs attributes.\n", +		       "raid5: failed to create sysfs attributes for %s\n",  		       mdname(mddev)); +	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); -	mddev->queue->queue_lock = &conf->device_lock; +	plugger_init(&conf->plug, raid5_unplug); +	mddev->plug = &conf->plug; +	if (mddev->queue) { +		int chunk_size; +		/* read-ahead size must cover two whole stripes, which +		 * is 2 * (datadisks) * chunksize where 'n' is the +		 * number of raid devices +		 */ +		int data_disks = conf->previous_raid_disks - conf->max_degraded; +		int stripe = data_disks * +			((mddev->chunk_sectors << 9) / PAGE_SIZE); +		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) +			mddev->queue->backing_dev_info.ra_pages = 2 * stripe; -	mddev->queue->unplug_fn = raid5_unplug_device; -	mddev->queue->backing_dev_info.congested_data = mddev; -	mddev->queue->backing_dev_info.congested_fn = raid5_congested; +		blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); -	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); +		mddev->queue->backing_dev_info.congested_data = mddev; +		mddev->queue->backing_dev_info.congested_fn = raid5_congested; +		mddev->queue->queue_lock = &conf->device_lock; +		mddev->queue->unplug_fn = raid5_unplug_queue; -	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); -	chunk_size = mddev->chunk_sectors << 9; -	blk_queue_io_min(mddev->queue, chunk_size); -	blk_queue_io_opt(mddev->queue, chunk_size * -			 (conf->raid_disks - conf->max_degraded)); +		chunk_size = mddev->chunk_sectors << 9; +		blk_queue_io_min(mddev->queue, chunk_size); +		blk_queue_io_opt(mddev->queue, chunk_size * +				 (conf->raid_disks - conf->max_degraded)); -	list_for_each_entry(rdev, &mddev->disks, same_set) -		disk_stack_limits(mddev->gendisk, rdev->bdev, -				  rdev->data_offset << 9); +		list_for_each_entry(rdev, &mddev->disks, same_set) +			disk_stack_limits(mddev->gendisk, rdev->bdev, +					  rdev->data_offset << 9); +	}  	return 0;  abort: @@ -5112,8 +5238,9 @@ static int stop(mddev_t *mddev)  	md_unregister_thread(mddev->thread);  	mddev->thread = NULL; -	mddev->queue->backing_dev_info.congested_fn = NULL; -	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +	if (mddev->queue) +		mddev->queue->backing_dev_info.congested_fn = NULL; +	plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/  	free_conf(conf);  	mddev->private = NULL;  	mddev->to_remove = &raid5_attrs_group; @@ -5207,6 +5334,7 @@ static int raid5_spare_active(mddev_t *mddev)  	for (i = 0; i < conf->raid_disks; i++) {  		tmp = conf->disks + i;  		if (tmp->rdev +		    && tmp->rdev->recovery_offset == MaxSector  		    && !test_bit(Faulty, &tmp->rdev->flags)  		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {  			unsigned long flags; @@ -5242,7 +5370,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)  		 * isn't possible.  		 */  		if (!test_bit(Faulty, &rdev->flags) && -		    mddev->degraded <= conf->max_degraded && +		    !has_failed(conf) &&  		    number < conf->raid_disks) {  			err = -EBUSY;  			goto abort; @@ -5270,7 +5398,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)  	int first = 0;  	int last = conf->raid_disks - 1; -	if (mddev->degraded > conf->max_degraded) +	if (has_failed(conf))  		/* no point adding a device */  		return -EINVAL; @@ -5362,7 +5490,7 @@ static int check_reshape(mddev_t *mddev)  	if (mddev->bitmap)  		/* Cannot grow a bitmap yet */  		return -EBUSY; -	if (mddev->degraded > conf->max_degraded) +	if (has_failed(conf))  		return -EINVAL;  	if (mddev->delta_disks < 0) {  		/* We might be able to shrink, but the devices must @@ -5437,8 +5565,13 @@ static int raid5_start_reshape(mddev_t *mddev)  	/* Add some new drives, as many as will fit.  	 * We know there are enough to make the newly sized array work. +	 * Don't add devices if we are reducing the number of +	 * devices in the array.  This is because it is not possible +	 * to correctly record the "partially reconstructed" state of +	 * such devices during the reshape and confusion could result.  	 */ -	list_for_each_entry(rdev, &mddev->disks, same_set) +	if (mddev->delta_disks >= 0) +	    list_for_each_entry(rdev, &mddev->disks, same_set)  		if (rdev->raid_disk < 0 &&  		    !test_bit(Faulty, &rdev->flags)) {  			if (raid5_add_disk(mddev, rdev) == 0) { @@ -5451,16 +5584,13 @@ static int raid5_start_reshape(mddev_t *mddev)  				sprintf(nm, "rd%d", rdev->raid_disk);  				if (sysfs_create_link(&mddev->kobj,  						      &rdev->kobj, nm)) -					printk(KERN_WARNING -					       "md/raid:%s: failed to create " -					       " link %s\n", -					       mdname(mddev), nm); +					/* Failure here is OK */;  			} else  				break;  		}  	/* When a reshape changes the number of devices, ->degraded -	 * is measured against the large of the pre and post number of +	 * is measured against the larger of the pre and post number of  	 * devices.*/  	if (mddev->delta_disks > 0) {  		spin_lock_irqsave(&conf->device_lock, flags); @@ -5509,7 +5639,7 @@ static void end_reshape(raid5_conf_t *conf)  		/* read-ahead size must cover two whole stripes, which is  		 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices  		 */ -		{ +		if (conf->mddev->queue) {  			int data_disks = conf->raid_disks - conf->max_degraded;  			int stripe = data_disks * ((conf->chunk_sectors << 9)  						   / PAGE_SIZE); |