Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--	drivers/md/raid5.c	325
1 file changed, 175 insertions(+), 150 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 59e44e99eef3..15ef2c641b2b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -223,18 +223,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 	return slot;
 }
 
-static void return_io(struct bio *return_bi)
+static void return_io(struct bio_list *return_bi)
 {
-	struct bio *bi = return_bi;
-	while (bi) {
-
-		return_bi = bi->bi_next;
-		bi->bi_next = NULL;
+	struct bio *bi;
+	while ((bi = bio_list_pop(return_bi)) != NULL) {
 		bi->bi_iter.bi_size = 0;
 		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 					 bi, 0);
-		bio_endio(bi, 0);
-		bi = return_bi;
+		bio_endio(bi);
 	}
 }
 
@@ -887,9 +883,9 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
 }
 
 static void
-raid5_end_read_request(struct bio *bi, int error);
+raid5_end_read_request(struct bio *bi);
 static void
-raid5_end_write_request(struct bio *bi, int error);
+raid5_end_write_request(struct bio *bi);
 
 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 {
@@ -1177,7 +1173,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 static void ops_complete_biofill(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
-	struct bio *return_bi = NULL;
+	struct bio_list return_bi = BIO_EMPTY_LIST;
 	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
@@ -1201,17 +1197,15 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
-				if (!raid5_dec_bi_active_stripes(rbi)) {
-					rbi->bi_next = return_bi;
-					return_bi = rbi;
-				}
+				if (!raid5_dec_bi_active_stripes(rbi))
+					bio_list_add(&return_bi, rbi);
 				rbi = rbi2;
 			}
 		}
 	}
 	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
-	return_io(return_bi);
+	return_io(&return_bi);
 
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
@@ -2162,6 +2156,9 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	if (!sc)
 		return -ENOMEM;
 
+	/* Need to ensure auto-resizing doesn't interfere */
+	mutex_lock(&conf->cache_size_mutex);
+
 	for (i = conf->max_nr_stripes; i; i--) {
 		nsh = alloc_stripe(sc, GFP_KERNEL);
 		if (!nsh)
@@ -2178,6 +2175,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			kmem_cache_free(sc, nsh);
 		}
 		kmem_cache_destroy(sc);
+		mutex_unlock(&conf->cache_size_mutex);
 		return -ENOMEM;
 	}
 	/* Step 2 - Must use GFP_NOIO now.
@@ -2224,6 +2222,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	} else
 		err = -ENOMEM;
 
+	mutex_unlock(&conf->cache_size_mutex);
 	/* Step 4, return new stripes to service */
 	while(!list_empty(&newstripes)) {
 		nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2251,7 +2250,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 static int drop_one_stripe(struct r5conf *conf)
 {
 	struct stripe_head *sh;
-	int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
+	int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
 
 	spin_lock_irq(conf->hash_locks + hash);
 	sh = get_free_stripe(conf, hash);
@@ -2277,12 +2276,11 @@ static void shrink_stripes(struct r5conf *conf)
 	conf->slab_cache = NULL;
 }
 
-static void raid5_end_read_request(struct bio * bi, int error)
+static void raid5_end_read_request(struct bio * bi)
 {
 	struct stripe_head *sh = bi->bi_private;
 	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks, i;
-	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev = NULL;
 	sector_t s;
@@ -2291,9 +2289,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		if (bi == &sh->dev[i].req)
 			break;
 
-	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
+	pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
 		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
-		uptodate);
+		bi->bi_error);
 	if (i == disks) {
 		BUG();
 		return;
@@ -2312,7 +2310,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		s = sh->sector + rdev->new_data_offset;
 	else
 		s = sh->sector + rdev->data_offset;
-	if (uptodate) {
+	if (!bi->bi_error) {
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 			/* Note that this cannot happen on a
@@ -2400,13 +2398,12 @@ static void raid5_end_read_request(struct bio * bi, int error)
 	release_stripe(sh);
 }
 
-static void raid5_end_write_request(struct bio *bi, int error)
+static void raid5_end_write_request(struct bio *bi)
 {
 	struct stripe_head *sh = bi->bi_private;
 	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks, i;
 	struct md_rdev *uninitialized_var(rdev);
-	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
 	sector_t first_bad;
 	int bad_sectors;
 	int replacement = 0;
@@ -2429,23 +2426,23 @@ static void raid5_end_write_request(struct bio *bi, int error)
 			break;
 		}
 	}
-	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
+	pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
 		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
-		uptodate);
+		bi->bi_error);
 	if (i == disks) {
 		BUG();
 		return;
 	}
 	if (replacement) {
-		if (!uptodate)
+		if (bi->bi_error)
 			md_error(conf->mddev, rdev);
 		else if (is_badblock(rdev, sh->sector,
 				     STRIPE_SECTORS,
 				     &first_bad, &bad_sectors))
 			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
 	} else {
-		if (!uptodate) {
+		if (bi->bi_error) {
 			set_bit(STRIPE_DEGRADED, &sh->state);
 			set_bit(WriteErrorSeen, &rdev->flags);
 			set_bit(R5_WriteError, &sh->dev[i].flags);
@@ -2466,7 +2463,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
 	}
 	rdev_dec_pending(rdev, conf->mddev);
 
-	if (sh->batch_head && !uptodate && !replacement)
+	if (sh->batch_head && bi->bi_error && !replacement)
 		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
 
 	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
@@ -2514,6 +2511,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_CHANGE_PENDING, &mddev->flags);
 	printk(KERN_ALERT
 	       "md/raid:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid:%s: Operation continuing on %d devices.\n",
@@ -3066,7 +3064,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 static void
 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				struct stripe_head_state *s, int disks,
-				struct bio **return_bi)
+				struct bio_list *return_bi)
 {
 	int i;
 	BUG_ON(sh->batch_head);
@@ -3107,11 +3105,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		while (bi && bi->bi_iter.bi_sector <
 			sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+			bi->bi_error = -EIO;
 			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
-				bi->bi_next = *return_bi;
-				*return_bi = bi;
+				bio_list_add(return_bi, bi);
 			}
 			bi = nextbi;
 		}
@@ -3131,11 +3129,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		while (bi && bi->bi_iter.bi_sector <
 		       sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
-			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+			bi->bi_error = -EIO;
 			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
-				bi->bi_next = *return_bi;
-				*return_bi = bi;
+				bio_list_add(return_bi, bi);
 			}
 			bi = bi2;
 		}
@@ -3156,11 +3154,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			       sh->dev[i].sector + STRIPE_SECTORS) {
 				struct bio *nextbi =
 					r5_next_bio(bi, sh->dev[i].sector);
-				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (!raid5_dec_bi_active_stripes(bi)) {
-					bi->bi_next = *return_bi;
-					*return_bi = bi;
-				}
+
+				bi->bi_error = -EIO;
+				if (!raid5_dec_bi_active_stripes(bi))
+					bio_list_add(return_bi, bi);
 				bi = nextbi;
 			}
 		}
@@ -3439,7 +3436,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
  * never LOCKED, so we don't need to test 'failed' directly.
  */
 static void handle_stripe_clean_event(struct r5conf *conf,
-	struct stripe_head *sh, int disks, struct bio **return_bi)
+	struct stripe_head *sh, int disks, struct bio_list *return_bi)
 {
 	int i;
 	struct r5dev *dev;
@@ -3473,8 +3470,7 @@ returnbi:
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					if (!raid5_dec_bi_active_stripes(wbi)) {
 						md_write_end(conf->mddev);
-						wbi->bi_next = *return_bi;
-						*return_bi = wbi;
+						bio_list_add(return_bi, wbi);
 					}
 					wbi = wbi2;
 				}
@@ -4061,8 +4057,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 				 &first_bad, &bad_sectors))
 			set_bit(R5_ReadRepl, &dev->flags);
 		else {
-			if (rdev)
+			if (rdev && !test_bit(Faulty, &rdev->flags))
 				set_bit(R5_NeedReplace, &dev->flags);
+			else
+				clear_bit(R5_NeedReplace, &dev->flags);
 			rdev = rcu_dereference(conf->disks[i].rdev);
 			clear_bit(R5_ReadRepl, &dev->flags);
 		}
@@ -4605,7 +4603,15 @@ finish:
 			md_wakeup_thread(conf->mddev->thread);
 	}
 
-	return_io(s.return_bi);
+	if (!bio_list_empty(&s.return_bi)) {
+		if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
+			spin_lock_irq(&conf->device_lock);
+			bio_list_merge(&conf->return_bi, &s.return_bi);
+			spin_unlock_irq(&conf->device_lock);
+			md_wakeup_thread(conf->mddev->thread);
+		} else
+			return_io(&s.return_bi);
+	}
 
 	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
@@ -4662,43 +4668,14 @@ static int raid5_congested(struct mddev *mddev, int bits)
 	return 0;
 }
 
-/* We want read requests to align with chunks where possible,
- * but write requests don't need to.
- */
-static int raid5_mergeable_bvec(struct mddev *mddev,
-				struct bvec_merge_data *bvm,
-				struct bio_vec *biovec)
-{
-	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
-	int max;
-	unsigned int chunk_sectors = mddev->chunk_sectors;
-	unsigned int bio_sectors = bvm->bi_size >> 9;
-
-	/*
-	 * always allow writes to be mergeable, read as well if array
-	 * is degraded as we'll go through stripe cache anyway.
-	 */
-	if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
-		return biovec->bv_len;
-
-	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
-		chunk_sectors = mddev->new_chunk_sectors;
-	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
-	if (max < 0) max = 0;
-	if (max <= biovec->bv_len && bio_sectors == 0)
-		return biovec->bv_len;
-	else
-		return max;
-}
-
 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
 {
+	struct r5conf *conf = mddev->private;
 	sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
-	unsigned int chunk_sectors = mddev->chunk_sectors;
+	unsigned int chunk_sectors;
 	unsigned int bio_sectors = bio_sectors(bio);
 
-	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
-		chunk_sectors = mddev->new_chunk_sectors;
+	chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
 	return  chunk_sectors >=
 		((sector & (chunk_sectors - 1)) + bio_sectors);
 }
@@ -4749,13 +4726,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
 *  first).
 *  If the read failed..
 */
-static void raid5_align_endio(struct bio *bi, int error)
+static void raid5_align_endio(struct bio *bi)
 {
	struct bio* raid_bi  = bi->bi_private;
	struct mddev *mddev;
	struct r5conf *conf;
-	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
	struct md_rdev *rdev;
+	int error = bi->bi_error;
 
	bio_put(bi);
@@ -4766,10 +4743,10 @@ static void raid5_align_endio(struct bio *bi, int error)
 
	rdev_dec_pending(rdev, conf->mddev);
 
-	if (!error && uptodate) {
+	if (!error) {
		trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
					 raid_bi, 0);
-		bio_endio(raid_bi, 0);
+		bio_endio(raid_bi);
		if (atomic_dec_and_test(&conf->active_aligned_reads))
			wake_up(&conf->wait_for_quiescent);
		return;
@@ -4780,26 +4757,7 @@ static void raid5_align_endio(struct bio *bi, int error)
	add_bio_to_retry(raid_bi, conf);
 }
 
-static int bio_fits_rdev(struct bio *bi)
-{
-	struct request_queue *q = bdev_get_queue(bi->bi_bdev);
-
-	if (bio_sectors(bi) > queue_max_sectors(q))
-		return 0;
-	blk_recount_segments(q, bi);
-	if (bi->bi_phys_segments > queue_max_segments(q))
-		return 0;
-
-	if (q->merge_bvec_fn)
-		/* it's too hard to apply the merge_bvec_fn at this stage,
-		 * just just give up
-		 */
-		return 0;
-
-	return 1;
-}
-
-static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
+static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 {
	struct r5conf *conf = mddev->private;
	int dd_idx;
@@ -4808,7 +4766,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
	sector_t end_sector;
 
	if (!in_chunk_boundary(mddev, raid_bio)) {
-		pr_debug("chunk_aligned_read : non aligned\n");
+		pr_debug("%s: non aligned\n", __func__);
		return 0;
	}
	/*
@@ -4850,13 +4808,11 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
		rcu_read_unlock();
		raid_bio->bi_next = (void*)rdev;
		align_bi->bi_bdev =  rdev->bdev;
-		__clear_bit(BIO_SEG_VALID, &align_bi->bi_flags);
+		bio_clear_flag(align_bi, BIO_SEG_VALID);
 
-		if (!bio_fits_rdev(align_bi) ||
-		    is_badblock(rdev, align_bi->bi_iter.bi_sector,
+		if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
				bio_sectors(align_bi),
				&first_bad, &bad_sectors)) {
-			/* too big in some way, or has a known bad block */
			bio_put(align_bi);
			rdev_dec_pending(rdev, mddev);
			return 0;
@@ -4885,6 +4841,31 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
	}
 }
 
+static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
+{
+	struct bio *split;
+
+	do {
+		sector_t sector = raid_bio->bi_iter.bi_sector;
+		unsigned chunk_sects = mddev->chunk_sectors;
+		unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
+
+		if (sectors < bio_sectors(raid_bio)) {
+			split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
+			bio_chain(split, raid_bio);
+		} else
+			split = raid_bio;
+
+		if (!raid5_read_one_chunk(mddev, split)) {
+			if (split != raid_bio)
+				generic_make_request(raid_bio);
+			return split;
+		}
+	} while (split != raid_bio);
+
+	return NULL;
+}
+
 /* __get_priority_stripe - get the next stripe to process
 *
 * Full stripe writes are allowed to pass preread active stripes up until
@@ -5133,7 +5114,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
	remaining = raid5_dec_bi_active_stripes(bi);
	if (remaining == 0) {
		md_write_end(mddev);
-		bio_endio(bi, 0);
+		bio_endio(bi);
	}
 }
 
@@ -5162,9 +5143,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
	 * data on failed drives.
	 */
	if (rw == READ && mddev->degraded == 0 &&
-	     mddev->reshape_position == MaxSector &&
-	     chunk_aligned_read(mddev,bi))
-		return;
+	    mddev->reshape_position == MaxSector) {
+		bi = chunk_aligned_read(mddev, bi);
+		if (!bi)
+			return;
+	}
 
	if (unlikely(bi->bi_rw & REQ_DISCARD)) {
		make_discard_request(mddev, bi);
@@ -5297,7 +5280,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
			release_stripe_plug(mddev, sh);
		} else {
			/* cannot get stripe for read-ahead, just give-up */
-			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+			bi->bi_error = -EIO;
			break;
		}
	}
@@ -5311,7 +5294,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 
		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
					 bi, 0);
-		bio_endio(bi, 0);
+		bio_endio(bi);
	}
 }
 
@@ -5340,6 +5323,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
	sector_t stripe_addr;
	int reshape_sectors;
	struct list_head stripes;
+	sector_t retn;
 
	if (sector_nr == 0) {
		/* If restarting in the middle, skip the initial sectors */
@@ -5347,6 +5331,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
			sector_nr = raid5_size(mddev, 0, 0)
				- conf->reshape_progress;
+		} else if (mddev->reshape_backwards &&
+			   conf->reshape_progress == MaxSector) {
+			/* shouldn't happen, but just in case, finish up.*/
+			sector_nr = MaxSector;
		} else if (!mddev->reshape_backwards &&
			   conf->reshape_progress > 0)
			sector_nr = conf->reshape_progress;
@@ -5355,7 +5343,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
			mddev->curr_resync_completed = sector_nr;
			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
			*skipped = 1;
-			return sector_nr;
+			retn = sector_nr;
+			goto finish;
		}
	}
 
@@ -5363,10 +5352,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
	 * If old and new chunk sizes differ, we need to process the
	 * largest of these
	 */
-	if (mddev->new_chunk_sectors > mddev->chunk_sectors)
-		reshape_sectors = mddev->new_chunk_sectors;
-	else
-		reshape_sectors = mddev->chunk_sectors;
+
+	reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
 
	/* We update the metadata at least every 10 seconds, or when
	 * the data about to be copied would over-write the source of
@@ -5381,11 +5368,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
	safepos = conf->reshape_safe;
	sector_div(safepos, data_disks);
	if (mddev->reshape_backwards) {
-		writepos -= min_t(sector_t, reshape_sectors, writepos);
+		BUG_ON(writepos < reshape_sectors);
+		writepos -= reshape_sectors;
		readpos += reshape_sectors;
		safepos += reshape_sectors;
	} else {
		writepos += reshape_sectors;
+		/* readpos and safepos are worst-case calculations.
+		 * A negative number is overly pessimistic, and causes
+		 * obvious problems for unsigned storage.  So clip to 0.
+		 */
		readpos -= min_t(sector_t, reshape_sectors, readpos);
		safepos -= min_t(sector_t, reshape_sectors, safepos);
	}
@@ -5528,7 +5520,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
	 * then we need to write out the superblock.
	 */
	sector_nr += reshape_sectors;
-	if ((sector_nr - mddev->curr_resync_completed) * 2
+	retn = reshape_sectors;
+finish:
+	if (mddev->curr_resync_completed > mddev->resync_max ||
+	    (sector_nr - mddev->curr_resync_completed) * 2
	    >= mddev->resync_max - mddev->curr_resync_completed) {
		/* Cannot proceed until we've updated the superblock... */
		wait_event(conf->wait_for_overlap,
@@ -5553,7 +5548,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	}
 ret:
-	return reshape_sectors;
+	return retn;
 }
 
 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
@@ -5707,7 +5702,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
	if (remaining == 0) {
		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
					 raid_bio, 0);
-		bio_endio(raid_bio, 0);
+		bio_endio(raid_bio);
	}
	if (atomic_dec_and_test(&conf->active_aligned_reads))
		wake_up(&conf->wait_for_quiescent);
@@ -5809,6 +5804,18 @@ static void raid5d(struct md_thread *thread)
 
	md_check_recovery(mddev);
 
+	if (!bio_list_empty(&conf->return_bi) &&
+	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+		struct bio_list tmp = BIO_EMPTY_LIST;
+		spin_lock_irq(&conf->device_lock);
+		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+			bio_list_merge(&tmp, &conf->return_bi);
+			bio_list_init(&conf->return_bi);
+		}
+		spin_unlock_irq(&conf->device_lock);
+		return_io(&tmp);
+	}
+
	blk_start_plug(&plug);
	handled = 0;
	spin_lock_irq(&conf->device_lock);
@@ -5857,12 +5864,14 @@ static void raid5d(struct md_thread *thread)
	pr_debug("%d stripes handled\n", handled);
	spin_unlock_irq(&conf->device_lock);
 
-	if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+	if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
+	    mutex_trylock(&conf->cache_size_mutex)) {
		grow_one_stripe(conf, __GFP_NOWARN);
		/* Set flag even if allocation failed.  This helps
		 * slow down allocation requests when mem is short
		 */
		set_bit(R5_DID_ALLOC, &conf->cache_state);
+		mutex_unlock(&conf->cache_size_mutex);
	}
 
	async_tx_issue_pending_all();
@@ -5894,18 +5903,22 @@ raid5_set_cache_size(struct mddev *mddev, int size)
		return -EINVAL;
 
	conf->min_nr_stripes = size;
+	mutex_lock(&conf->cache_size_mutex);
	while (size < conf->max_nr_stripes &&
	       drop_one_stripe(conf))
		;
+	mutex_unlock(&conf->cache_size_mutex);
 
	err = md_allow_write(mddev);
	if (err)
		return err;
 
+	mutex_lock(&conf->cache_size_mutex);
	while (size > conf->max_nr_stripes)
		if (!grow_one_stripe(conf, GFP_KERNEL))
			break;
+	mutex_unlock(&conf->cache_size_mutex);
 
	return 0;
 }
@@ -6243,8 +6256,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
		/* size is defined by the smallest of previous and new size */
		raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
 
-	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
-	sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
+	sectors &= ~((sector_t)conf->chunk_sectors - 1);
+	sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
	return sectors * (raid_disks - conf->max_degraded);
 }
 
@@ -6371,11 +6384,19 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink,
				      struct shrink_control *sc)
 {
	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
-	int ret = 0;
-	while (ret < sc->nr_to_scan) {
-		if (drop_one_stripe(conf) == 0)
-			return SHRINK_STOP;
-		ret++;
+	unsigned long ret = SHRINK_STOP;
+
+	if (mutex_trylock(&conf->cache_size_mutex)) {
+		ret= 0;
+		while (ret < sc->nr_to_scan &&
+		       conf->max_nr_stripes > conf->min_nr_stripes) {
+			if (drop_one_stripe(conf) == 0) {
+				ret = SHRINK_STOP;
+				break;
+			}
+			ret++;
+		}
+		mutex_unlock(&conf->cache_size_mutex);
	}
	return ret;
 }
@@ -6444,6 +6465,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
		goto abort;
	spin_lock_init(&conf->device_lock);
	seqcount_init(&conf->gen_lock);
+	mutex_init(&conf->cache_size_mutex);
	init_waitqueue_head(&conf->wait_for_quiescent);
	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
		init_waitqueue_head(&conf->wait_for_stripe[i]);
@@ -6453,6 +6475,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
	INIT_LIST_HEAD(&conf->hold_list);
	INIT_LIST_HEAD(&conf->delayed_list);
	INIT_LIST_HEAD(&conf->bitmap_list);
+	bio_list_init(&conf->return_bi);
	init_llist_head(&conf->released_stripes);
	atomic_set(&conf->active_stripes, 0);
	atomic_set(&conf->preread_active_stripes, 0);
@@ -6542,6 +6565,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
	if (conf->reshape_progress != MaxSector) {
		conf->prev_chunk_sectors = mddev->chunk_sectors;
		conf->prev_algo = mddev->layout;
+	} else {
+		conf->prev_chunk_sectors = conf->chunk_sectors;
+		conf->prev_algo = conf->algorithm;
	}
 
	conf->min_nr_stripes = NR_STRIPES;
@@ -6661,6 +6687,8 @@ static int run(struct mddev *mddev)
		sector_t here_new, here_old;
		int old_disks;
		int max_degraded = (mddev->level == 6 ? 2 : 1);
+		int chunk_sectors;
+		int new_data_disks;
 
		if (mddev->new_level != mddev->level) {
			printk(KERN_ERR "md/raid:%s: unsupported reshape "
@@ -6672,28 +6700,25 @@ static int run(struct mddev *mddev)
		/* reshape_position must be on a new-stripe boundary, and one
		 * further up in new geometry must map after here in old
		 * geometry.
+		 * If the chunk sizes are different, then as we perform reshape
+		 * in units of the largest of the two, reshape_position needs
+		 * be a multiple of the largest chunk size times new data disks.
		 */
		here_new = mddev->reshape_position;
-		if (sector_div(here_new, mddev->new_chunk_sectors *
-			       (mddev->raid_disks - max_degraded))) {
+		chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
+		new_data_disks = mddev->raid_disks - max_degraded;
+		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
			printk(KERN_ERR "md/raid:%s: reshape_position not "
			       "on a stripe boundary\n", mdname(mddev));
			return -EINVAL;
		}
-		reshape_offset = here_new * mddev->new_chunk_sectors;
+		reshape_offset = here_new * chunk_sectors;
		/* here_new is the stripe we will write to */
		here_old = mddev->reshape_position;
-		sector_div(here_old, mddev->chunk_sectors *
-			   (old_disks-max_degraded));
+		sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
		/* here_old is the first stripe that we might need to read
		 * from */
		if (mddev->delta_disks == 0) {
-			if ((here_new * mddev->new_chunk_sectors !=
-			     here_old * mddev->chunk_sectors)) {
-				printk(KERN_ERR "md/raid:%s: reshape position is"
-				       " confused - aborting\n", mdname(mddev));
-				return -EINVAL;
-			}
			/* We cannot be sure it is safe to start an in-place
			 * reshape.  It is only safe if user-space is monitoring
			 * and taking constant backups.
@@ -6712,10 +6737,10 @@ static int run(struct mddev *mddev)
				return -EINVAL;
			}
		} else if (mddev->reshape_backwards
-		    ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
-		       here_old * mddev->chunk_sectors)
-		    : (here_new * mddev->new_chunk_sectors >=
-		       here_old * mddev->chunk_sectors + (-min_offset_diff))) {
+		    ? (here_new * chunk_sectors + min_offset_diff <=
+		       here_old * chunk_sectors)
+		    : (here_new * chunk_sectors >=
+		       here_old * chunk_sectors + (-min_offset_diff))) {
			/* Reading from the same stripe as writing to - bad */
			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
			       "auto-recovery - aborting.\n",
@@ -6967,7 +6992,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
	int i;
 
	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
-		mddev->chunk_sectors / 2, mddev->layout);
+		conf->chunk_sectors / 2, mddev->layout);
	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
	for (i = 0; i < conf->raid_disks; i++)
		seq_printf (seq, "%s",
@@ -7173,7 +7198,9 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
	 * worth it.
	 */
	sector_t newsize;
-	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
+	struct r5conf *conf = mddev->private;
+
+	sectors &= ~((sector_t)conf->chunk_sectors - 1);
	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
	if (mddev->external_size &&
	    mddev->array_sectors > newsize)
@@ -7412,6 +7439,7 @@ static void end_reshape(struct r5conf *conf)
			rdev->data_offset = rdev->new_data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
+		conf->mddev->reshape_position = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_for_overlap);
@@ -7757,7 +7785,6 @@ static struct md_personality raid6_personality =
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
	.congested	= raid5_congested,
-	.mergeable_bvec	= raid5_mergeable_bvec,
 };
 static struct md_personality raid5_personality =
 {
@@ -7781,7 +7808,6 @@ static struct md_personality raid5_personality =
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
	.congested	= raid5_congested,
-	.mergeable_bvec	= raid5_mergeable_bvec,
 };
 
 static struct md_personality raid4_personality =
@@ -7806,7 +7832,6 @@ static struct md_personality raid4_personality =
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
	.congested	= raid5_congested,
-	.mergeable_bvec	= raid5_mergeable_bvec,
 };
 
 static int __init raid5_init(void)
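
The recurring pattern in this diff is the replacement of the hand-rolled bi_next singly linked completion list with the block layer's struct bio_list helpers (bio_list_init, bio_list_add, bio_list_pop, bio_list_merge), plus the switch from BIO_UPTODATE/error arguments to bio->bi_error. The sketch below is a minimal userspace model of that conversion, assuming nothing beyond standard C: the struct and helper names deliberately mirror the kernel API, but they are simplified stand-ins written for illustration, not the kernel implementation.

/*
 * Illustrative userspace model of the bio_list conversion in this diff.
 * The names mirror the kernel's <linux/bio.h> API, but these are
 * simplified stand-ins, not the kernel code.
 */
#include <stdio.h>

struct bio {
	int		bi_error;	/* stand-in for the new bi_error field */
	struct bio	*bi_next;	/* singly linked chain, as in the kernel */
};

struct bio_list {
	struct bio *head;
	struct bio *tail;
};

#define BIO_EMPTY_LIST	{ NULL, NULL }

/* Append a bio at the tail, preserving completion order. */
static void bio_list_add(struct bio_list *bl, struct bio *bio)
{
	bio->bi_next = NULL;
	if (bl->tail)
		bl->tail->bi_next = bio;
	else
		bl->head = bio;
	bl->tail = bio;
}

/* Detach and return the head of the list, or NULL when empty. */
static struct bio *bio_list_pop(struct bio_list *bl)
{
	struct bio *bio = bl->head;

	if (bio) {
		bl->head = bio->bi_next;
		if (!bl->head)
			bl->tail = NULL;
		bio->bi_next = NULL;
	}
	return bio;
}

/* Shape of the new return_io(): pop-and-complete instead of walking bi_next. */
static void return_io(struct bio_list *return_bi)
{
	struct bio *bi;

	while ((bi = bio_list_pop(return_bi)) != NULL)
		printf("completing bio %p, error %d\n", (void *)bi, bi->bi_error);
}

int main(void)
{
	struct bio_list return_bi = BIO_EMPTY_LIST;
	struct bio bios[3] = { { 0 }, { -5 /* -EIO */ }, { 0 } };

	for (int i = 0; i < 3; i++)
		bio_list_add(&return_bi, &bios[i]);

	return_io(&return_bi);
	return 0;
}

A head/tail list keeps appends O(1) and preserves completion order, which matters once handle_stripe() starts deferring completions onto conf->return_bi for raid5d to drain after the pending superblock update, as the hunks at finish: and in raid5d() above show.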