Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c     7
-rw-r--r--  drivers/md/dm-crypt.c  12
-rw-r--r--  drivers/md/dm-ioctl.c  17
-rw-r--r--  drivers/md/dm-mpath.c   4
-rw-r--r--  drivers/md/dm-table.c  16
-rw-r--r--  drivers/md/dm.c        43
-rw-r--r--  drivers/md/md.c        32
-rw-r--r--  drivers/md/raid0.c      9
-rw-r--r--  drivers/md/raid10.c     1
-rw-r--r--  drivers/md/raid5.c    270
-rw-r--r--  drivers/md/raid5.h      5
11 files changed, 247 insertions, 169 deletions
| diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2bc56e2a3526..135a0907e9de 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -177,11 +177,16 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde  	 * nr_pending is 0 and In_sync is clear, the entries we return will  	 * still be in the same position on the list when we re-enter  	 * list_for_each_entry_continue_rcu. +	 * +	 * Note that if entered with 'rdev == NULL' to start at the +	 * beginning, we temporarily assign 'rdev' to an address which +	 * isn't really an rdev, but which can be used by +	 * list_for_each_entry_continue_rcu() to find the first entry.  	 */  	rcu_read_lock();  	if (rdev == NULL)  		/* start at the beginning */ -		rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set); +		rdev = list_entry(&mddev->disks, struct md_rdev, same_set);  	else {  		/* release the previous rdev and start from there. */  		rdev_dec_pending(rdev, mddev); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9eeea196328a..5503e43e5f28 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -925,10 +925,11 @@ static int crypt_convert(struct crypt_config *cc,  		switch (r) {  		/* async */ -		case -EINPROGRESS:  		case -EBUSY:  			wait_for_completion(&ctx->restart);  			reinit_completion(&ctx->restart); +			/* fall through*/ +		case -EINPROGRESS:  			ctx->req = NULL;  			ctx->cc_sector++;  			continue; @@ -1345,8 +1346,10 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,  	struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);  	struct crypt_config *cc = io->cc; -	if (error == -EINPROGRESS) +	if (error == -EINPROGRESS) { +		complete(&ctx->restart);  		return; +	}  	if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)  		error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); @@ -1357,15 +1360,12 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,  	crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);  	if (!atomic_dec_and_test(&ctx->cc_pending)) -		goto done; +		return;  	if (bio_data_dir(io->base_bio) == READ)  		kcryptd_crypt_read_done(io);  	else  		kcryptd_crypt_write_io_submit(io, 1); -done: -	if (!completion_done(&ctx->restart)) -		complete(&ctx->restart);  }  static void kcryptd_crypt(struct work_struct *work) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index c8a18e4ee9dc..720ceeb7fa9b 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1298,21 +1298,22 @@ static int table_load(struct dm_ioctl *param, size_t param_size)  		goto err_unlock_md_type;  	} -	if (dm_get_md_type(md) == DM_TYPE_NONE) +	if (dm_get_md_type(md) == DM_TYPE_NONE) {  		/* Initial table load: acquire type of table. 
*/  		dm_set_md_type(md, dm_table_get_type(t)); -	else if (dm_get_md_type(md) != dm_table_get_type(t)) { + +		/* setup md->queue to reflect md's type (may block) */ +		r = dm_setup_md_queue(md); +		if (r) { +			DMWARN("unable to set up device queue for new table."); +			goto err_unlock_md_type; +		} +	} else if (dm_get_md_type(md) != dm_table_get_type(t)) {  		DMWARN("can't change device type after initial table load.");  		r = -EINVAL;  		goto err_unlock_md_type;  	} -	/* setup md->queue to reflect md's type (may block) */ -	r = dm_setup_md_queue(md); -	if (r) { -		DMWARN("unable to set up device queue for new table."); -		goto err_unlock_md_type; -	}  	dm_unlock_md_type(md);  	/* stage inactive table */ diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 63953477a07c..eff7bdd7731d 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -429,9 +429,11 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,  		/* blk-mq request-based interface */  		*__clone = blk_get_request(bdev_get_queue(bdev),  					   rq_data_dir(rq), GFP_ATOMIC); -		if (IS_ERR(*__clone)) +		if (IS_ERR(*__clone)) {  			/* ENOMEM, requeue */ +			clear_mapinfo(m, map_context);  			return r; +		}  		(*__clone)->bio = (*__clone)->biotail = NULL;  		(*__clone)->rq_disk = bdev->bd_disk;  		(*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index d9b00b8565c6..16ba55ad7089 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -820,6 +820,12 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args)  }  EXPORT_SYMBOL(dm_consume_args); +static bool __table_type_request_based(unsigned table_type) +{ +	return (table_type == DM_TYPE_REQUEST_BASED || +		table_type == DM_TYPE_MQ_REQUEST_BASED); +} +  static int dm_table_set_type(struct dm_table *t)  {  	unsigned i; @@ -852,8 +858,7 @@ static int dm_table_set_type(struct dm_table *t)  		 * Determine the type from the live device.  		 * Default to bio-based if device is new.  		 
*/ -		if (live_md_type == DM_TYPE_REQUEST_BASED || -		    live_md_type == DM_TYPE_MQ_REQUEST_BASED) +		if (__table_type_request_based(live_md_type))  			request_based = 1;  		else  			bio_based = 1; @@ -903,7 +908,7 @@ static int dm_table_set_type(struct dm_table *t)  			}  		t->type = DM_TYPE_MQ_REQUEST_BASED; -	} else if (hybrid && list_empty(devices) && live_md_type != DM_TYPE_NONE) { +	} else if (list_empty(devices) && __table_type_request_based(live_md_type)) {  		/* inherit live MD type */  		t->type = live_md_type; @@ -925,10 +930,7 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)  bool dm_table_request_based(struct dm_table *t)  { -	unsigned table_type = dm_table_get_type(t); - -	return (table_type == DM_TYPE_REQUEST_BASED || -		table_type == DM_TYPE_MQ_REQUEST_BASED); +	return __table_type_request_based(dm_table_get_type(t));  }  bool dm_table_mq_request_based(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f8c7ca3e8947..2caf492890d6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1089,11 +1089,17 @@ static void free_rq_clone(struct request *clone)  	blk_rq_unprep_clone(clone); -	if (clone->q->mq_ops) +	if (md->type == DM_TYPE_MQ_REQUEST_BASED) +		/* stacked on blk-mq queue(s) */  		tio->ti->type->release_clone_rq(clone);  	else if (!md->queue->mq_ops)  		/* request_fn queue stacked on request_fn queue(s) */  		free_clone_request(md, clone); +	/* +	 * NOTE: for the blk-mq queue stacked on request_fn queue(s) case: +	 * no need to call free_clone_request() because we leverage blk-mq by +	 * allocating the clone at the end of the blk-mq pdu (see: clone_rq) +	 */  	if (!md->queue->mq_ops)  		free_rq_tio(tio); @@ -1156,6 +1162,7 @@ static void old_requeue_request(struct request *rq)  	spin_lock_irqsave(q->queue_lock, flags);  	blk_requeue_request(q, rq); +	blk_run_queue_async(q);  	spin_unlock_irqrestore(q->queue_lock, flags);  } @@ -1716,8 +1723,7 @@ static int dm_merge_bvec(struct request_queue *q,  	struct mapped_device *md = q->queuedata;  	struct dm_table *map = dm_get_live_table_fast(md);  	struct dm_target *ti; -	sector_t max_sectors; -	int max_size = 0; +	sector_t max_sectors, max_size = 0;  	if (unlikely(!map))  		goto out; @@ -1732,8 +1738,16 @@ static int dm_merge_bvec(struct request_queue *q,  	max_sectors = min(max_io_len(bvm->bi_sector, ti),  			  (sector_t) queue_max_sectors(q));  	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; -	if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */ -		max_size = 0; + +	/* +	 * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t +	 * to the targets' merge function since it holds sectors not bytes). +	 * Just doing this as an interim fix for stable@ because the more +	 * comprehensive cleanup of switching to sector_t will impact every +	 * DM target that implements a ->merge hook. 
+	 */ +	if (max_size > INT_MAX) +		max_size = INT_MAX;  	/*  	 * merge_bvec_fn() returns number of bytes @@ -1741,7 +1755,7 @@ static int dm_merge_bvec(struct request_queue *q,  	 * max is precomputed maximal io size  	 */  	if (max_size && ti->type->merge) -		max_size = ti->type->merge(ti, bvm, biovec, max_size); +		max_size = ti->type->merge(ti, bvm, biovec, (int) max_size);  	/*  	 * If the target doesn't support merge method and some of the devices  	 * provided their merge_bvec method (we know this by looking for the @@ -1963,8 +1977,8 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,  			dm_kill_unmapped_request(rq, r);  			return r;  		} -		if (IS_ERR(clone)) -			return DM_MAPIO_REQUEUE; +		if (r != DM_MAPIO_REMAPPED) +			return r;  		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {  			/* -ENOMEM */  			ti->type->release_clone_rq(clone); @@ -2662,9 +2676,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)  {  	struct request_queue *q = NULL; -	if (md->queue->elevator) -		return 0; -  	/* Fully initialize the queue */  	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);  	if (!q) @@ -2748,13 +2759,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,  	if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {  		/* clone request is allocated at the end of the pdu */  		tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io); -		if (!clone_rq(rq, md, tio, GFP_ATOMIC)) -			return BLK_MQ_RQ_QUEUE_BUSY; +		(void) clone_rq(rq, md, tio, GFP_ATOMIC);  		queue_kthread_work(&md->kworker, &tio->work);  	} else {  		/* Direct call is fine since .queue_rq allows allocations */ -		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) -			dm_requeue_unmapped_original_request(md, rq); +		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { +			/* Undo dm_start_request() before requeuing */ +			rq_completed(md, rq_data_dir(rq), false); +			return BLK_MQ_RQ_QUEUE_BUSY; +		}  	}  	return BLK_MQ_RQ_QUEUE_OK; diff --git a/drivers/md/md.c b/drivers/md/md.c index d4f31e195e26..4dbed4a67aaf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3834,7 +3834,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)  				err = -EBUSY;  		}  		spin_unlock(&mddev->lock); -		return err; +		return err ?: len;  	}  	err = mddev_lock(mddev);  	if (err) @@ -4211,34 +4211,36 @@ action_store(struct mddev *mddev, const char *page, size_t len)  	if (!mddev->pers || !mddev->pers->sync_request)  		return -EINVAL; -	if (cmd_match(page, "frozen")) -		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); -	else -		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);  	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { -		flush_workqueue(md_misc_wq); -		if (mddev->sync_thread) { -			set_bit(MD_RECOVERY_INTR, &mddev->recovery); -			if (mddev_lock(mddev) == 0) { +		if (cmd_match(page, "frozen")) +			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +		else +			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && +		    mddev_lock(mddev) == 0) { +			flush_workqueue(md_misc_wq); +			if (mddev->sync_thread) { +				set_bit(MD_RECOVERY_INTR, &mddev->recovery);  				md_reap_sync_thread(mddev); -				mddev_unlock(mddev);  			} +			mddev_unlock(mddev);  		}  	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||  		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))  		return -EBUSY;  	else if (cmd_match(page, "resync")) -		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +		
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);  	else if (cmd_match(page, "recover")) { +		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); -		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);  	} else if (cmd_match(page, "reshape")) {  		int err;  		if (mddev->pers->start_reshape == NULL)  			return -EINVAL;  		err = mddev_lock(mddev);  		if (!err) { +			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);  			err = mddev->pers->start_reshape(mddev);  			mddev_unlock(mddev);  		} @@ -4250,6 +4252,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)  			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);  		else if (!cmd_match(page, "repair"))  			return -EINVAL; +		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);  		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);  		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);  	} @@ -4818,12 +4821,12 @@ static void md_free(struct kobject *ko)  	if (mddev->sysfs_state)  		sysfs_put(mddev->sysfs_state); +	if (mddev->queue) +		blk_cleanup_queue(mddev->queue);  	if (mddev->gendisk) {  		del_gendisk(mddev->gendisk);  		put_disk(mddev->gendisk);  	} -	if (mddev->queue) -		blk_cleanup_queue(mddev->queue);  	kfree(mddev);  } @@ -8259,6 +8262,7 @@ void md_reap_sync_thread(struct mddev *mddev)  	if (mddev_is_clustered(mddev))  		md_cluster_ops->metadata_update_finish(mddev);  	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); +	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);  	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);  	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);  	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 2cb59a641cd2..efb654eb5399 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -188,8 +188,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)  		}  		dev[j] = rdev1; -		disk_stack_limits(mddev->gendisk, rdev1->bdev, -				  rdev1->data_offset << 9); +		if (mddev->queue) +			disk_stack_limits(mddev->gendisk, rdev1->bdev, +					  rdev1->data_offset << 9);  		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)  			conf->has_merge_bvec = 1; @@ -523,6 +524,9 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)  			 ? 
(sector & (chunk_sects-1))  			 : sector_div(sector, chunk_sects)); +		/* Restore due to sector_div */ +		sector = bio->bi_iter.bi_sector; +  		if (sectors < bio_sectors(bio)) {  			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);  			bio_chain(split, bio); @@ -530,7 +534,6 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)  			split = bio;  		} -		sector = bio->bi_iter.bi_sector;  		zone = find_zone(mddev->private, §or);  		tmp_dev = map_sector(mddev, zone, sector, §or);  		split->bi_bdev = tmp_dev->bdev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e793ab6b3570..f55c3f35b746 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4156,6 +4156,7 @@ static int raid10_start_reshape(struct mddev *mddev)  	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);  	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); +	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);  	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);  	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 77dfd720aaa0..b6793d2e051f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -749,6 +749,7 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)  static bool stripe_can_batch(struct stripe_head *sh)  {  	return test_bit(STRIPE_BATCH_READY, &sh->state) && +		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&  		is_full_stripe_write(sh);  } @@ -837,6 +838,15 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh  		    < IO_THRESHOLD)  			md_wakeup_thread(conf->mddev->thread); +	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { +		int seq = sh->bm_seq; +		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && +		    sh->batch_head->bm_seq > seq) +			seq = sh->batch_head->bm_seq; +		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); +		sh->batch_head->bm_seq = seq; +	} +  	atomic_inc(&sh->count);  unlock_out:  	unlock_two_stripes(head, sh); @@ -1078,9 +1088,6 @@ again:  			pr_debug("skip op %ld on disc %d for sector %llu\n",  				bi->bi_rw, i, (unsigned long long)sh->sector);  			clear_bit(R5_LOCKED, &sh->dev[i].flags); -			if (sh->batch_head) -				set_bit(STRIPE_BATCH_ERR, -					&sh->batch_head->state);  			set_bit(STRIPE_HANDLE, &sh->state);  		} @@ -1825,7 +1832,7 @@ again:  	} else  		init_async_submit(&submit, 0, tx, NULL, NULL,  				  to_addr_conv(sh, percpu, j)); -	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit); +	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);  	if (!last_stripe) {  		j++;  		sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -1971,17 +1978,30 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)  	put_cpu();  } +static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp) +{ +	struct stripe_head *sh; + +	sh = kmem_cache_zalloc(sc, gfp); +	if (sh) { +		spin_lock_init(&sh->stripe_lock); +		spin_lock_init(&sh->batch_lock); +		INIT_LIST_HEAD(&sh->batch_list); +		INIT_LIST_HEAD(&sh->lru); +		atomic_set(&sh->count, 1); +	} +	return sh; +}  static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)  {  	struct stripe_head *sh; -	sh = kmem_cache_zalloc(conf->slab_cache, gfp); + +	sh = alloc_stripe(conf->slab_cache, gfp);  	if (!sh)  		return 0;  	sh->raid_conf = conf; -	spin_lock_init(&sh->stripe_lock); -  	if (grow_buffers(sh, gfp)) {  		shrink_buffers(sh);  		kmem_cache_free(conf->slab_cache, sh); @@ -1990,13 +2010,8 @@ static int grow_one_stripe(struct r5conf *conf, 
gfp_t gfp)  	sh->hash_lock_index =  		conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;  	/* we just created an active stripe so... */ -	atomic_set(&sh->count, 1);  	atomic_inc(&conf->active_stripes); -	INIT_LIST_HEAD(&sh->lru); -	spin_lock_init(&sh->batch_lock); -	INIT_LIST_HEAD(&sh->batch_list); -	sh->batch_head = NULL;  	release_stripe(sh);  	conf->max_nr_stripes++;  	return 1; @@ -2060,6 +2075,35 @@ static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)  	return ret;  } +static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) +{ +	unsigned long cpu; +	int err = 0; + +	mddev_suspend(conf->mddev); +	get_online_cpus(); +	for_each_present_cpu(cpu) { +		struct raid5_percpu *percpu; +		struct flex_array *scribble; + +		percpu = per_cpu_ptr(conf->percpu, cpu); +		scribble = scribble_alloc(new_disks, +					  new_sectors / STRIPE_SECTORS, +					  GFP_NOIO); + +		if (scribble) { +			flex_array_free(percpu->scribble); +			percpu->scribble = scribble; +		} else { +			err = -ENOMEM; +			break; +		} +	} +	put_online_cpus(); +	mddev_resume(conf->mddev); +	return err; +} +  static int resize_stripes(struct r5conf *conf, int newsize)  {  	/* Make all the stripes able to hold 'newsize' devices. @@ -2088,7 +2132,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)  	struct stripe_head *osh, *nsh;  	LIST_HEAD(newstripes);  	struct disk_info *ndisks; -	unsigned long cpu;  	int err;  	struct kmem_cache *sc;  	int i; @@ -2109,13 +2152,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)  		return -ENOMEM;  	for (i = conf->max_nr_stripes; i; i--) { -		nsh = kmem_cache_zalloc(sc, GFP_KERNEL); +		nsh = alloc_stripe(sc, GFP_KERNEL);  		if (!nsh)  			break;  		nsh->raid_conf = conf; -		spin_lock_init(&nsh->stripe_lock); -  		list_add(&nsh->lru, &newstripes);  	}  	if (i) { @@ -2142,13 +2183,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)  				    lock_device_hash_lock(conf, hash));  		osh = get_free_stripe(conf, hash);  		unlock_device_hash_lock(conf, hash); -		atomic_set(&nsh->count, 1); +  		for(i=0; i<conf->pool_size; i++) {  			nsh->dev[i].page = osh->dev[i].page;  			nsh->dev[i].orig_page = osh->dev[i].page;  		} -		for( ; i<newsize; i++) -			nsh->dev[i].page = NULL;  		nsh->hash_lock_index = hash;  		kmem_cache_free(conf->slab_cache, osh);  		cnt++; @@ -2174,25 +2213,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)  	} else  		err = -ENOMEM; -	get_online_cpus(); -	for_each_present_cpu(cpu) { -		struct raid5_percpu *percpu; -		struct flex_array *scribble; - -		percpu = per_cpu_ptr(conf->percpu, cpu); -		scribble = scribble_alloc(newsize, conf->chunk_sectors / -			STRIPE_SECTORS, GFP_NOIO); - -		if (scribble) { -			flex_array_free(percpu->scribble); -			percpu->scribble = scribble; -		} else { -			err = -ENOMEM; -			break; -		} -	} -	put_online_cpus(); -  	/* Step 4, return new stripes to service */  	while(!list_empty(&newstripes)) {  		nsh = list_entry(newstripes.next, struct stripe_head, lru); @@ -2212,7 +2232,8 @@ static int resize_stripes(struct r5conf *conf, int newsize)  	conf->slab_cache = sc;  	conf->active_name = 1-conf->active_name; -	conf->pool_size = newsize; +	if (!err) +		conf->pool_size = newsize;  	return err;  } @@ -2434,7 +2455,7 @@ static void raid5_end_write_request(struct bio *bi, int error)  	}  	rdev_dec_pending(rdev, conf->mddev); -	if (sh->batch_head && !uptodate) +	if (sh->batch_head && !uptodate && !replacement)  		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);  	if 
(!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) @@ -2976,14 +2997,32 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,  	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",  		(unsigned long long)(*bip)->bi_iter.bi_sector,  		(unsigned long long)sh->sector, dd_idx); -	spin_unlock_irq(&sh->stripe_lock);  	if (conf->mddev->bitmap && firstwrite) { +		/* Cannot hold spinlock over bitmap_startwrite, +		 * but must ensure this isn't added to a batch until +		 * we have added to the bitmap and set bm_seq. +		 * So set STRIPE_BITMAP_PENDING to prevent +		 * batching. +		 * If multiple add_stripe_bio() calls race here they +		 * much all set STRIPE_BITMAP_PENDING.  So only the first one +		 * to complete "bitmap_startwrite" gets to set +		 * STRIPE_BIT_DELAY.  This is important as once a stripe +		 * is added to a batch, STRIPE_BIT_DELAY cannot be changed +		 * any more. +		 */ +		set_bit(STRIPE_BITMAP_PENDING, &sh->state); +		spin_unlock_irq(&sh->stripe_lock);  		bitmap_startwrite(conf->mddev->bitmap, sh->sector,  				  STRIPE_SECTORS, 0); -		sh->bm_seq = conf->seq_flush+1; -		set_bit(STRIPE_BIT_DELAY, &sh->state); +		spin_lock_irq(&sh->stripe_lock); +		clear_bit(STRIPE_BITMAP_PENDING, &sh->state); +		if (!sh->batch_head) { +			sh->bm_seq = conf->seq_flush+1; +			set_bit(STRIPE_BIT_DELAY, &sh->state); +		}  	} +	spin_unlock_irq(&sh->stripe_lock);  	if (stripe_can_batch(sh))  		stripe_add_to_batch_list(conf, sh); @@ -3278,7 +3317,9 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,  		/* reconstruct-write isn't being forced */  		return 0;  	for (i = 0; i < s->failed; i++) { -		if (!test_bit(R5_UPTODATE, &fdev[i]->flags) && +		if (s->failed_num[i] != sh->pd_idx && +		    s->failed_num[i] != sh->qd_idx && +		    !test_bit(R5_UPTODATE, &fdev[i]->flags) &&  		    !test_bit(R5_OVERWRITE, &fdev[i]->flags))  			return 1;  	} @@ -3298,6 +3339,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,  		 */  		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));  		BUG_ON(test_bit(R5_Wantread, &dev->flags)); +		BUG_ON(sh->batch_head);  		if ((s->uptodate == disks - 1) &&  		    (s->failed && (disk_idx == s->failed_num[0] ||  				   disk_idx == s->failed_num[1]))) { @@ -3366,7 +3408,6 @@ static void handle_stripe_fill(struct stripe_head *sh,  {  	int i; -	BUG_ON(sh->batch_head);  	/* look for blocks to read/compute, skip this if a compute  	 * is already in flight, or if the stripe contents are in the  	 * midst of changing due to a write @@ -3379,6 +3420,8 @@ static void handle_stripe_fill(struct stripe_head *sh,  	set_bit(STRIPE_HANDLE, &sh->state);  } +static void break_stripe_batch_list(struct stripe_head *head_sh, +				    unsigned long handle_flags);  /* handle_stripe_clean_event   * any written block on an uptodate or failed drive can be returned.   
* Note that if we 'wrote' to a failed drive, it will be UPTODATE, but @@ -3392,7 +3435,6 @@ static void handle_stripe_clean_event(struct r5conf *conf,  	int discard_pending = 0;  	struct stripe_head *head_sh = sh;  	bool do_endio = false; -	int wakeup_nr = 0;  	for (i = disks; i--; )  		if (sh->dev[i].written) { @@ -3481,44 +3523,8 @@ unhash:  		if (atomic_dec_and_test(&conf->pending_full_writes))  			md_wakeup_thread(conf->mddev->thread); -	if (!head_sh->batch_head || !do_endio) -		return; -	for (i = 0; i < head_sh->disks; i++) { -		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) -			wakeup_nr++; -	} -	while (!list_empty(&head_sh->batch_list)) { -		int i; -		sh = list_first_entry(&head_sh->batch_list, -				      struct stripe_head, batch_list); -		list_del_init(&sh->batch_list); - -		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, -			      head_sh->state & ~((1 << STRIPE_ACTIVE) | -						 (1 << STRIPE_PREREAD_ACTIVE) | -						 STRIPE_EXPAND_SYNC_FLAG)); -		sh->check_state = head_sh->check_state; -		sh->reconstruct_state = head_sh->reconstruct_state; -		for (i = 0; i < sh->disks; i++) { -			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) -				wakeup_nr++; -			sh->dev[i].flags = head_sh->dev[i].flags; -		} - -		spin_lock_irq(&sh->stripe_lock); -		sh->batch_head = NULL; -		spin_unlock_irq(&sh->stripe_lock); -		if (sh->state & STRIPE_EXPAND_SYNC_FLAG) -			set_bit(STRIPE_HANDLE, &sh->state); -		release_stripe(sh); -	} - -	spin_lock_irq(&head_sh->stripe_lock); -	head_sh->batch_head = NULL; -	spin_unlock_irq(&head_sh->stripe_lock); -	wake_up_nr(&conf->wait_for_overlap, wakeup_nr); -	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG) -		set_bit(STRIPE_HANDLE, &head_sh->state); +	if (head_sh->batch_head && do_endio) +		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);  }  static void handle_stripe_dirtying(struct r5conf *conf, @@ -4159,9 +4165,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)  static int clear_batch_ready(struct stripe_head *sh)  { +	/* Return '1' if this is a member of batch, or +	 * '0' if it is a lone stripe or a head which can now be +	 * handled. 
+	 */  	struct stripe_head *tmp;  	if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) -		return 0; +		return (sh->batch_head && sh->batch_head != sh);  	spin_lock(&sh->stripe_lock);  	if (!sh->batch_head) {  		spin_unlock(&sh->stripe_lock); @@ -4189,46 +4199,65 @@ static int clear_batch_ready(struct stripe_head *sh)  	return 0;  } -static void check_break_stripe_batch_list(struct stripe_head *sh) +static void break_stripe_batch_list(struct stripe_head *head_sh, +				    unsigned long handle_flags)  { -	struct stripe_head *head_sh, *next; +	struct stripe_head *sh, *next;  	int i; +	int do_wakeup = 0; -	if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) -		return; +	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { -	head_sh = sh; -	do { -		sh = list_first_entry(&sh->batch_list, -				      struct stripe_head, batch_list); -		BUG_ON(sh == head_sh); -	} while (!test_bit(STRIPE_DEGRADED, &sh->state)); - -	while (sh != head_sh) { -		next = list_first_entry(&sh->batch_list, -					struct stripe_head, batch_list);  		list_del_init(&sh->batch_list); -		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, -			      head_sh->state & ~((1 << STRIPE_ACTIVE) | -						 (1 << STRIPE_PREREAD_ACTIVE) | -						 (1 << STRIPE_DEGRADED) | -						 STRIPE_EXPAND_SYNC_FLAG)); +		WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | +					  (1 << STRIPE_SYNCING) | +					  (1 << STRIPE_REPLACED) | +					  (1 << STRIPE_PREREAD_ACTIVE) | +					  (1 << STRIPE_DELAYED) | +					  (1 << STRIPE_BIT_DELAY) | +					  (1 << STRIPE_FULL_WRITE) | +					  (1 << STRIPE_BIOFILL_RUN) | +					  (1 << STRIPE_COMPUTE_RUN)  | +					  (1 << STRIPE_OPS_REQ_PENDING) | +					  (1 << STRIPE_DISCARD) | +					  (1 << STRIPE_BATCH_READY) | +					  (1 << STRIPE_BATCH_ERR) | +					  (1 << STRIPE_BITMAP_PENDING))); +		WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | +					      (1 << STRIPE_REPLACED))); + +		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | +					    (1 << STRIPE_DEGRADED)), +			      head_sh->state & (1 << STRIPE_INSYNC)); +  		sh->check_state = head_sh->check_state;  		sh->reconstruct_state = head_sh->reconstruct_state; -		for (i = 0; i < sh->disks; i++) +		for (i = 0; i < sh->disks; i++) { +			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) +				do_wakeup = 1;  			sh->dev[i].flags = head_sh->dev[i].flags &  				(~((1 << R5_WriteError) | (1 << R5_Overlap))); - +		}  		spin_lock_irq(&sh->stripe_lock);  		sh->batch_head = NULL;  		spin_unlock_irq(&sh->stripe_lock); - -		set_bit(STRIPE_HANDLE, &sh->state); +		if (handle_flags == 0 || +		    sh->state & handle_flags) +			set_bit(STRIPE_HANDLE, &sh->state);  		release_stripe(sh); - -		sh = next;  	} +	spin_lock_irq(&head_sh->stripe_lock); +	head_sh->batch_head = NULL; +	spin_unlock_irq(&head_sh->stripe_lock); +	for (i = 0; i < head_sh->disks; i++) +		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) +			do_wakeup = 1; +	if (head_sh->state & handle_flags) +		set_bit(STRIPE_HANDLE, &head_sh->state); + +	if (do_wakeup) +		wake_up(&head_sh->raid_conf->wait_for_overlap);  }  static void handle_stripe(struct stripe_head *sh) @@ -4253,7 +4282,8 @@ static void handle_stripe(struct stripe_head *sh)  		return;  	} -	check_break_stripe_batch_list(sh); +	if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) +		break_stripe_batch_list(sh, 0);  	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {  		spin_lock(&sh->stripe_lock); @@ -4307,6 +4337,7 @@ static void handle_stripe(struct stripe_head *sh)  	if (s.failed > 
conf->max_degraded) {  		sh->check_state = 0;  		sh->reconstruct_state = 0; +		break_stripe_batch_list(sh, 0);  		if (s.to_read+s.to_write+s.written)  			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);  		if (s.syncing + s.replacing) @@ -6221,8 +6252,11 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu  		percpu->spare_page = alloc_page(GFP_KERNEL);  	if (!percpu->scribble)  		percpu->scribble = scribble_alloc(max(conf->raid_disks, -			conf->previous_raid_disks), conf->chunk_sectors / -			STRIPE_SECTORS, GFP_KERNEL); +						      conf->previous_raid_disks), +						  max(conf->chunk_sectors, +						      conf->prev_chunk_sectors) +						   / STRIPE_SECTORS, +						  GFP_KERNEL);  	if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {  		free_scratch_buffer(conf, percpu); @@ -7198,6 +7232,15 @@ static int check_reshape(struct mddev *mddev)  	if (!check_stripe_cache(mddev))  		return -ENOSPC; +	if (mddev->new_chunk_sectors > mddev->chunk_sectors || +	    mddev->delta_disks > 0) +		if (resize_chunks(conf, +				  conf->previous_raid_disks +				  + max(0, mddev->delta_disks), +				  max(mddev->new_chunk_sectors, +				      mddev->chunk_sectors) +			    ) < 0) +			return -ENOMEM;  	return resize_stripes(conf, (conf->previous_raid_disks  				     + mddev->delta_disks));  } @@ -7311,6 +7354,7 @@ static int raid5_start_reshape(struct mddev *mddev)  	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);  	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); +	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);  	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);  	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);  	mddev->sync_thread = md_register_thread(md_do_sync, mddev, diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 7dc0dd86074b..896d603ad0da 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -337,9 +337,12 @@ enum {  	STRIPE_ON_RELEASE_LIST,  	STRIPE_BATCH_READY,  	STRIPE_BATCH_ERR, +	STRIPE_BITMAP_PENDING,	/* Being added to bitmap, don't add +				 * to batch yet. +				 */  }; -#define STRIPE_EXPAND_SYNC_FLAG \ +#define STRIPE_EXPAND_SYNC_FLAGS \  	((1 << STRIPE_EXPAND_SOURCE) |\  	(1 << STRIPE_EXPAND_READY) |\  	(1 << STRIPE_EXPANDING) |\ |
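
A closer look at the bitmap.c hunk: next_active_rdev() seeds its iteration cursor by applying list_entry() to the list head itself, and the new comment stresses that the result "isn't really an rdev", only something whose ->same_set.next points at the first real entry. Below is a minimal userspace model of that idiom; the list_head, container_of and struct item definitions are hand-rolled stand-ins for <linux/list.h>, not kernel code.

#include <stddef.h>
#include <stdio.h>

/* Minimal userspace stand-ins for the kernel's list primitives. */
struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

struct item {
	int value;
	struct list_head same_set;
};

int main(void)
{
	struct list_head disks = { &disks, &disks };
	struct item a = { 1 }, b = { 2 }, *pos;

	list_add_tail(&a.same_set, &disks);
	list_add_tail(&b.same_set, &disks);

	/*
	 * Seed the cursor from the list head itself.  'pos' does not point
	 * at a real item (dereferencing pos->value here would be wrong),
	 * but pos->same_set.next is the first real entry, which is all a
	 * "continue"-style loop needs.
	 */
	pos = list_entry(&disks, struct item, same_set);

	/* Equivalent of list_for_each_entry_continue(pos, &disks, same_set). */
	for (pos = list_entry(pos->same_set.next, struct item, same_set);
	     &pos->same_set != &disks;
	     pos = list_entry(pos->same_set.next, struct item, same_set))
		printf("%d\n", pos->value);

	return 0;
}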
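
The dm-crypt hunks reorder the -EBUSY and -EINPROGRESS cases in crypt_convert() so that only -EBUSY waits on ctx->restart before falling through into the bookkeeping shared with -EINPROGRESS, and kcryptd_async_done() now signals that restart from its -EINPROGRESS branch. A reduced sketch of the resulting control flow, with the crypto and completion machinery replaced by stubs (wait_for_restart() and advance_sector() are placeholders, not DM functions):

#include <errno.h>
#include <stdio.h>

/* Stubs for the completion wait and per-request bookkeeping. */
static void wait_for_restart(void) { puts("backend full: waiting for restart"); }
static void advance_sector(int *sector) { (*sector)++; }

/*
 * Mirrors the reordered switch in crypt_convert(): -EBUSY must wait for
 * the async completion path to signal a restart and then falls through
 * into the bookkeeping it shares with -EINPROGRESS, which needs no wait.
 */
static int handle_status(int r, int *sector)
{
	switch (r) {
	case -EBUSY:
		wait_for_restart();
		/* fall through */
	case -EINPROGRESS:
		advance_sector(sector);
		return 0;
	case 0:
		/* completed synchronously */
		advance_sector(sector);
		return 0;
	default:
		return r;	/* hard error */
	}
}

int main(void)
{
	int sector = 0;

	handle_status(-EINPROGRESS, &sector);	/* no wait */
	handle_status(-EBUSY, &sector);		/* waits, then same bookkeeping */
	printf("sector = %d\n", sector);	/* prints 2 */
	return 0;
}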
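
In the dm_merge_bvec() hunk, max_size becomes a sector_t and is clamped to INT_MAX before being passed to the target's ->merge hook, which still takes an int; the FIXME in the diff explains that this is an interim fix until the hook itself switches to sector_t. A small sketch of why the clamp is needed, assuming uint64_t in place of sector_t and a dummy merge_cb() callback:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

/* Stand-in for a target ->merge hook, which takes and returns an int. */
static int merge_cb(int max_size)
{
	return max_size;	/* a real hook may shrink this further */
}

int main(void)
{
	uint64_t max_sectors = 8ULL * 1024 * 1024;	/* a 4 GiB limit, in sectors */
	unsigned int bi_size = 0;			/* bytes already in the bio */
	uint64_t max_size;

	/*
	 * With a 64-bit unsigned type the subtraction cannot wrap into a
	 * bogus negative int, but the byte count can still exceed what an
	 * int holds, so clamp before calling the int-based callback.
	 */
	max_size = (max_sectors << SECTOR_SHIFT) - bi_size;
	if (max_size > INT_MAX)
		max_size = INT_MAX;

	printf("passing %d bytes to ->merge\n", merge_cb((int)max_size));
	return 0;
}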
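
The array_state_store() hunk in md.c turns an early "return err;" into "return err ?: len;": a sysfs store method must report the number of bytes consumed on success, and "a ?: b" is the GCC/Clang conditional-with-omitted-operand shorthand for "a ? a : b". A trivial illustration (store_like() is a made-up stand-in for a ->store handler):

#include <stdio.h>
#include <sys/types.h>

/*
 * Models a sysfs ->store handler: on success it must report how many
 * bytes it consumed, on failure a negative errno.  'a ?: b' is the
 * GCC/Clang shorthand for 'a ? a : b' without evaluating 'a' twice.
 */
static ssize_t store_like(int err, size_t len)
{
	return err ?: (ssize_t)len;
}

int main(void)
{
	printf("%zd\n", store_like(0, 5));	/* success: 5 bytes consumed */
	printf("%zd\n", store_like(-16, 5));	/* failure: stays -16 (EBUSY) */
	return 0;
}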
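
The raid0_make_request() hunk moves the "sector = bio->bi_iter.bi_sector" restore up to just after the chunk-boundary calculation. The kernel's sector_div() divides its first argument in place and returns only the remainder, so the original sector is destroyed there; restoring it before bio_split() also keeps the later find_zone()/map_sector() calls from seeing a bi_sector that the split has already advanced. A userspace model of that in-place division and the save/restore it forces (sector_div_like() is a stand-in, not the kernel helper):

#include <stdint.h>
#include <stdio.h>

/*
 * Models sector_div(): divides *s in place (quotient left in *s) and
 * returns the remainder, so the caller loses the original value.
 */
static uint32_t sector_div_like(uint64_t *s, uint32_t div)
{
	uint32_t rem = (uint32_t)(*s % div);

	*s /= div;
	return rem;
}

int main(void)
{
	const uint64_t bi_sector = 1000003;	/* original bio sector */
	uint64_t sector = bi_sector;
	uint32_t chunk_sects = 128;
	uint64_t sectors;

	/* Sectors left in this chunk; this clobbers 'sector'. */
	sectors = chunk_sects - sector_div_like(&sector, chunk_sects);

	/* Restore due to sector_div, before 'sector' is used again. */
	sector = bi_sector;

	printf("%llu sectors to chunk end, resume at sector %llu\n",
	       (unsigned long long)sectors, (unsigned long long)sector);
	return 0;
}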
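
The add_stripe_bio() hunk in raid5.c cannot keep the stripe spinlock held across bitmap_startwrite(), so it sets the new STRIPE_BITMAP_PENDING flag, drops the lock, makes the sleeping call, then retakes the lock and clears the flag; stripe_can_batch() now refuses to batch a stripe while the flag is set. A pthread sketch of that "pending flag covers the unlocked window" pattern, with made-up names and a slow_bitmap_work() stub in place of the bitmap call:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct stripe {
	pthread_mutex_t lock;
	bool bitmap_pending;	/* set while the lock is dropped below */
	bool batched;
};

/* Stands in for bitmap_startwrite(), which may sleep. */
static void slow_bitmap_work(void) { }

/* Only batch a stripe that is not in the middle of bitmap setup. */
static bool stripe_can_batch(struct stripe *sh)
{
	return !sh->bitmap_pending;
}

static void add_write(struct stripe *sh)
{
	pthread_mutex_lock(&sh->lock);
	/*
	 * Mark the window during which the lock is not held, so that a
	 * concurrent batching decision (taken under the same lock) will
	 * skip this stripe until the slow work is done.
	 */
	sh->bitmap_pending = true;
	pthread_mutex_unlock(&sh->lock);

	slow_bitmap_work();		/* must not hold the lock here */

	pthread_mutex_lock(&sh->lock);
	sh->bitmap_pending = false;
	pthread_mutex_unlock(&sh->lock);
}

static void try_batch(struct stripe *sh)
{
	pthread_mutex_lock(&sh->lock);
	if (stripe_can_batch(sh))
		sh->batched = true;
	pthread_mutex_unlock(&sh->lock);
}

int main(void)
{
	struct stripe sh = { .batched = false };

	pthread_mutex_init(&sh.lock, NULL);
	add_write(&sh);
	try_batch(&sh);
	printf("batched: %d\n", sh.batched);
	return 0;
}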