Diffstat (limited to 'fs/btrfs/scrub.c')
| -rw-r--r-- | fs/btrfs/scrub.c | 1836 | 
1 file changed, 1333 insertions, 503 deletions
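The patch below (part of the dev-replace series) renames the per-device scrub state struct scrub_dev to struct scrub_ctx, adds a write context (struct scrub_wr_ctx) for copying data to the replacement target device, and converts the scrub_page entries embedded in scrub_block/scrub_bio into individually allocated, reference-counted objects managed by scrub_page_get()/scrub_page_put(). As a reading aid, here is a minimal user-space sketch of that reference-counting pattern. The names mirror the diff, but the simplified struct layout, scrub_page_alloc() and the demo main() are illustrative assumptions, not kernel code.

	/* Stand-alone sketch of the get/put life cycle the patch introduces
	 * for struct scrub_page.  Illustrative only: atomic_int replaces the
	 * kernel's atomic_t, malloc/free replace alloc_page()/__free_page()
	 * and kzalloc()/kfree(). */
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct scrub_page {
		atomic_int	ref_count;	/* free memory on transition to zero */
		void		*page;		/* stands in for struct page * */
	};

	/* Hypothetical helper for the sketch; in the patch the pages are
	 * allocated inline in scrub_pages() and scrub_setup_recheck_block(). */
	static struct scrub_page *scrub_page_alloc(void)
	{
		struct scrub_page *spage = calloc(1, sizeof(*spage));

		if (!spage)
			return NULL;
		atomic_init(&spage->ref_count, 1);	/* one ref for the creator */
		spage->page = malloc(4096);		/* analogue of alloc_page() */
		return spage;
	}

	static void scrub_page_get(struct scrub_page *spage)
	{
		atomic_fetch_add(&spage->ref_count, 1);
	}

	static void scrub_page_put(struct scrub_page *spage)
	{
		/* the last put frees both the data page and the descriptor */
		if (atomic_fetch_sub(&spage->ref_count, 1) == 1) {
			free(spage->page);
			free(spage);
		}
	}

	int main(void)
	{
		struct scrub_page *spage = scrub_page_alloc();

		if (!spage)
			return 1;
		scrub_page_get(spage);	/* e.g. a read or write bio takes a reference */
		scrub_page_put(spage);	/* bio completion drops it */
		scrub_page_put(spage);	/* the owning block drops the last reference */
		printf("page freed on last put\n");
		return 0;
	}

The point of the change is visible throughout the diff: once a page can be referenced by its scrub_block, a read bio and a dev-replace write bio at the same time, a fixed embedded array no longer works, so ownership is tracked per page and the memory is released wherever the last reference is dropped.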
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 27892f67e69b..bdbb94f245c9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1,5 +1,5 @@  /* - * Copyright (C) 2011 STRATO.  All rights reserved. + * Copyright (C) 2011, 2012 STRATO.  All rights reserved.   *   * This program is free software; you can redistribute it and/or   * modify it under the terms of the GNU General Public @@ -25,6 +25,7 @@  #include "transaction.h"  #include "backref.h"  #include "extent_io.h" +#include "dev-replace.h"  #include "check-integrity.h"  #include "rcu-string.h" @@ -42,10 +43,23 @@   */  struct scrub_block; -struct scrub_dev; +struct scrub_ctx; -#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */ -#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */ +/* + * the following three values only influence the performance. + * The last one configures the number of parallel and outstanding I/O + * operations. The first two values configure an upper limit for the number + * of (dynamically allocated) pages that are added to a bio. + */ +#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */ +#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */ +#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */ + +/* + * the following value times PAGE_SIZE needs to be large enough to match the + * largest node/leaf/sector size that shall be supported. + * Values larger than BTRFS_STRIPE_LEN are not supported. + */  #define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */  struct scrub_page { @@ -56,6 +70,8 @@ struct scrub_page {  	u64			generation;  	u64			logical;  	u64			physical; +	u64			physical_for_dev_replace; +	atomic_t		ref_count;  	struct {  		unsigned int	mirror_num:8;  		unsigned int	have_csum:1; @@ -66,23 +82,28 @@ struct scrub_page {  struct scrub_bio {  	int			index; -	struct scrub_dev	*sdev; +	struct scrub_ctx	*sctx; +	struct btrfs_device	*dev;  	struct bio		*bio;  	int			err;  	u64			logical;  	u64			physical; -	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO]; +#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO +	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO]; +#else +	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO]; +#endif  	int			page_count;  	int			next_free;  	struct btrfs_work	work;  };  struct scrub_block { -	struct scrub_page	pagev[SCRUB_MAX_PAGES_PER_BLOCK]; +	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];  	int			page_count;  	atomic_t		outstanding_pages;  	atomic_t		ref_count; /* free mem on transition to zero */ -	struct scrub_dev	*sdev; +	struct scrub_ctx	*sctx;  	struct {  		unsigned int	header_error:1;  		unsigned int	checksum_error:1; @@ -91,23 +112,35 @@ struct scrub_block {  	};  }; -struct scrub_dev { -	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV]; -	struct btrfs_device	*dev; +struct scrub_wr_ctx { +	struct scrub_bio *wr_curr_bio; +	struct btrfs_device *tgtdev; +	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ +	atomic_t flush_all_writes; +	struct mutex wr_lock; +}; + +struct scrub_ctx { +	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX]; +	struct btrfs_root	*dev_root;  	int			first_free;  	int			curr; -	atomic_t		in_flight; -	atomic_t		fixup_cnt; +	atomic_t		bios_in_flight; +	atomic_t		workers_pending;  	spinlock_t		list_lock;  	wait_queue_head_t	list_wait;  	u16			csum_size;  	struct list_head	csum_list;  	atomic_t		cancel_req;  	int			readonly; -	int			pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ +	int			pages_per_rd_bio;  	u32			sectorsize;  	u32			nodesize;  	u32			leafsize; + +	int			is_dev_replace; +	struct scrub_wr_ctx	wr_ctx; +  	/*  	 * 
statistics  	 */ @@ -116,13 +149,23 @@ struct scrub_dev {  };  struct scrub_fixup_nodatasum { -	struct scrub_dev	*sdev; +	struct scrub_ctx	*sctx; +	struct btrfs_device	*dev;  	u64			logical;  	struct btrfs_root	*root;  	struct btrfs_work	work;  	int			mirror_num;  }; +struct scrub_copy_nocow_ctx { +	struct scrub_ctx	*sctx; +	u64			logical; +	u64			len; +	int			mirror_num; +	u64			physical_for_dev_replace; +	struct btrfs_work	work; +}; +  struct scrub_warning {  	struct btrfs_path	*path;  	u64			extent_item_size; @@ -137,15 +180,20 @@ struct scrub_warning {  }; +static void scrub_pending_bio_inc(struct scrub_ctx *sctx); +static void scrub_pending_bio_dec(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);  static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_dev *sdev, -				     struct btrfs_mapping_tree *map_tree, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, +				     struct btrfs_fs_info *fs_info, +				     struct scrub_block *original_sblock,  				     u64 length, u64 logical, -				     struct scrub_block *sblock); -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, -			       struct scrub_block *sblock, int is_metadata, -			       int have_csum, u8 *csum, u64 generation, -			       u16 csum_size); +				     struct scrub_block *sblocks_for_recheck); +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, +				struct scrub_block *sblock, int is_metadata, +				int have_csum, u8 *csum, u64 generation, +				u16 csum_size);  static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,  					 struct scrub_block *sblock,  					 int is_metadata, int have_csum, @@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,  static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,  					    struct scrub_block *sblock_good,  					    int page_num, int force_write); +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, +					   int page_num);  static int scrub_checksum_data(struct scrub_block *sblock);  static int scrub_checksum_tree_block(struct scrub_block *sblock);  static int scrub_checksum_super(struct scrub_block *sblock);  static void scrub_block_get(struct scrub_block *sblock);  static void scrub_block_put(struct scrub_block *sblock); -static int scrub_add_page_to_bio(struct scrub_dev *sdev, -				 struct scrub_page *spage); -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, -		       u64 physical, u64 flags, u64 gen, int mirror_num, -		       u8 *csum, int force); +static void scrub_page_get(struct scrub_page *spage); +static void scrub_page_put(struct scrub_page *spage); +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, +				    struct scrub_page *spage); +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +		       u64 physical, struct btrfs_device *dev, u64 flags, +		       u64 gen, int mirror_num, u8 *csum, int force, +		       u64 physical_for_dev_replace);  static void scrub_bio_end_io(struct bio *bio, int err);  static void scrub_bio_end_io_worker(struct btrfs_work *work);  static void scrub_block_complete(struct scrub_block *sblock); +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, +			       u64 extent_logical, u64 extent_len, +			       u64 
*extent_physical, +			       struct btrfs_device **extent_dev, +			       int *extent_mirror_num); +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, +			      struct scrub_wr_ctx *wr_ctx, +			      struct btrfs_fs_info *fs_info, +			      struct btrfs_device *dev, +			      int is_dev_replace); +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, +				    struct scrub_page *spage); +static void scrub_wr_submit(struct scrub_ctx *sctx); +static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static int write_page_nocow(struct scrub_ctx *sctx, +			    u64 physical_for_dev_replace, struct page *page); +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, +				      void *ctx); +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +			    int mirror_num, u64 physical_for_dev_replace); +static void copy_nocow_pages_worker(struct btrfs_work *work); + + +static void scrub_pending_bio_inc(struct scrub_ctx *sctx) +{ +	atomic_inc(&sctx->bios_in_flight); +} + +static void scrub_pending_bio_dec(struct scrub_ctx *sctx) +{ +	atomic_dec(&sctx->bios_in_flight); +	wake_up(&sctx->list_wait); +} + +/* + * used for workers that require transaction commits (i.e., for the + * NOCOW case) + */ +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) +{ +	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + +	/* +	 * increment scrubs_running to prevent cancel requests from +	 * completing as long as a worker is running. we must also +	 * increment scrubs_paused to prevent deadlocking on pause +	 * requests used for transactions commits (as the worker uses a +	 * transaction context). it is safe to regard the worker +	 * as paused for all matters practical. effectively, we only +	 * avoid cancellation requests from completing. 
+	 */ +	mutex_lock(&fs_info->scrub_lock); +	atomic_inc(&fs_info->scrubs_running); +	atomic_inc(&fs_info->scrubs_paused); +	mutex_unlock(&fs_info->scrub_lock); +	atomic_inc(&sctx->workers_pending); +} +/* used for workers that require transaction commits */ +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) +{ +	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; -static void scrub_free_csums(struct scrub_dev *sdev) +	/* +	 * see scrub_pending_trans_workers_inc() why we're pretending +	 * to be paused in the scrub counters +	 */ +	mutex_lock(&fs_info->scrub_lock); +	atomic_dec(&fs_info->scrubs_running); +	atomic_dec(&fs_info->scrubs_paused); +	mutex_unlock(&fs_info->scrub_lock); +	atomic_dec(&sctx->workers_pending); +	wake_up(&fs_info->scrub_pause_wait); +	wake_up(&sctx->list_wait); +} + +static void scrub_free_csums(struct scrub_ctx *sctx)  { -	while (!list_empty(&sdev->csum_list)) { +	while (!list_empty(&sctx->csum_list)) {  		struct btrfs_ordered_sum *sum; -		sum = list_first_entry(&sdev->csum_list, +		sum = list_first_entry(&sctx->csum_list,  				       struct btrfs_ordered_sum, list);  		list_del(&sum->list);  		kfree(sum);  	}  } -static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) +static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)  {  	int i; -	if (!sdev) +	if (!sctx)  		return; +	scrub_free_wr_ctx(&sctx->wr_ctx); +  	/* this can happen when scrub is cancelled */ -	if (sdev->curr != -1) { -		struct scrub_bio *sbio = sdev->bios[sdev->curr]; +	if (sctx->curr != -1) { +		struct scrub_bio *sbio = sctx->bios[sctx->curr];  		for (i = 0; i < sbio->page_count; i++) { -			BUG_ON(!sbio->pagev[i]); -			BUG_ON(!sbio->pagev[i]->page); +			WARN_ON(!sbio->pagev[i]->page);  			scrub_block_put(sbio->pagev[i]->sblock);  		}  		bio_put(sbio->bio);  	} -	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { -		struct scrub_bio *sbio = sdev->bios[i]; +	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { +		struct scrub_bio *sbio = sctx->bios[i];  		if (!sbio)  			break;  		kfree(sbio);  	} -	scrub_free_csums(sdev); -	kfree(sdev); +	scrub_free_csums(sctx); +	kfree(sctx);  }  static noinline_for_stack -struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) +struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)  { -	struct scrub_dev *sdev; +	struct scrub_ctx *sctx;  	int		i;  	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; -	int pages_per_bio; +	int pages_per_rd_bio; +	int ret; -	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, -			      bio_get_nr_vecs(dev->bdev)); -	sdev = kzalloc(sizeof(*sdev), GFP_NOFS); -	if (!sdev) +	/* +	 * the setting of pages_per_rd_bio is correct for scrub but might +	 * be wrong for the dev_replace code where we might read from +	 * different devices in the initial huge bios. However, that +	 * code is able to correctly handle the case when adding a page +	 * to a bio fails. 
+	 */ +	if (dev->bdev) +		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, +					 bio_get_nr_vecs(dev->bdev)); +	else +		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; +	sctx = kzalloc(sizeof(*sctx), GFP_NOFS); +	if (!sctx)  		goto nomem; -	sdev->dev = dev; -	sdev->pages_per_bio = pages_per_bio; -	sdev->curr = -1; -	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { +	sctx->is_dev_replace = is_dev_replace; +	sctx->pages_per_rd_bio = pages_per_rd_bio; +	sctx->curr = -1; +	sctx->dev_root = dev->dev_root; +	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {  		struct scrub_bio *sbio;  		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);  		if (!sbio)  			goto nomem; -		sdev->bios[i] = sbio; +		sctx->bios[i] = sbio;  		sbio->index = i; -		sbio->sdev = sdev; +		sbio->sctx = sctx;  		sbio->page_count = 0;  		sbio->work.func = scrub_bio_end_io_worker; -		if (i != SCRUB_BIOS_PER_DEV-1) -			sdev->bios[i]->next_free = i + 1; +		if (i != SCRUB_BIOS_PER_SCTX - 1) +			sctx->bios[i]->next_free = i + 1;  		else -			sdev->bios[i]->next_free = -1; -	} -	sdev->first_free = 0; -	sdev->nodesize = dev->dev_root->nodesize; -	sdev->leafsize = dev->dev_root->leafsize; -	sdev->sectorsize = dev->dev_root->sectorsize; -	atomic_set(&sdev->in_flight, 0); -	atomic_set(&sdev->fixup_cnt, 0); -	atomic_set(&sdev->cancel_req, 0); -	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); -	INIT_LIST_HEAD(&sdev->csum_list); - -	spin_lock_init(&sdev->list_lock); -	spin_lock_init(&sdev->stat_lock); -	init_waitqueue_head(&sdev->list_wait); -	return sdev; +			sctx->bios[i]->next_free = -1; +	} +	sctx->first_free = 0; +	sctx->nodesize = dev->dev_root->nodesize; +	sctx->leafsize = dev->dev_root->leafsize; +	sctx->sectorsize = dev->dev_root->sectorsize; +	atomic_set(&sctx->bios_in_flight, 0); +	atomic_set(&sctx->workers_pending, 0); +	atomic_set(&sctx->cancel_req, 0); +	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); +	INIT_LIST_HEAD(&sctx->csum_list); + +	spin_lock_init(&sctx->list_lock); +	spin_lock_init(&sctx->stat_lock); +	init_waitqueue_head(&sctx->list_wait); + +	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, +				 fs_info->dev_replace.tgtdev, is_dev_replace); +	if (ret) { +		scrub_free_ctx(sctx); +		return ERR_PTR(ret); +	} +	return sctx;  nomem: -	scrub_free_dev(sdev); +	scrub_free_ctx(sctx);  	return ERR_PTR(-ENOMEM);  } -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, +				     void *warn_ctx)  {  	u64 isize;  	u32 nlink; @@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)  	int i;  	struct extent_buffer *eb;  	struct btrfs_inode_item *inode_item; -	struct scrub_warning *swarn = ctx; +	struct scrub_warning *swarn = warn_ctx;  	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;  	struct inode_fs_paths *ipath = NULL;  	struct btrfs_root *local_root; @@ -345,8 +496,8 @@ err:  static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  { -	struct btrfs_device *dev = sblock->sdev->dev; -	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; +	struct btrfs_device *dev; +	struct btrfs_fs_info *fs_info;  	struct btrfs_path *path;  	struct btrfs_key found_key;  	struct extent_buffer *eb; @@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  	const int bufsize = 4096;  	int ret; +	WARN_ON(sblock->page_count < 1); +	dev = sblock->pagev[0]->dev; +	fs_info = sblock->sctx->dev_root->fs_info; +  	path = 
btrfs_alloc_path();  	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);  	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); -	BUG_ON(sblock->page_count < 1); -	swarn.sector = (sblock->pagev[0].physical) >> 9; -	swarn.logical = sblock->pagev[0].logical; +	swarn.sector = (sblock->pagev[0]->physical) >> 9; +	swarn.logical = sblock->pagev[0]->logical;  	swarn.errstr = errstr; -	swarn.dev = dev; +	swarn.dev = NULL;  	swarn.msg_bufsize = bufsize;  	swarn.scratch_bufsize = bufsize; @@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  		} while (ret != 1);  	} else {  		swarn.path = path; +		swarn.dev = dev;  		iterate_extent_inodes(fs_info, found_key.objectid,  					extent_item_pos, 1,  					scrub_print_warning_inode, &swarn); @@ -416,11 +571,11 @@ out:  	kfree(swarn.msg_buf);  } -static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)  {  	struct page *page = NULL;  	unsigned long index; -	struct scrub_fixup_nodatasum *fixup = ctx; +	struct scrub_fixup_nodatasum *fixup = fixup_ctx;  	int ret;  	int corrected = 0;  	struct btrfs_key key; @@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)  	}  	if (PageUptodate(page)) { -		struct btrfs_mapping_tree *map_tree; +		struct btrfs_fs_info *fs_info;  		if (PageDirty(page)) {  			/*  			 * we need to write the data to the defect sector. the @@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)  			ret = -EIO;  			goto out;  		} -		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; -		ret = repair_io_failure(map_tree, offset, PAGE_SIZE, +		fs_info = BTRFS_I(inode)->root->fs_info; +		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,  					fixup->logical, page,  					fixup->mirror_num);  		unlock_page(page); @@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)  {  	int ret;  	struct scrub_fixup_nodatasum *fixup; -	struct scrub_dev *sdev; +	struct scrub_ctx *sctx;  	struct btrfs_trans_handle *trans = NULL;  	struct btrfs_fs_info *fs_info;  	struct btrfs_path *path;  	int uncorrectable = 0;  	fixup = container_of(work, struct scrub_fixup_nodatasum, work); -	sdev = fixup->sdev; +	sctx = fixup->sctx;  	fs_info = fixup->root->fs_info;  	path = btrfs_alloc_path();  	if (!path) { -		spin_lock(&sdev->stat_lock); -		++sdev->stat.malloc_errors; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		++sctx->stat.malloc_errors; +		spin_unlock(&sctx->stat_lock);  		uncorrectable = 1;  		goto out;  	} @@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)  	}  	WARN_ON(ret != 1); -	spin_lock(&sdev->stat_lock); -	++sdev->stat.corrected_errors; -	spin_unlock(&sdev->stat_lock); +	spin_lock(&sctx->stat_lock); +	++sctx->stat.corrected_errors; +	spin_unlock(&sctx->stat_lock);  out:  	if (trans && !IS_ERR(trans))  		btrfs_end_transaction(trans, fixup->root);  	if (uncorrectable) { -		spin_lock(&sdev->stat_lock); -		++sdev->stat.uncorrectable_errors; -		spin_unlock(&sdev->stat_lock); - +		spin_lock(&sctx->stat_lock); +		++sctx->stat.uncorrectable_errors; +		spin_unlock(&sctx->stat_lock); +		btrfs_dev_replace_stats_inc( +			&sctx->dev_root->fs_info->dev_replace. 
+			num_uncorrectable_read_errors);  		printk_ratelimited_in_rcu(KERN_ERR  			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",  			(unsigned long long)fixup->logical, -			rcu_str_deref(sdev->dev->name)); +			rcu_str_deref(fixup->dev->name));  	}  	btrfs_free_path(path);  	kfree(fixup); -	/* see caller why we're pretending to be paused in the scrub counters */ -	mutex_lock(&fs_info->scrub_lock); -	atomic_dec(&fs_info->scrubs_running); -	atomic_dec(&fs_info->scrubs_paused); -	mutex_unlock(&fs_info->scrub_lock); -	atomic_dec(&sdev->fixup_cnt); -	wake_up(&fs_info->scrub_pause_wait); -	wake_up(&sdev->list_wait); +	scrub_pending_trans_workers_dec(sctx);  }  /* @@ -614,7 +764,8 @@ out:   */  static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  { -	struct scrub_dev *sdev = sblock_to_check->sdev; +	struct scrub_ctx *sctx = sblock_to_check->sctx; +	struct btrfs_device *dev;  	struct btrfs_fs_info *fs_info;  	u64 length;  	u64 logical; @@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  				      DEFAULT_RATELIMIT_BURST);  	BUG_ON(sblock_to_check->page_count < 1); -	fs_info = sdev->dev->dev_root->fs_info; +	fs_info = sctx->dev_root->fs_info; +	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { +		/* +		 * if we find an error in a super block, we just report it. +		 * They will get written with the next transaction commit +		 * anyway +		 */ +		spin_lock(&sctx->stat_lock); +		++sctx->stat.super_errors; +		spin_unlock(&sctx->stat_lock); +		return 0; +	}  	length = sblock_to_check->page_count * PAGE_SIZE; -	logical = sblock_to_check->pagev[0].logical; -	generation = sblock_to_check->pagev[0].generation; -	BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); -	failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; -	is_metadata = !(sblock_to_check->pagev[0].flags & +	logical = sblock_to_check->pagev[0]->logical; +	generation = sblock_to_check->pagev[0]->generation; +	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); +	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; +	is_metadata = !(sblock_to_check->pagev[0]->flags &  			BTRFS_EXTENT_FLAG_DATA); -	have_csum = sblock_to_check->pagev[0].have_csum; -	csum = sblock_to_check->pagev[0].csum; +	have_csum = sblock_to_check->pagev[0]->have_csum; +	csum = sblock_to_check->pagev[0]->csum; +	dev = sblock_to_check->pagev[0]->dev; + +	if (sctx->is_dev_replace && !is_metadata && !have_csum) { +		sblocks_for_recheck = NULL; +		goto nodatasum_case; +	}  	/*  	 * read all mirrors one after the other. 
This includes to @@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  				     sizeof(*sblocks_for_recheck),  				     GFP_NOFS);  	if (!sblocks_for_recheck) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.malloc_errors++; -		sdev->stat.read_errors++; -		sdev->stat.uncorrectable_errors++; -		spin_unlock(&sdev->stat_lock); -		btrfs_dev_stat_inc_and_print(sdev->dev, -					     BTRFS_DEV_STAT_READ_ERRS); +		spin_lock(&sctx->stat_lock); +		sctx->stat.malloc_errors++; +		sctx->stat.read_errors++; +		sctx->stat.uncorrectable_errors++; +		spin_unlock(&sctx->stat_lock); +		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);  		goto out;  	}  	/* setup the context, map the logical blocks and alloc the pages */ -	ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, +	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,  					logical, sblocks_for_recheck);  	if (ret) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.read_errors++; -		sdev->stat.uncorrectable_errors++; -		spin_unlock(&sdev->stat_lock); -		btrfs_dev_stat_inc_and_print(sdev->dev, -					     BTRFS_DEV_STAT_READ_ERRS); +		spin_lock(&sctx->stat_lock); +		sctx->stat.read_errors++; +		sctx->stat.uncorrectable_errors++; +		spin_unlock(&sctx->stat_lock); +		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);  		goto out;  	}  	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);  	sblock_bad = sblocks_for_recheck + failed_mirror_index;  	/* build and submit the bios for the failed mirror, check checksums */ -	ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, -				  csum, generation, sdev->csum_size); -	if (ret) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.read_errors++; -		sdev->stat.uncorrectable_errors++; -		spin_unlock(&sdev->stat_lock); -		btrfs_dev_stat_inc_and_print(sdev->dev, -					     BTRFS_DEV_STAT_READ_ERRS); -		goto out; -	} +	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, +			    csum, generation, sctx->csum_size);  	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&  	    sblock_bad->no_io_error_seen) { @@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  		 * different bio (usually one of the two latter cases is  		 * the cause)  		 */ -		spin_lock(&sdev->stat_lock); -		sdev->stat.unverified_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.unverified_errors++; +		spin_unlock(&sctx->stat_lock); +		if (sctx->is_dev_replace) +			scrub_write_block_to_dev_replace(sblock_bad);  		goto out;  	}  	if (!sblock_bad->no_io_error_seen) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.read_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.read_errors++; +		spin_unlock(&sctx->stat_lock);  		if (__ratelimit(&_rs))  			scrub_print_warning("i/o error", sblock_to_check); -		btrfs_dev_stat_inc_and_print(sdev->dev, -					     BTRFS_DEV_STAT_READ_ERRS); +		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);  	} else if (sblock_bad->checksum_error) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.csum_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.csum_errors++; +		spin_unlock(&sctx->stat_lock);  		if (__ratelimit(&_rs))  			scrub_print_warning("checksum error", sblock_to_check); -		btrfs_dev_stat_inc_and_print(sdev->dev, +		btrfs_dev_stat_inc_and_print(dev,  					     BTRFS_DEV_STAT_CORRUPTION_ERRS);  	} else if 
(sblock_bad->header_error) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.verify_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.verify_errors++; +		spin_unlock(&sctx->stat_lock);  		if (__ratelimit(&_rs))  			scrub_print_warning("checksum/header error",  					    sblock_to_check);  		if (sblock_bad->generation_error) -			btrfs_dev_stat_inc_and_print(sdev->dev, +			btrfs_dev_stat_inc_and_print(dev,  				BTRFS_DEV_STAT_GENERATION_ERRS);  		else -			btrfs_dev_stat_inc_and_print(sdev->dev, +			btrfs_dev_stat_inc_and_print(dev,  				BTRFS_DEV_STAT_CORRUPTION_ERRS);  	} -	if (sdev->readonly) +	if (sctx->readonly && !sctx->is_dev_replace)  		goto did_not_correct_error;  	if (!is_metadata && !have_csum) {  		struct scrub_fixup_nodatasum *fixup_nodatasum; +nodatasum_case: +		WARN_ON(sctx->is_dev_replace); +  		/*  		 * !is_metadata and !have_csum, this means that the data  		 * might not be COW'ed, that it might be modified @@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);  		if (!fixup_nodatasum)  			goto did_not_correct_error; -		fixup_nodatasum->sdev = sdev; +		fixup_nodatasum->sctx = sctx; +		fixup_nodatasum->dev = dev;  		fixup_nodatasum->logical = logical;  		fixup_nodatasum->root = fs_info->extent_root;  		fixup_nodatasum->mirror_num = failed_mirror_index + 1; -		/* -		 * increment scrubs_running to prevent cancel requests from -		 * completing as long as a fixup worker is running. we must also -		 * increment scrubs_paused to prevent deadlocking on pause -		 * requests used for transactions commits (as the worker uses a -		 * transaction context). it is safe to regard the fixup worker -		 * as paused for all matters practical. effectively, we only -		 * avoid cancellation requests from completing. -		 */ -		mutex_lock(&fs_info->scrub_lock); -		atomic_inc(&fs_info->scrubs_running); -		atomic_inc(&fs_info->scrubs_paused); -		mutex_unlock(&fs_info->scrub_lock); -		atomic_inc(&sdev->fixup_cnt); +		scrub_pending_trans_workers_inc(sctx);  		fixup_nodatasum->work.func = scrub_fixup_nodatasum;  		btrfs_queue_worker(&fs_info->scrub_workers,  				   &fixup_nodatasum->work); @@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  	/*  	 * now build and submit the bios for the other mirrors, check -	 * checksums -	 */ -	for (mirror_index = 0; -	     mirror_index < BTRFS_MAX_MIRRORS && -	     sblocks_for_recheck[mirror_index].page_count > 0; -	     mirror_index++) { -		if (mirror_index == failed_mirror_index) -			continue; - -		/* build and submit the bios, check checksums */ -		ret = scrub_recheck_block(fs_info, -					  sblocks_for_recheck + mirror_index, -					  is_metadata, have_csum, csum, -					  generation, sdev->csum_size); -		if (ret) -			goto did_not_correct_error; -	} - -	/* -	 * first try to pick the mirror which is completely without I/O +	 * checksums. +	 * First try to pick the mirror which is completely without I/O  	 * errors and also does not have a checksum error.  	 * If one is found, and if a checksum is present, the full block  	 * that is known to contain an error is rewritten. 
Afterwards @@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  	     mirror_index < BTRFS_MAX_MIRRORS &&  	     sblocks_for_recheck[mirror_index].page_count > 0;  	     mirror_index++) { -		struct scrub_block *sblock_other = sblocks_for_recheck + -						   mirror_index; +		struct scrub_block *sblock_other; + +		if (mirror_index == failed_mirror_index) +			continue; +		sblock_other = sblocks_for_recheck + mirror_index; + +		/* build and submit the bios, check checksums */ +		scrub_recheck_block(fs_info, sblock_other, is_metadata, +				    have_csum, csum, generation, +				    sctx->csum_size);  		if (!sblock_other->header_error &&  		    !sblock_other->checksum_error &&  		    sblock_other->no_io_error_seen) { -			int force_write = is_metadata || have_csum; - -			ret = scrub_repair_block_from_good_copy(sblock_bad, -								sblock_other, -								force_write); +			if (sctx->is_dev_replace) { +				scrub_write_block_to_dev_replace(sblock_other); +			} else { +				int force_write = is_metadata || have_csum; + +				ret = scrub_repair_block_from_good_copy( +						sblock_bad, sblock_other, +						force_write); +			}  			if (0 == ret)  				goto corrected_error;  		}  	}  	/* -	 * in case of I/O errors in the area that is supposed to be +	 * for dev_replace, pick good pages and write to the target device. +	 */ +	if (sctx->is_dev_replace) { +		success = 1; +		for (page_num = 0; page_num < sblock_bad->page_count; +		     page_num++) { +			int sub_success; + +			sub_success = 0; +			for (mirror_index = 0; +			     mirror_index < BTRFS_MAX_MIRRORS && +			     sblocks_for_recheck[mirror_index].page_count > 0; +			     mirror_index++) { +				struct scrub_block *sblock_other = +					sblocks_for_recheck + mirror_index; +				struct scrub_page *page_other = +					sblock_other->pagev[page_num]; + +				if (!page_other->io_error) { +					ret = scrub_write_page_to_dev_replace( +							sblock_other, page_num); +					if (ret == 0) { +						/* succeeded for this page */ +						sub_success = 1; +						break; +					} else { +						btrfs_dev_replace_stats_inc( +							&sctx->dev_root-> +							fs_info->dev_replace. +							num_write_errors); +					} +				} +			} + +			if (!sub_success) { +				/* +				 * did not find a mirror to fetch the page +				 * from. scrub_write_page_to_dev_replace() +				 * handles this case (page->io_error), by +				 * filling the block with zeros before +				 * submitting the write request +				 */ +				success = 0; +				ret = scrub_write_page_to_dev_replace( +						sblock_bad, page_num); +				if (ret) +					btrfs_dev_replace_stats_inc( +						&sctx->dev_root->fs_info-> +						dev_replace.num_write_errors); +			} +		} + +		goto out; +	} + +	/* +	 * for regular scrub, repair those pages that are errored. +	 * In case of I/O errors in the area that is supposed to be  	 * repaired, continue by picking good copies of those pages.  	 * Select the good pages from mirrors to rewrite bad pages from  	 * the area to fix. 
Afterwards verify the checksum of the block @@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  	success = 1;  	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { -		struct scrub_page *page_bad = sblock_bad->pagev + page_num; +		struct scrub_page *page_bad = sblock_bad->pagev[page_num];  		if (!page_bad->io_error)  			continue; @@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  		     mirror_index++) {  			struct scrub_block *sblock_other = sblocks_for_recheck +  							   mirror_index; -			struct scrub_page *page_other = sblock_other->pagev + -							page_num; +			struct scrub_page *page_other = sblock_other->pagev[ +							page_num];  			if (!page_other->io_error) {  				ret = scrub_repair_page_from_good_copy( @@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  			 * is verified, but most likely the data comes out  			 * of the page cache.  			 */ -			ret = scrub_recheck_block(fs_info, sblock_bad, -						  is_metadata, have_csum, csum, -						  generation, sdev->csum_size); -			if (!ret && !sblock_bad->header_error && +			scrub_recheck_block(fs_info, sblock_bad, +					    is_metadata, have_csum, csum, +					    generation, sctx->csum_size); +			if (!sblock_bad->header_error &&  			    !sblock_bad->checksum_error &&  			    sblock_bad->no_io_error_seen)  				goto corrected_error; @@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  				goto did_not_correct_error;  		} else {  corrected_error: -			spin_lock(&sdev->stat_lock); -			sdev->stat.corrected_errors++; -			spin_unlock(&sdev->stat_lock); +			spin_lock(&sctx->stat_lock); +			sctx->stat.corrected_errors++; +			spin_unlock(&sctx->stat_lock);  			printk_ratelimited_in_rcu(KERN_ERR  				"btrfs: fixed up error at logical %llu on dev %s\n",  				(unsigned long long)logical, -				rcu_str_deref(sdev->dev->name)); +				rcu_str_deref(dev->name));  		}  	} else {  did_not_correct_error: -		spin_lock(&sdev->stat_lock); -		sdev->stat.uncorrectable_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.uncorrectable_errors++; +		spin_unlock(&sctx->stat_lock);  		printk_ratelimited_in_rcu(KERN_ERR  			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",  			(unsigned long long)logical, -			rcu_str_deref(sdev->dev->name)); +			rcu_str_deref(dev->name));  	}  out: @@ -966,11 +1166,11 @@ out:  						     mirror_index;  			int page_index; -			for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; -			     page_index++) -				if (sblock->pagev[page_index].page) -					__free_page( -						sblock->pagev[page_index].page); +			for (page_index = 0; page_index < sblock->page_count; +			     page_index++) { +				sblock->pagev[page_index]->sblock = NULL; +				scrub_page_put(sblock->pagev[page_index]); +			}  		}  		kfree(sblocks_for_recheck);  	} @@ -978,8 +1178,9 @@ out:  	return 0;  } -static int scrub_setup_recheck_block(struct scrub_dev *sdev, -				     struct btrfs_mapping_tree *map_tree, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, +				     struct btrfs_fs_info *fs_info, +				     struct scrub_block *original_sblock,  				     u64 length, u64 logical,  				     struct scrub_block *sblocks_for_recheck)  { @@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,  	int ret;  	/* -	 * note: the three members sdev, ref_count and outstanding_pages +	 * note: the two 
members ref_count and outstanding_pages  	 * are not used (and not set) in the blocks that are used for  	 * the recheck procedure  	 */ @@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,  		 * with a length of PAGE_SIZE, each returned stripe  		 * represents one mirror  		 */ -		ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, -				      &bbio, 0); +		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, +				      &mapped_length, &bbio, 0);  		if (ret || !bbio || mapped_length < sublen) {  			kfree(bbio);  			return -EIO;  		} -		BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); +		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);  		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;  		     mirror_index++) {  			struct scrub_block *sblock; @@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,  				continue;  			sblock = sblocks_for_recheck + mirror_index; -			page = sblock->pagev + page_index; +			sblock->sctx = sctx; +			page = kzalloc(sizeof(*page), GFP_NOFS); +			if (!page) { +leave_nomem: +				spin_lock(&sctx->stat_lock); +				sctx->stat.malloc_errors++; +				spin_unlock(&sctx->stat_lock); +				kfree(bbio); +				return -ENOMEM; +			} +			scrub_page_get(page); +			sblock->pagev[page_index] = page;  			page->logical = logical;  			page->physical = bbio->stripes[mirror_index].physical; +			BUG_ON(page_index >= original_sblock->page_count); +			page->physical_for_dev_replace = +				original_sblock->pagev[page_index]-> +				physical_for_dev_replace;  			/* for missing devices, dev->bdev is NULL */  			page->dev = bbio->stripes[mirror_index].dev;  			page->mirror_num = mirror_index + 1; -			page->page = alloc_page(GFP_NOFS); -			if (!page->page) { -				spin_lock(&sdev->stat_lock); -				sdev->stat.malloc_errors++; -				spin_unlock(&sdev->stat_lock); -				kfree(bbio); -				return -ENOMEM; -			}  			sblock->page_count++; +			page->page = alloc_page(GFP_NOFS); +			if (!page->page) +				goto leave_nomem;  		}  		kfree(bbio);  		length -= sublen; @@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,   * to take those pages that are not errored from all the mirrors so that   * the pages that are errored in the just handled mirror can be repaired.   
*/ -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, -			       struct scrub_block *sblock, int is_metadata, -			       int have_csum, u8 *csum, u64 generation, -			       u16 csum_size) +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, +				struct scrub_block *sblock, int is_metadata, +				int have_csum, u8 *csum, u64 generation, +				u16 csum_size)  {  	int page_num; @@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,  	for (page_num = 0; page_num < sblock->page_count; page_num++) {  		struct bio *bio; -		int ret; -		struct scrub_page *page = sblock->pagev + page_num; +		struct scrub_page *page = sblock->pagev[page_num];  		DECLARE_COMPLETION_ONSTACK(complete);  		if (page->dev->bdev == NULL) { @@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,  			continue;  		} -		BUG_ON(!page->page); +		WARN_ON(!page->page);  		bio = bio_alloc(GFP_NOFS, 1); -		if (!bio) -			return -EIO; +		if (!bio) { +			page->io_error = 1; +			sblock->no_io_error_seen = 0; +			continue; +		}  		bio->bi_bdev = page->dev->bdev;  		bio->bi_sector = page->physical >> 9;  		bio->bi_end_io = scrub_complete_bio_end_io;  		bio->bi_private = &complete; -		ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); -		if (PAGE_SIZE != ret) { -			bio_put(bio); -			return -EIO; -		} +		bio_add_page(bio, page->page, PAGE_SIZE, 0);  		btrfsic_submit_bio(READ, bio);  		/* this will also unplug the queue */ @@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,  					     have_csum, csum, generation,  					     csum_size); -	return 0; +	return;  }  static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, @@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,  	struct btrfs_root *root = fs_info->extent_root;  	void *mapped_buffer; -	BUG_ON(!sblock->pagev[0].page); +	WARN_ON(!sblock->pagev[0]->page);  	if (is_metadata) {  		struct btrfs_header *h; -		mapped_buffer = kmap_atomic(sblock->pagev[0].page); +		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);  		h = (struct btrfs_header *)mapped_buffer; -		if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || +		if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||  		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||  		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,  			   BTRFS_UUID_SIZE)) { @@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,  		if (!have_csum)  			return; -		mapped_buffer = kmap_atomic(sblock->pagev[0].page); +		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);  	}  	for (page_num = 0;;) { @@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,  		page_num++;  		if (page_num >= sblock->page_count)  			break; -		BUG_ON(!sblock->pagev[page_num].page); +		WARN_ON(!sblock->pagev[page_num]->page); -		mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); +		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);  	}  	btrfs_csum_final(crc, calculated_csum); @@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,  					    struct scrub_block *sblock_good,  					    int page_num, int force_write)  { -	struct scrub_page *page_bad = sblock_bad->pagev + page_num; -	struct scrub_page *page_good = sblock_good->pagev + page_num; +	struct scrub_page *page_bad = sblock_bad->pagev[page_num]; +	struct scrub_page *page_good = sblock_good->pagev[page_num]; -	
BUG_ON(sblock_bad->pagev[page_num].page == NULL); -	BUG_ON(sblock_good->pagev[page_num].page == NULL); +	BUG_ON(page_bad->page == NULL); +	BUG_ON(page_good->page == NULL);  	if (force_write || sblock_bad->header_error ||  	    sblock_bad->checksum_error || page_bad->io_error) {  		struct bio *bio;  		int ret;  		DECLARE_COMPLETION_ONSTACK(complete); +		if (!page_bad->dev->bdev) { +			printk_ratelimited(KERN_WARNING +				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); +			return -EIO; +		} +  		bio = bio_alloc(GFP_NOFS, 1);  		if (!bio)  			return -EIO; @@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,  		if (!bio_flagged(bio, BIO_UPTODATE)) {  			btrfs_dev_stat_inc_and_print(page_bad->dev,  				BTRFS_DEV_STAT_WRITE_ERRS); +			btrfs_dev_replace_stats_inc( +				&sblock_bad->sctx->dev_root->fs_info-> +				dev_replace.num_write_errors);  			bio_put(bio);  			return -EIO;  		} @@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,  	return 0;  } -static void scrub_checksum(struct scrub_block *sblock) +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) +{ +	int page_num; + +	for (page_num = 0; page_num < sblock->page_count; page_num++) { +		int ret; + +		ret = scrub_write_page_to_dev_replace(sblock, page_num); +		if (ret) +			btrfs_dev_replace_stats_inc( +				&sblock->sctx->dev_root->fs_info->dev_replace. +				num_write_errors); +	} +} + +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, +					   int page_num) +{ +	struct scrub_page *spage = sblock->pagev[page_num]; + +	BUG_ON(spage->page == NULL); +	if (spage->io_error) { +		void *mapped_buffer = kmap_atomic(spage->page); + +		memset(mapped_buffer, 0, PAGE_CACHE_SIZE); +		flush_dcache_page(spage->page); +		kunmap_atomic(mapped_buffer); +	} +	return scrub_add_page_to_wr_bio(sblock->sctx, spage); +} + +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, +				    struct scrub_page *spage) +{ +	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; +	struct scrub_bio *sbio; +	int ret; + +	mutex_lock(&wr_ctx->wr_lock); +again: +	if (!wr_ctx->wr_curr_bio) { +		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), +					      GFP_NOFS); +		if (!wr_ctx->wr_curr_bio) { +			mutex_unlock(&wr_ctx->wr_lock); +			return -ENOMEM; +		} +		wr_ctx->wr_curr_bio->sctx = sctx; +		wr_ctx->wr_curr_bio->page_count = 0; +	} +	sbio = wr_ctx->wr_curr_bio; +	if (sbio->page_count == 0) { +		struct bio *bio; + +		sbio->physical = spage->physical_for_dev_replace; +		sbio->logical = spage->logical; +		sbio->dev = wr_ctx->tgtdev; +		bio = sbio->bio; +		if (!bio) { +			bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); +			if (!bio) { +				mutex_unlock(&wr_ctx->wr_lock); +				return -ENOMEM; +			} +			sbio->bio = bio; +		} + +		bio->bi_private = sbio; +		bio->bi_end_io = scrub_wr_bio_end_io; +		bio->bi_bdev = sbio->dev->bdev; +		bio->bi_sector = sbio->physical >> 9; +		sbio->err = 0; +	} else if (sbio->physical + sbio->page_count * PAGE_SIZE != +		   spage->physical_for_dev_replace || +		   sbio->logical + sbio->page_count * PAGE_SIZE != +		   spage->logical) { +		scrub_wr_submit(sctx); +		goto again; +	} + +	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); +	if (ret != PAGE_SIZE) { +		if (sbio->page_count < 1) { +			bio_put(sbio->bio); +			sbio->bio = NULL; +			mutex_unlock(&wr_ctx->wr_lock); +			return -EIO; +		} +		scrub_wr_submit(sctx); +		goto again; +	} + +	sbio->pagev[sbio->page_count] = spage; 
+	scrub_page_get(spage); +	sbio->page_count++; +	if (sbio->page_count == wr_ctx->pages_per_wr_bio) +		scrub_wr_submit(sctx); +	mutex_unlock(&wr_ctx->wr_lock); + +	return 0; +} + +static void scrub_wr_submit(struct scrub_ctx *sctx) +{ +	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; +	struct scrub_bio *sbio; + +	if (!wr_ctx->wr_curr_bio) +		return; + +	sbio = wr_ctx->wr_curr_bio; +	wr_ctx->wr_curr_bio = NULL; +	WARN_ON(!sbio->bio->bi_bdev); +	scrub_pending_bio_inc(sctx); +	/* process all writes in a single worker thread. Then the block layer +	 * orders the requests before sending them to the driver which +	 * doubled the write performance on spinning disks when measured +	 * with Linux 3.5 */ +	btrfsic_submit_bio(WRITE, sbio->bio); +} + +static void scrub_wr_bio_end_io(struct bio *bio, int err) +{ +	struct scrub_bio *sbio = bio->bi_private; +	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; + +	sbio->err = err; +	sbio->bio = bio; + +	sbio->work.func = scrub_wr_bio_end_io_worker; +	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); +} + +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +{ +	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); +	struct scrub_ctx *sctx = sbio->sctx; +	int i; + +	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); +	if (sbio->err) { +		struct btrfs_dev_replace *dev_replace = +			&sbio->sctx->dev_root->fs_info->dev_replace; + +		for (i = 0; i < sbio->page_count; i++) { +			struct scrub_page *spage = sbio->pagev[i]; + +			spage->io_error = 1; +			btrfs_dev_replace_stats_inc(&dev_replace-> +						    num_write_errors); +		} +	} + +	for (i = 0; i < sbio->page_count; i++) +		scrub_page_put(sbio->pagev[i]); + +	bio_put(sbio->bio); +	kfree(sbio); +	scrub_pending_bio_dec(sctx); +} + +static int scrub_checksum(struct scrub_block *sblock)  {  	u64 flags;  	int ret; -	BUG_ON(sblock->page_count < 1); -	flags = sblock->pagev[0].flags; +	WARN_ON(sblock->page_count < 1); +	flags = sblock->pagev[0]->flags;  	ret = 0;  	if (flags & BTRFS_EXTENT_FLAG_DATA)  		ret = scrub_checksum_data(sblock); @@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)  		WARN_ON(1);  	if (ret)  		scrub_handle_errored_block(sblock); + +	return ret;  }  static int scrub_checksum_data(struct scrub_block *sblock)  { -	struct scrub_dev *sdev = sblock->sdev; +	struct scrub_ctx *sctx = sblock->sctx;  	u8 csum[BTRFS_CSUM_SIZE];  	u8 *on_disk_csum;  	struct page *page;  	void *buffer;  	u32 crc = ~(u32)0;  	int fail = 0; -	struct btrfs_root *root = sdev->dev->dev_root; +	struct btrfs_root *root = sctx->dev_root;  	u64 len;  	int index;  	BUG_ON(sblock->page_count < 1); -	if (!sblock->pagev[0].have_csum) +	if (!sblock->pagev[0]->have_csum)  		return 0; -	on_disk_csum = sblock->pagev[0].csum; -	page = sblock->pagev[0].page; +	on_disk_csum = sblock->pagev[0]->csum; +	page = sblock->pagev[0]->page;  	buffer = kmap_atomic(page); -	len = sdev->sectorsize; +	len = sctx->sectorsize;  	index = 0;  	for (;;) {  		u64 l = min_t(u64, len, PAGE_SIZE); @@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)  			break;  		index++;  		BUG_ON(index >= sblock->page_count); -		BUG_ON(!sblock->pagev[index].page); -		page = sblock->pagev[index].page; +		BUG_ON(!sblock->pagev[index]->page); +		page = sblock->pagev[index]->page;  		buffer = kmap_atomic(page);  	}  	btrfs_csum_final(crc, csum); -	if (memcmp(csum, on_disk_csum, sdev->csum_size)) +	if (memcmp(csum, on_disk_csum, sctx->csum_size))  		fail = 1;  	return 
fail; @@ -1304,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)  static int scrub_checksum_tree_block(struct scrub_block *sblock)  { -	struct scrub_dev *sdev = sblock->sdev; +	struct scrub_ctx *sctx = sblock->sctx;  	struct btrfs_header *h; -	struct btrfs_root *root = sdev->dev->dev_root; +	struct btrfs_root *root = sctx->dev_root;  	struct btrfs_fs_info *fs_info = root->fs_info;  	u8 calculated_csum[BTRFS_CSUM_SIZE];  	u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1321,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)  	int index;  	BUG_ON(sblock->page_count < 1); -	page = sblock->pagev[0].page; +	page = sblock->pagev[0]->page;  	mapped_buffer = kmap_atomic(page);  	h = (struct btrfs_header *)mapped_buffer; -	memcpy(on_disk_csum, h->csum, sdev->csum_size); +	memcpy(on_disk_csum, h->csum, sctx->csum_size);  	/*  	 * we don't use the getter functions here, as we @@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)  	 * b) the page is already kmapped  	 */ -	if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) +	if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))  		++fail; -	if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) +	if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))  		++fail;  	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)  		   BTRFS_UUID_SIZE))  		++fail; -	BUG_ON(sdev->nodesize != sdev->leafsize); -	len = sdev->nodesize - BTRFS_CSUM_SIZE; +	WARN_ON(sctx->nodesize != sctx->leafsize); +	len = sctx->nodesize - BTRFS_CSUM_SIZE;  	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;  	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;  	index = 0; @@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)  			break;  		index++;  		BUG_ON(index >= sblock->page_count); -		BUG_ON(!sblock->pagev[index].page); -		page = sblock->pagev[index].page; +		BUG_ON(!sblock->pagev[index]->page); +		page = sblock->pagev[index]->page;  		mapped_buffer = kmap_atomic(page);  		mapped_size = PAGE_SIZE;  		p = mapped_buffer;  	}  	btrfs_csum_final(crc, calculated_csum); -	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) +	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))  		++crc_fail;  	return fail || crc_fail; @@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)  static int scrub_checksum_super(struct scrub_block *sblock)  {  	struct btrfs_super_block *s; -	struct scrub_dev *sdev = sblock->sdev; -	struct btrfs_root *root = sdev->dev->dev_root; +	struct scrub_ctx *sctx = sblock->sctx; +	struct btrfs_root *root = sctx->dev_root;  	struct btrfs_fs_info *fs_info = root->fs_info;  	u8 calculated_csum[BTRFS_CSUM_SIZE];  	u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)  	int index;  	BUG_ON(sblock->page_count < 1); -	page = sblock->pagev[0].page; +	page = sblock->pagev[0]->page;  	mapped_buffer = kmap_atomic(page);  	s = (struct btrfs_super_block *)mapped_buffer; -	memcpy(on_disk_csum, s->csum, sdev->csum_size); +	memcpy(on_disk_csum, s->csum, sctx->csum_size); -	if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) +	if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))  		++fail_cor; -	if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) +	if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))  		++fail_gen;  	if (memcmp(s->fsid, 
fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)  			break;  		index++;  		BUG_ON(index >= sblock->page_count); -		BUG_ON(!sblock->pagev[index].page); -		page = sblock->pagev[index].page; +		BUG_ON(!sblock->pagev[index]->page); +		page = sblock->pagev[index]->page;  		mapped_buffer = kmap_atomic(page);  		mapped_size = PAGE_SIZE;  		p = mapped_buffer;  	}  	btrfs_csum_final(crc, calculated_csum); -	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) +	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))  		++fail_cor;  	if (fail_cor + fail_gen) { @@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)  		 * They will get written with the next transaction commit  		 * anyway  		 */ -		spin_lock(&sdev->stat_lock); -		++sdev->stat.super_errors; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		++sctx->stat.super_errors; +		spin_unlock(&sctx->stat_lock);  		if (fail_cor) -			btrfs_dev_stat_inc_and_print(sdev->dev, +			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,  				BTRFS_DEV_STAT_CORRUPTION_ERRS);  		else -			btrfs_dev_stat_inc_and_print(sdev->dev, +			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,  				BTRFS_DEV_STAT_GENERATION_ERRS);  	} @@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)  		int i;  		for (i = 0; i < sblock->page_count; i++) -			if (sblock->pagev[i].page) -				__free_page(sblock->pagev[i].page); +			scrub_page_put(sblock->pagev[i]);  		kfree(sblock);  	}  } -static void scrub_submit(struct scrub_dev *sdev) +static void scrub_page_get(struct scrub_page *spage) +{ +	atomic_inc(&spage->ref_count); +} + +static void scrub_page_put(struct scrub_page *spage) +{ +	if (atomic_dec_and_test(&spage->ref_count)) { +		if (spage->page) +			__free_page(spage->page); +		kfree(spage); +	} +} + +static void scrub_submit(struct scrub_ctx *sctx)  {  	struct scrub_bio *sbio; -	if (sdev->curr == -1) +	if (sctx->curr == -1)  		return; -	sbio = sdev->bios[sdev->curr]; -	sdev->curr = -1; -	atomic_inc(&sdev->in_flight); +	sbio = sctx->bios[sctx->curr]; +	sctx->curr = -1; +	scrub_pending_bio_inc(sctx); -	btrfsic_submit_bio(READ, sbio->bio); +	if (!sbio->bio->bi_bdev) { +		/* +		 * this case should not happen. If btrfs_map_block() is +		 * wrong, it could happen for dev-replace operations on +		 * missing devices when no mirrors are available, but in +		 * this case it should already fail the mount. +		 * This case is handled correctly (but _very_ slowly). 
+		 */ +		printk_ratelimited(KERN_WARNING +			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); +		bio_endio(sbio->bio, -EIO); +	} else { +		btrfsic_submit_bio(READ, sbio->bio); +	}  } -static int scrub_add_page_to_bio(struct scrub_dev *sdev, -				 struct scrub_page *spage) +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, +				    struct scrub_page *spage)  {  	struct scrub_block *sblock = spage->sblock;  	struct scrub_bio *sbio; @@ -1494,28 +1901,29 @@ again:  	/*  	 * grab a fresh bio or wait for one to become available  	 */ -	while (sdev->curr == -1) { -		spin_lock(&sdev->list_lock); -		sdev->curr = sdev->first_free; -		if (sdev->curr != -1) { -			sdev->first_free = sdev->bios[sdev->curr]->next_free; -			sdev->bios[sdev->curr]->next_free = -1; -			sdev->bios[sdev->curr]->page_count = 0; -			spin_unlock(&sdev->list_lock); +	while (sctx->curr == -1) { +		spin_lock(&sctx->list_lock); +		sctx->curr = sctx->first_free; +		if (sctx->curr != -1) { +			sctx->first_free = sctx->bios[sctx->curr]->next_free; +			sctx->bios[sctx->curr]->next_free = -1; +			sctx->bios[sctx->curr]->page_count = 0; +			spin_unlock(&sctx->list_lock);  		} else { -			spin_unlock(&sdev->list_lock); -			wait_event(sdev->list_wait, sdev->first_free != -1); +			spin_unlock(&sctx->list_lock); +			wait_event(sctx->list_wait, sctx->first_free != -1);  		}  	} -	sbio = sdev->bios[sdev->curr]; +	sbio = sctx->bios[sctx->curr];  	if (sbio->page_count == 0) {  		struct bio *bio;  		sbio->physical = spage->physical;  		sbio->logical = spage->logical; +		sbio->dev = spage->dev;  		bio = sbio->bio;  		if (!bio) { -			bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); +			bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);  			if (!bio)  				return -ENOMEM;  			sbio->bio = bio; @@ -1523,14 +1931,15 @@ again:  		bio->bi_private = sbio;  		bio->bi_end_io = scrub_bio_end_io; -		bio->bi_bdev = sdev->dev->bdev; -		bio->bi_sector = spage->physical >> 9; +		bio->bi_bdev = sbio->dev->bdev; +		bio->bi_sector = sbio->physical >> 9;  		sbio->err = 0;  	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=  		   spage->physical ||  		   sbio->logical + sbio->page_count * PAGE_SIZE != -		   spage->logical) { -		scrub_submit(sdev); +		   spage->logical || +		   sbio->dev != spage->dev) { +		scrub_submit(sctx);  		goto again;  	} @@ -1542,81 +1951,87 @@ again:  			sbio->bio = NULL;  			return -EIO;  		} -		scrub_submit(sdev); +		scrub_submit(sctx);  		goto again;  	} -	scrub_block_get(sblock); /* one for the added page */ +	scrub_block_get(sblock); /* one for the page added to the bio */  	atomic_inc(&sblock->outstanding_pages);  	sbio->page_count++; -	if (sbio->page_count == sdev->pages_per_bio) -		scrub_submit(sdev); +	if (sbio->page_count == sctx->pages_per_rd_bio) +		scrub_submit(sctx);  	return 0;  } -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, -		       u64 physical, u64 flags, u64 gen, int mirror_num, -		       u8 *csum, int force) +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +		       u64 physical, struct btrfs_device *dev, u64 flags, +		       u64 gen, int mirror_num, u8 *csum, int force, +		       u64 physical_for_dev_replace)  {  	struct scrub_block *sblock;  	int index;  	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);  	if (!sblock) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.malloc_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.malloc_errors++; +		spin_unlock(&sctx->stat_lock);  		return -ENOMEM;  	} -	/* one ref inside 
this function, plus one for each page later on */ +	/* one ref inside this function, plus one for each page added to +	 * a bio later on */  	atomic_set(&sblock->ref_count, 1); -	sblock->sdev = sdev; +	sblock->sctx = sctx;  	sblock->no_io_error_seen = 1;  	for (index = 0; len > 0; index++) { -		struct scrub_page *spage = sblock->pagev + index; +		struct scrub_page *spage;  		u64 l = min_t(u64, len, PAGE_SIZE); -		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); -		spage->page = alloc_page(GFP_NOFS); -		if (!spage->page) { -			spin_lock(&sdev->stat_lock); -			sdev->stat.malloc_errors++; -			spin_unlock(&sdev->stat_lock); -			while (index > 0) { -				index--; -				__free_page(sblock->pagev[index].page); -			} -			kfree(sblock); +		spage = kzalloc(sizeof(*spage), GFP_NOFS); +		if (!spage) { +leave_nomem: +			spin_lock(&sctx->stat_lock); +			sctx->stat.malloc_errors++; +			spin_unlock(&sctx->stat_lock); +			scrub_block_put(sblock);  			return -ENOMEM;  		} +		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); +		scrub_page_get(spage); +		sblock->pagev[index] = spage;  		spage->sblock = sblock; -		spage->dev = sdev->dev; +		spage->dev = dev;  		spage->flags = flags;  		spage->generation = gen;  		spage->logical = logical;  		spage->physical = physical; +		spage->physical_for_dev_replace = physical_for_dev_replace;  		spage->mirror_num = mirror_num;  		if (csum) {  			spage->have_csum = 1; -			memcpy(spage->csum, csum, sdev->csum_size); +			memcpy(spage->csum, csum, sctx->csum_size);  		} else {  			spage->have_csum = 0;  		}  		sblock->page_count++; +		spage->page = alloc_page(GFP_NOFS); +		if (!spage->page) +			goto leave_nomem;  		len -= l;  		logical += l;  		physical += l; +		physical_for_dev_replace += l;  	} -	BUG_ON(sblock->page_count == 0); +	WARN_ON(sblock->page_count == 0);  	for (index = 0; index < sblock->page_count; index++) { -		struct scrub_page *spage = sblock->pagev + index; +		struct scrub_page *spage = sblock->pagev[index];  		int ret; -		ret = scrub_add_page_to_bio(sdev, spage); +		ret = scrub_add_page_to_rd_bio(sctx, spage);  		if (ret) {  			scrub_block_put(sblock);  			return ret; @@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,  	}  	if (force) -		scrub_submit(sdev); +		scrub_submit(sctx);  	/* last one frees, either here or in bio completion for last page */  	scrub_block_put(sblock); @@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,  static void scrub_bio_end_io(struct bio *bio, int err)  {  	struct scrub_bio *sbio = bio->bi_private; -	struct scrub_dev *sdev = sbio->sdev; -	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; +	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;  	sbio->err = err;  	sbio->bio = bio; @@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)  static void scrub_bio_end_io_worker(struct btrfs_work *work)  {  	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); -	struct scrub_dev *sdev = sbio->sdev; +	struct scrub_ctx *sctx = sbio->sctx;  	int i; -	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); +	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);  	if (sbio->err) {  		for (i = 0; i < sbio->page_count; i++) {  			struct scrub_page *spage = sbio->pagev[i]; @@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)  	bio_put(sbio->bio);  	sbio->bio = NULL; -	spin_lock(&sdev->list_lock); -	sbio->next_free = sdev->first_free; -	sdev->first_free = sbio->index; -	spin_unlock(&sdev->list_lock); -	
atomic_dec(&sdev->in_flight); -	wake_up(&sdev->list_wait); +	spin_lock(&sctx->list_lock); +	sbio->next_free = sctx->first_free; +	sctx->first_free = sbio->index; +	spin_unlock(&sctx->list_lock); + +	if (sctx->is_dev_replace && +	    atomic_read(&sctx->wr_ctx.flush_all_writes)) { +		mutex_lock(&sctx->wr_ctx.wr_lock); +		scrub_wr_submit(sctx); +		mutex_unlock(&sctx->wr_ctx.wr_lock); +	} + +	scrub_pending_bio_dec(sctx);  }  static void scrub_block_complete(struct scrub_block *sblock)  { -	if (!sblock->no_io_error_seen) +	if (!sblock->no_io_error_seen) {  		scrub_handle_errored_block(sblock); -	else -		scrub_checksum(sblock); +	} else { +		/* +		 * if has checksum error, write via repair mechanism in +		 * dev replace case, otherwise write here in dev replace +		 * case. +		 */ +		if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) +			scrub_write_block_to_dev_replace(sblock); +	}  } -static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,  			   u8 *csum)  {  	struct btrfs_ordered_sum *sum = NULL; @@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,  	unsigned long i;  	unsigned long num_sectors; -	while (!list_empty(&sdev->csum_list)) { -		sum = list_first_entry(&sdev->csum_list, +	while (!list_empty(&sctx->csum_list)) { +		sum = list_first_entry(&sctx->csum_list,  				       struct btrfs_ordered_sum, list);  		if (sum->bytenr > logical)  			return 0;  		if (sum->bytenr + sum->len > logical)  			break; -		++sdev->stat.csum_discards; +		++sctx->stat.csum_discards;  		list_del(&sum->list);  		kfree(sum);  		sum = NULL; @@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,  	if (!sum)  		return 0; -	num_sectors = sum->len / sdev->sectorsize; +	num_sectors = sum->len / sctx->sectorsize;  	for (i = 0; i < num_sectors; ++i) {  		if (sum->sums[i].bytenr == logical) { -			memcpy(csum, &sum->sums[i].sum, sdev->csum_size); +			memcpy(csum, &sum->sums[i].sum, sctx->csum_size);  			ret = 1;  			break;  		} @@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,  }  /* scrub extent tries to collect up to 64 kB for each bio */ -static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, -			u64 physical, u64 flags, u64 gen, int mirror_num) +static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, +			u64 physical, struct btrfs_device *dev, u64 flags, +			u64 gen, int mirror_num, u64 physical_for_dev_replace)  {  	int ret;  	u8 csum[BTRFS_CSUM_SIZE];  	u32 blocksize;  	if (flags & BTRFS_EXTENT_FLAG_DATA) { -		blocksize = sdev->sectorsize; -		spin_lock(&sdev->stat_lock); -		sdev->stat.data_extents_scrubbed++; -		sdev->stat.data_bytes_scrubbed += len; -		spin_unlock(&sdev->stat_lock); +		blocksize = sctx->sectorsize; +		spin_lock(&sctx->stat_lock); +		sctx->stat.data_extents_scrubbed++; +		sctx->stat.data_bytes_scrubbed += len; +		spin_unlock(&sctx->stat_lock);  	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { -		BUG_ON(sdev->nodesize != sdev->leafsize); -		blocksize = sdev->nodesize; -		spin_lock(&sdev->stat_lock); -		sdev->stat.tree_extents_scrubbed++; -		sdev->stat.tree_bytes_scrubbed += len; -		spin_unlock(&sdev->stat_lock); +		WARN_ON(sctx->nodesize != sctx->leafsize); +		blocksize = sctx->nodesize; +		spin_lock(&sctx->stat_lock); +		sctx->stat.tree_extents_scrubbed++; +		sctx->stat.tree_bytes_scrubbed += len; +		spin_unlock(&sctx->stat_lock);  	} else { 
-		blocksize = sdev->sectorsize; -		BUG_ON(1); +		blocksize = sctx->sectorsize; +		WARN_ON(1);  	}  	while (len) { @@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,  		if (flags & BTRFS_EXTENT_FLAG_DATA) {  			/* push csums to sbio */ -			have_csum = scrub_find_csum(sdev, logical, l, csum); +			have_csum = scrub_find_csum(sctx, logical, l, csum);  			if (have_csum == 0) -				++sdev->stat.no_csum; +				++sctx->stat.no_csum; +			if (sctx->is_dev_replace && !have_csum) { +				ret = copy_nocow_pages(sctx, logical, l, +						       mirror_num, +						      physical_for_dev_replace); +				goto behind_scrub_pages; +			}  		} -		ret = scrub_pages(sdev, logical, l, physical, flags, gen, -				  mirror_num, have_csum ? csum : NULL, 0); +		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, +				  mirror_num, have_csum ? csum : NULL, 0, +				  physical_for_dev_replace); +behind_scrub_pages:  		if (ret)  			return ret;  		len -= l;  		logical += l;  		physical += l; +		physical_for_dev_replace += l;  	}  	return 0;  } -static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, -	struct map_lookup *map, int num, u64 base, u64 length) +static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, +					   struct map_lookup *map, +					   struct btrfs_device *scrub_dev, +					   int num, u64 base, u64 length, +					   int is_dev_replace)  {  	struct btrfs_path *path; -	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; +	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;  	struct btrfs_root *root = fs_info->extent_root;  	struct btrfs_root *csum_root = fs_info->csum_root;  	struct btrfs_extent_item *extent; @@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  	struct reada_control *reada2;  	struct btrfs_key key_start;  	struct btrfs_key key_end; -  	u64 increment = map->stripe_len;  	u64 offset; +	u64 extent_logical; +	u64 extent_physical; +	u64 extent_len; +	struct btrfs_device *extent_dev; +	int extent_mirror_num;  	nstripes = length;  	offset = 0; @@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  	 */  	logical = base + offset; -	wait_event(sdev->list_wait, -		   atomic_read(&sdev->in_flight) == 0); +	wait_event(sctx->list_wait, +		   atomic_read(&sctx->bios_in_flight) == 0);  	atomic_inc(&fs_info->scrubs_paused);  	wake_up(&fs_info->scrub_pause_wait); @@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  		 * canceled?  		 
*/  		if (atomic_read(&fs_info->scrub_cancel_req) || -		    atomic_read(&sdev->cancel_req)) { +		    atomic_read(&sctx->cancel_req)) {  			ret = -ECANCELED;  			goto out;  		} @@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  		 */  		if (atomic_read(&fs_info->scrub_pause_req)) {  			/* push queued extents */ -			scrub_submit(sdev); -			wait_event(sdev->list_wait, -				   atomic_read(&sdev->in_flight) == 0); +			atomic_set(&sctx->wr_ctx.flush_all_writes, 1); +			scrub_submit(sctx); +			mutex_lock(&sctx->wr_ctx.wr_lock); +			scrub_wr_submit(sctx); +			mutex_unlock(&sctx->wr_ctx.wr_lock); +			wait_event(sctx->list_wait, +				   atomic_read(&sctx->bios_in_flight) == 0); +			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);  			atomic_inc(&fs_info->scrubs_paused);  			wake_up(&fs_info->scrub_pause_wait);  			mutex_lock(&fs_info->scrub_lock); @@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  		ret = btrfs_lookup_csums_range(csum_root, logical,  					       logical + map->stripe_len - 1, -					       &sdev->csum_list, 1); +					       &sctx->csum_list, 1);  		if (ret)  			goto out; @@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  					     key.objectid;  			} -			ret = scrub_extent(sdev, key.objectid, key.offset, -					   key.objectid - logical + physical, -					   flags, generation, mirror_num); +			extent_logical = key.objectid; +			extent_physical = key.objectid - logical + physical; +			extent_len = key.offset; +			extent_dev = scrub_dev; +			extent_mirror_num = mirror_num; +			if (is_dev_replace) +				scrub_remap_extent(fs_info, extent_logical, +						   extent_len, &extent_physical, +						   &extent_dev, +						   &extent_mirror_num); +			ret = scrub_extent(sctx, extent_logical, extent_len, +					   extent_physical, extent_dev, flags, +					   generation, extent_mirror_num, +					   key.objectid - logical + physical);  			if (ret)  				goto out; @@ -2016,29 +2477,34 @@ next:  		btrfs_release_path(path);  		logical += increment;  		physical += map->stripe_len; -		spin_lock(&sdev->stat_lock); -		sdev->stat.last_physical = physical; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.last_physical = physical; +		spin_unlock(&sctx->stat_lock);  	} +out:  	/* push queued extents */ -	scrub_submit(sdev); +	scrub_submit(sctx); +	mutex_lock(&sctx->wr_ctx.wr_lock); +	scrub_wr_submit(sctx); +	mutex_unlock(&sctx->wr_ctx.wr_lock); -out:  	blk_finish_plug(&plug);  	btrfs_free_path(path);  	return ret < 0 ? 
ret : 0;  } -static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, -	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, -	u64 dev_offset) +static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, +					  struct btrfs_device *scrub_dev, +					  u64 chunk_tree, u64 chunk_objectid, +					  u64 chunk_offset, u64 length, +					  u64 dev_offset, int is_dev_replace)  {  	struct btrfs_mapping_tree *map_tree = -		&sdev->dev->dev_root->fs_info->mapping_tree; +		&sctx->dev_root->fs_info->mapping_tree;  	struct map_lookup *map;  	struct extent_map *em;  	int i; -	int ret = -EINVAL; +	int ret = 0;  	read_lock(&map_tree->map_tree.lock);  	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); @@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,  		goto out;  	for (i = 0; i < map->num_stripes; ++i) { -		if (map->stripes[i].dev == sdev->dev && +		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&  		    map->stripes[i].physical == dev_offset) { -			ret = scrub_stripe(sdev, map, i, chunk_offset, length); +			ret = scrub_stripe(sctx, map, scrub_dev, i, +					   chunk_offset, length, +					   is_dev_replace);  			if (ret)  				goto out;  		} @@ -2069,11 +2537,13 @@ out:  }  static noinline_for_stack -int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) +int scrub_enumerate_chunks(struct scrub_ctx *sctx, +			   struct btrfs_device *scrub_dev, u64 start, u64 end, +			   int is_dev_replace)  {  	struct btrfs_dev_extent *dev_extent = NULL;  	struct btrfs_path *path; -	struct btrfs_root *root = sdev->dev->dev_root; +	struct btrfs_root *root = sctx->dev_root;  	struct btrfs_fs_info *fs_info = root->fs_info;  	u64 length;  	u64 chunk_tree; @@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)  	struct btrfs_key key;  	struct btrfs_key found_key;  	struct btrfs_block_group_cache *cache; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;  	path = btrfs_alloc_path();  	if (!path) @@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)  	path->search_commit_root = 1;  	path->skip_locking = 1; -	key.objectid = sdev->dev->devid; +	key.objectid = scrub_dev->devid;  	key.offset = 0ull;  	key.type = BTRFS_DEV_EXTENT_KEY; -  	while (1) {  		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);  		if (ret < 0) @@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)  		btrfs_item_key_to_cpu(l, &found_key, slot); -		if (found_key.objectid != sdev->dev->devid) +		if (found_key.objectid != scrub_dev->devid)  			break;  		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) @@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)  			ret = -ENOENT;  			break;  		} -		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, -				  chunk_offset, length, found_key.offset); +		dev_replace->cursor_right = found_key.offset + length; +		dev_replace->cursor_left = found_key.offset; +		dev_replace->item_needs_writeback = 1; +		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, +				  chunk_offset, length, found_key.offset, +				  is_dev_replace); + +		/* +		 * flush, submit all pending read and write bios, afterwards +		 * wait for them. +		 * Note that in the dev replace case, a read request causes +		 * write requests that are submitted in the read completion +		 * worker. 
Therefore in the current situation, it is required +		 * that all write requests are flushed, so that all read and +		 * write requests are really completed when bios_in_flight +		 * changes to 0. +		 */ +		atomic_set(&sctx->wr_ctx.flush_all_writes, 1); +		scrub_submit(sctx); +		mutex_lock(&sctx->wr_ctx.wr_lock); +		scrub_wr_submit(sctx); +		mutex_unlock(&sctx->wr_ctx.wr_lock); + +		wait_event(sctx->list_wait, +			   atomic_read(&sctx->bios_in_flight) == 0); +		atomic_set(&sctx->wr_ctx.flush_all_writes, 0); +		atomic_inc(&fs_info->scrubs_paused); +		wake_up(&fs_info->scrub_pause_wait); +		wait_event(sctx->list_wait, +			   atomic_read(&sctx->workers_pending) == 0); + +		mutex_lock(&fs_info->scrub_lock); +		while (atomic_read(&fs_info->scrub_pause_req)) { +			mutex_unlock(&fs_info->scrub_lock); +			wait_event(fs_info->scrub_pause_wait, +			   atomic_read(&fs_info->scrub_pause_req) == 0); +			mutex_lock(&fs_info->scrub_lock); +		} +		atomic_dec(&fs_info->scrubs_paused); +		mutex_unlock(&fs_info->scrub_lock); +		wake_up(&fs_info->scrub_pause_wait); + +		dev_replace->cursor_left = dev_replace->cursor_right; +		dev_replace->item_needs_writeback = 1;  		btrfs_put_block_group(cache);  		if (ret)  			break; +		if (is_dev_replace && +		    atomic64_read(&dev_replace->num_write_errors) > 0) { +			ret = -EIO; +			break; +		} +		if (sctx->stat.malloc_errors > 0) { +			ret = -ENOMEM; +			break; +		}  		key.offset = found_key.offset + length;  		btrfs_release_path(path); @@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)  	return ret < 0 ? ret : 0;  } -static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) +static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, +					   struct btrfs_device *scrub_dev)  {  	int	i;  	u64	bytenr;  	u64	gen;  	int	ret; -	struct btrfs_device *device = sdev->dev; -	struct btrfs_root *root = device->dev_root; +	struct btrfs_root *root = sctx->dev_root;  	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)  		return -EIO; @@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)  	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {  		bytenr = btrfs_sb_offset(i); -		if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) +		if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)  			break; -		ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, -				     BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); +		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, +				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, +				  NULL, 1, bytenr);  		if (ret)  			return ret;  	} -	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); +	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);  	return 0;  } @@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)  /*   * get a reference count on fs_info->scrub_workers. 
start worker if necessary   */ -static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, +						int is_dev_replace)  { -	struct btrfs_fs_info *fs_info = root->fs_info;  	int ret = 0;  	mutex_lock(&fs_info->scrub_lock);  	if (fs_info->scrub_workers_refcnt == 0) { -		btrfs_init_workers(&fs_info->scrub_workers, "scrub", -			   fs_info->thread_pool_size, &fs_info->generic_worker); +		if (is_dev_replace) +			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, +					&fs_info->generic_worker); +		else +			btrfs_init_workers(&fs_info->scrub_workers, "scrub", +					fs_info->thread_pool_size, +					&fs_info->generic_worker);  		fs_info->scrub_workers.idle_thresh = 4;  		ret = btrfs_start_workers(&fs_info->scrub_workers);  		if (ret)  			goto out; +		btrfs_init_workers(&fs_info->scrub_wr_completion_workers, +				   "scrubwrc", +				   fs_info->thread_pool_size, +				   &fs_info->generic_worker); +		fs_info->scrub_wr_completion_workers.idle_thresh = 2; +		ret = btrfs_start_workers( +				&fs_info->scrub_wr_completion_workers); +		if (ret) +			goto out; +		btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, +				   &fs_info->generic_worker); +		ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); +		if (ret) +			goto out;  	}  	++fs_info->scrub_workers_refcnt;  out: @@ -2223,40 +2764,41 @@ out:  	return ret;  } -static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) +static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)  { -	struct btrfs_fs_info *fs_info = root->fs_info; -  	mutex_lock(&fs_info->scrub_lock); -	if (--fs_info->scrub_workers_refcnt == 0) +	if (--fs_info->scrub_workers_refcnt == 0) {  		btrfs_stop_workers(&fs_info->scrub_workers); +		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); +		btrfs_stop_workers(&fs_info->scrub_nocow_workers); +	}  	WARN_ON(fs_info->scrub_workers_refcnt < 0);  	mutex_unlock(&fs_info->scrub_lock);  } - -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, -		    struct btrfs_scrub_progress *progress, int readonly) +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, +		    u64 end, struct btrfs_scrub_progress *progress, +		    int readonly, int is_dev_replace)  { -	struct scrub_dev *sdev; -	struct btrfs_fs_info *fs_info = root->fs_info; +	struct scrub_ctx *sctx;  	int ret;  	struct btrfs_device *dev; -	if (btrfs_fs_closing(root->fs_info)) +	if (btrfs_fs_closing(fs_info))  		return -EINVAL;  	/*  	 * check some assumptions  	 */ -	if (root->nodesize != root->leafsize) { +	if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {  		printk(KERN_ERR  		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", -		       root->nodesize, root->leafsize); +		       fs_info->chunk_root->nodesize, +		       fs_info->chunk_root->leafsize);  		return -EINVAL;  	} -	if (root->nodesize > BTRFS_STRIPE_LEN) { +	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {  		/*  		 * in this case scrub is unable to calculate the checksum  		 * the way scrub is implemented. 
Do not handle this @@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,  		 */  		printk(KERN_ERR  		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", -		       root->nodesize, BTRFS_STRIPE_LEN); +		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);  		return -EINVAL;  	} -	if (root->sectorsize != PAGE_SIZE) { +	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {  		/* not supported for data w/o checksums */  		printk(KERN_ERR  		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", -		       root->sectorsize, (unsigned long long)PAGE_SIZE); +		       fs_info->chunk_root->sectorsize, +		       (unsigned long long)PAGE_SIZE);  		return -EINVAL;  	} -	ret = scrub_workers_get(root); +	if (fs_info->chunk_root->nodesize > +	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || +	    fs_info->chunk_root->sectorsize > +	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { +		/* +		 * would exhaust the array bounds of pagev member in +		 * struct scrub_block +		 */ +		pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", +		       fs_info->chunk_root->nodesize, +		       SCRUB_MAX_PAGES_PER_BLOCK, +		       fs_info->chunk_root->sectorsize, +		       SCRUB_MAX_PAGES_PER_BLOCK); +		return -EINVAL; +	} + +	ret = scrub_workers_get(fs_info, is_dev_replace);  	if (ret)  		return ret; -	mutex_lock(&root->fs_info->fs_devices->device_list_mutex); -	dev = btrfs_find_device(root, devid, NULL, NULL); -	if (!dev || dev->missing) { -		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -		scrub_workers_put(root); +	mutex_lock(&fs_info->fs_devices->device_list_mutex); +	dev = btrfs_find_device(fs_info, devid, NULL, NULL); +	if (!dev || (dev->missing && !is_dev_replace)) { +		mutex_unlock(&fs_info->fs_devices->device_list_mutex); +		scrub_workers_put(fs_info);  		return -ENODEV;  	}  	mutex_lock(&fs_info->scrub_lock); -	if (!dev->in_fs_metadata) { +	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {  		mutex_unlock(&fs_info->scrub_lock); -		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -		scrub_workers_put(root); -		return -ENODEV; +		mutex_unlock(&fs_info->fs_devices->device_list_mutex); +		scrub_workers_put(fs_info); +		return -EIO;  	} -	if (dev->scrub_device) { +	btrfs_dev_replace_lock(&fs_info->dev_replace); +	if (dev->scrub_device || +	    (!is_dev_replace && +	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { +		btrfs_dev_replace_unlock(&fs_info->dev_replace);  		mutex_unlock(&fs_info->scrub_lock); -		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -		scrub_workers_put(root); +		mutex_unlock(&fs_info->fs_devices->device_list_mutex); +		scrub_workers_put(fs_info);  		return -EINPROGRESS;  	} -	sdev = scrub_setup_dev(dev); -	if (IS_ERR(sdev)) { +	btrfs_dev_replace_unlock(&fs_info->dev_replace); +	sctx = scrub_setup_ctx(dev, is_dev_replace); +	if (IS_ERR(sctx)) {  		mutex_unlock(&fs_info->scrub_lock); -		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -		scrub_workers_put(root); -		return PTR_ERR(sdev); +		mutex_unlock(&fs_info->fs_devices->device_list_mutex); +		scrub_workers_put(fs_info); +		return PTR_ERR(sctx);  	} -	sdev->readonly = readonly; -	dev->scrub_device = sdev; +	sctx->readonly = readonly; +	dev->scrub_device = sctx;  	atomic_inc(&fs_info->scrubs_running);  	mutex_unlock(&fs_info->scrub_lock); -	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex); -	down_read(&fs_info->scrub_super_lock); -	ret = scrub_supers(sdev); -	up_read(&fs_info->scrub_super_lock); +	if (!is_dev_replace) { +		down_read(&fs_info->scrub_super_lock); +		ret = scrub_supers(sctx, dev); +		up_read(&fs_info->scrub_super_lock); +	}  	if (!ret) -		ret = scrub_enumerate_chunks(sdev, start, end); +		ret = scrub_enumerate_chunks(sctx, dev, start, end, +					     is_dev_replace); -	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); +	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);  	atomic_dec(&fs_info->scrubs_running);  	wake_up(&fs_info->scrub_pause_wait); -	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); +	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);  	if (progress) -		memcpy(progress, &sdev->stat, sizeof(*progress)); +		memcpy(progress, &sctx->stat, sizeof(*progress));  	mutex_lock(&fs_info->scrub_lock);  	dev->scrub_device = NULL;  	mutex_unlock(&fs_info->scrub_lock); -	scrub_free_dev(sdev); -	scrub_workers_put(root); +	scrub_free_ctx(sctx); +	scrub_workers_put(fs_info);  	return ret;  } @@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)  	up_write(&root->fs_info->scrub_super_lock);  } -int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) +int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)  { -  	mutex_lock(&fs_info->scrub_lock);  	if (!atomic_read(&fs_info->scrubs_running)) {  		mutex_unlock(&fs_info->scrub_lock); @@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)  	return 0;  } -int btrfs_scrub_cancel(struct btrfs_root *root) +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, +			   struct btrfs_device *dev)  { -	return __btrfs_scrub_cancel(root->fs_info); -} - -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) -{ -	struct btrfs_fs_info *fs_info = root->fs_info; -	struct scrub_dev *sdev; +	struct scrub_ctx *sctx;  	mutex_lock(&fs_info->scrub_lock); -	sdev = dev->scrub_device; -	if (!sdev) { +	sctx = dev->scrub_device; +	if (!sctx) {  		mutex_unlock(&fs_info->scrub_lock);  		return -ENOTCONN;  	} -	atomic_inc(&sdev->cancel_req); +	atomic_inc(&sctx->cancel_req);  	while (dev->scrub_device) {  		mutex_unlock(&fs_info->scrub_lock);  		wait_event(fs_info->scrub_pause_wait, @@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)  	 * does not go away in cancel_dev. 
FIXME: find a better solution  	 */  	mutex_lock(&fs_info->fs_devices->device_list_mutex); -	dev = btrfs_find_device(root, devid, NULL, NULL); +	dev = btrfs_find_device(fs_info, devid, NULL, NULL);  	if (!dev) {  		mutex_unlock(&fs_info->fs_devices->device_list_mutex);  		return -ENODEV;  	} -	ret = btrfs_scrub_cancel_dev(root, dev); +	ret = btrfs_scrub_cancel_dev(fs_info, dev);  	mutex_unlock(&fs_info->fs_devices->device_list_mutex);  	return ret; @@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,  			 struct btrfs_scrub_progress *progress)  {  	struct btrfs_device *dev; -	struct scrub_dev *sdev = NULL; +	struct scrub_ctx *sctx = NULL;  	mutex_lock(&root->fs_info->fs_devices->device_list_mutex); -	dev = btrfs_find_device(root, devid, NULL, NULL); +	dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);  	if (dev) -		sdev = dev->scrub_device; -	if (sdev) -		memcpy(progress, &sdev->stat, sizeof(*progress)); +		sctx = dev->scrub_device; +	if (sctx) +		memcpy(progress, &sctx->stat, sizeof(*progress));  	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; +	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; +} + +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, +			       u64 extent_logical, u64 extent_len, +			       u64 *extent_physical, +			       struct btrfs_device **extent_dev, +			       int *extent_mirror_num) +{ +	u64 mapped_length; +	struct btrfs_bio *bbio = NULL; +	int ret; + +	mapped_length = extent_len; +	ret = btrfs_map_block(fs_info, READ, extent_logical, +			      &mapped_length, &bbio, 0); +	if (ret || !bbio || mapped_length < extent_len || +	    !bbio->stripes[0].dev->bdev) { +		kfree(bbio); +		return; +	} + +	*extent_physical = bbio->stripes[0].physical; +	*extent_mirror_num = bbio->mirror_num; +	*extent_dev = bbio->stripes[0].dev; +	kfree(bbio); +} + +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, +			      struct scrub_wr_ctx *wr_ctx, +			      struct btrfs_fs_info *fs_info, +			      struct btrfs_device *dev, +			      int is_dev_replace) +{ +	WARN_ON(wr_ctx->wr_curr_bio != NULL); + +	mutex_init(&wr_ctx->wr_lock); +	wr_ctx->wr_curr_bio = NULL; +	if (!is_dev_replace) +		return 0; + +	WARN_ON(!dev->bdev); +	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, +					 bio_get_nr_vecs(dev->bdev)); +	wr_ctx->tgtdev = dev; +	atomic_set(&wr_ctx->flush_all_writes, 0); +	return 0; +} + +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) +{ +	mutex_lock(&wr_ctx->wr_lock); +	kfree(wr_ctx->wr_curr_bio); +	wr_ctx->wr_curr_bio = NULL; +	mutex_unlock(&wr_ctx->wr_lock); +} + +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +			    int mirror_num, u64 physical_for_dev_replace) +{ +	struct scrub_copy_nocow_ctx *nocow_ctx; +	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + +	nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); +	if (!nocow_ctx) { +		spin_lock(&sctx->stat_lock); +		sctx->stat.malloc_errors++; +		spin_unlock(&sctx->stat_lock); +		return -ENOMEM; +	} + +	scrub_pending_trans_workers_inc(sctx); + +	nocow_ctx->sctx = sctx; +	nocow_ctx->logical = logical; +	nocow_ctx->len = len; +	nocow_ctx->mirror_num = mirror_num; +	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; +	nocow_ctx->work.func = copy_nocow_pages_worker; +	btrfs_queue_worker(&fs_info->scrub_nocow_workers, +			   &nocow_ctx->work); + +	return 0; +} + +static void copy_nocow_pages_worker(struct btrfs_work *work) +{ +	struct scrub_copy_nocow_ctx 
*nocow_ctx = +		container_of(work, struct scrub_copy_nocow_ctx, work); +	struct scrub_ctx *sctx = nocow_ctx->sctx; +	u64 logical = nocow_ctx->logical; +	u64 len = nocow_ctx->len; +	int mirror_num = nocow_ctx->mirror_num; +	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; +	int ret; +	struct btrfs_trans_handle *trans = NULL; +	struct btrfs_fs_info *fs_info; +	struct btrfs_path *path; +	struct btrfs_root *root; +	int not_written = 0; + +	fs_info = sctx->dev_root->fs_info; +	root = fs_info->extent_root; + +	path = btrfs_alloc_path(); +	if (!path) { +		spin_lock(&sctx->stat_lock); +		sctx->stat.malloc_errors++; +		spin_unlock(&sctx->stat_lock); +		not_written = 1; +		goto out; +	} + +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) { +		not_written = 1; +		goto out; +	} + +	ret = iterate_inodes_from_logical(logical, fs_info, path, +					  copy_nocow_pages_for_inode, +					  nocow_ctx); +	if (ret != 0 && ret != -ENOENT) { +		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", +			(unsigned long long)logical, +			(unsigned long long)physical_for_dev_replace, +			(unsigned long long)len, +			(unsigned long long)mirror_num, ret); +		not_written = 1; +		goto out; +	} + +out: +	if (trans && !IS_ERR(trans)) +		btrfs_end_transaction(trans, root); +	if (not_written) +		btrfs_dev_replace_stats_inc(&fs_info->dev_replace. +					    num_uncorrectable_read_errors); + +	btrfs_free_path(path); +	kfree(nocow_ctx); + +	scrub_pending_trans_workers_dec(sctx); +} + +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ +	unsigned long index; +	struct scrub_copy_nocow_ctx *nocow_ctx = ctx; +	int ret = 0; +	struct btrfs_key key; +	struct inode *inode = NULL; +	struct btrfs_root *local_root; +	u64 physical_for_dev_replace; +	u64 len; +	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + +	key.objectid = root; +	key.type = BTRFS_ROOT_ITEM_KEY; +	key.offset = (u64)-1; +	local_root = btrfs_read_fs_root_no_name(fs_info, &key); +	if (IS_ERR(local_root)) +		return PTR_ERR(local_root); + +	key.type = BTRFS_INODE_ITEM_KEY; +	key.objectid = inum; +	key.offset = 0; +	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); +	if (IS_ERR(inode)) +		return PTR_ERR(inode); + +	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; +	len = nocow_ctx->len; +	while (len >= PAGE_CACHE_SIZE) { +		struct page *page = NULL; +		int ret_sub; + +		index = offset >> PAGE_CACHE_SHIFT; + +		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); +		if (!page) { +			pr_err("find_or_create_page() failed\n"); +			ret = -ENOMEM; +			goto next_page; +		} + +		if (PageUptodate(page)) { +			if (PageDirty(page)) +				goto next_page; +		} else { +			ClearPageError(page); +			ret_sub = extent_read_full_page(&BTRFS_I(inode)-> +							 io_tree, +							page, btrfs_get_extent, +							nocow_ctx->mirror_num); +			if (ret_sub) { +				ret = ret_sub; +				goto next_page; +			} +			wait_on_page_locked(page); +			if (!PageUptodate(page)) { +				ret = -EIO; +				goto next_page; +			} +		} +		ret_sub = write_page_nocow(nocow_ctx->sctx, +					   physical_for_dev_replace, page); +		if (ret_sub) { +			ret = ret_sub; +			goto next_page; +		} + +next_page: +		if (page) { +			unlock_page(page); +			put_page(page); +		} +		offset += PAGE_CACHE_SIZE; +		physical_for_dev_replace += PAGE_CACHE_SIZE; +		len -= PAGE_CACHE_SIZE; +	} + +	if (inode) +		iput(inode); +	return ret; +} + +static int write_page_nocow(struct scrub_ctx *sctx, +			 
   u64 physical_for_dev_replace, struct page *page) +{ +	struct bio *bio; +	struct btrfs_device *dev; +	int ret; +	DECLARE_COMPLETION_ONSTACK(compl); + +	dev = sctx->wr_ctx.tgtdev; +	if (!dev) +		return -EIO; +	if (!dev->bdev) { +		printk_ratelimited(KERN_WARNING +			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); +		return -EIO; +	} +	bio = bio_alloc(GFP_NOFS, 1); +	if (!bio) { +		spin_lock(&sctx->stat_lock); +		sctx->stat.malloc_errors++; +		spin_unlock(&sctx->stat_lock); +		return -ENOMEM; +	} +	bio->bi_private = &compl; +	bio->bi_end_io = scrub_complete_bio_end_io; +	bio->bi_size = 0; +	bio->bi_sector = physical_for_dev_replace >> 9; +	bio->bi_bdev = dev->bdev; +	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); +	if (ret != PAGE_CACHE_SIZE) { +leave_with_eio: +		bio_put(bio); +		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); +		return -EIO; +	} +	btrfsic_submit_bio(WRITE_SYNC, bio); +	wait_for_completion(&compl); + +	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) +		goto leave_with_eio; + +	bio_put(bio); +	return 0;  }
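A note for readers following the new read path in the hunks above: the batching rule in scrub_add_page_to_rd_bio() is easy to lose in the goto-again loop. A page is appended to the currently open bio only while the bio is not full and the page is physically and logically contiguous with the pages already queued and, new in this patch, belongs to the same device (sbio->dev != spage->dev now forces a submit, which matters for dev-replace); otherwise the bio is submitted and the page is retried against a fresh one. Below is a minimal userspace C sketch of just that decision, under assumed names (model_*, MODEL_PAGE_SIZE, MODEL_PAGES_PER_BIO). It is an illustration, not the kernel implementation, and it leaves out bio allocation, block reference counting and all error handling.

/*
 * Simplified, userspace-only model of the rd-bio batching check in
 * scrub_add_page_to_rd_bio().  All names and sizes are illustrative
 * assumptions, not the kernel's definitions.
 */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE		4096ULL
#define MODEL_PAGES_PER_BIO	32

struct model_page {
	unsigned long long logical;
	unsigned long long physical;
	int dev_id;
};

struct model_bio {
	unsigned long long logical;	/* of the first queued page */
	unsigned long long physical;	/* of the first queued page */
	int dev_id;
	int page_count;
};

/* true if the page may be appended to the currently open bio */
static bool page_fits_bio(const struct model_bio *bio,
			  const struct model_page *page)
{
	if (bio->page_count == 0)
		return true;			/* an empty bio accepts anything */
	if (bio->page_count >= MODEL_PAGES_PER_BIO)
		return false;			/* full: submit, start a new one */
	return bio->dev_id == page->dev_id &&
	       bio->physical + bio->page_count * MODEL_PAGE_SIZE == page->physical &&
	       bio->logical + bio->page_count * MODEL_PAGE_SIZE == page->logical;
}

int main(void)
{
	struct model_bio bio = { 0 };
	struct model_page pages[] = {
		{ 0x10000, 0x50000, 1 },
		{ 0x11000, 0x51000, 1 },	/* contiguous: appended */
		{ 0x20000, 0x80000, 1 },	/* gap: forces a submit */
	};

	for (int i = 0; i < 3; i++) {
		if (!page_fits_bio(&bio, &pages[i])) {
			printf("submit bio with %d page(s)\n", bio.page_count);
			bio.page_count = 0;
		}
		if (bio.page_count == 0) {
			bio.logical = pages[i].logical;
			bio.physical = pages[i].physical;
			bio.dev_id = pages[i].dev_id;
		}
		bio.page_count++;
	}
	printf("submit final bio with %d page(s)\n", bio.page_count);
	return 0;
}

Run on these assumed inputs, the sketch prints "submit bio with 2 page(s)" followed by "submit final bio with 1 page(s)": the first two pages merge because both the physical and the logical addresses advance by exactly one page, while the third starts a new bio because of the gap. In the patch itself the same check additionally splits bios whenever the device changes mid-extent, so that source and target devices never share a bio during dev-replace.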