Diffstat (limited to 'fs/btrfs/scrub.c')
 -rw-r--r--	fs/btrfs/scrub.c | 668
 1 file changed, 419 insertions(+), 249 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3afe5fa50a63..f260c53829e5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -54,6 +54,8 @@ struct scrub_ctx;
  */
 #define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
 
+#define SCRUB_MAX_PAGES			(DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
+
 struct scrub_recover {
 	refcount_t		refs;
 	struct btrfs_io_context	*bioc;
@@ -62,16 +64,12 @@ struct scrub_recover {
 
 struct scrub_sector {
 	struct scrub_block	*sblock;
-	struct page		*page;
-	struct btrfs_device	*dev;
 	struct list_head	list;
 	u64			flags;  /* extent flags */
 	u64			generation;
-	u64			logical;
-	u64			physical;
-	u64			physical_for_dev_replace;
+	/* Offset in bytes to @sblock. */
+	u32			offset;
 	atomic_t		refs;
-	u8			mirror_num;
 	unsigned int		have_csum:1;
 	unsigned int		io_error:1;
 	u8			csum[BTRFS_CSUM_SIZE];
@@ -94,8 +92,22 @@ struct scrub_bio {
 };
 
 struct scrub_block {
+	/*
+	 * Each page will have its page::private used to record the logical
+	 * bytenr.
+	 */
+	struct page		*pages[SCRUB_MAX_PAGES];
 	struct scrub_sector	*sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
+	struct btrfs_device	*dev;
+	/* Logical bytenr of the sblock */
+	u64			logical;
+	u64			physical;
+	u64			physical_for_dev_replace;
+	/* Length of sblock in bytes */
+	u32			len;
 	int			sector_count;
+	int			mirror_num;
+
 	atomic_t		outstanding_sectors;
 	refcount_t		refs; /* free mem on transition to zero */
 	struct scrub_ctx	*sctx;
@@ -202,8 +214,174 @@ struct full_stripe_lock {
 	struct mutex mutex;
 };
 
+#ifndef CONFIG_64BIT
+/* This structure is for architectures whose (void *) is smaller than u64 */
+struct scrub_page_private {
+	u64 logical;
+};
+#endif
+
+static int attach_scrub_page_private(struct page *page, u64 logical)
+{
+#ifdef CONFIG_64BIT
+	attach_page_private(page, (void *)logical);
+	return 0;
+#else
+	struct scrub_page_private *spp;
+
+	spp = kmalloc(sizeof(*spp), GFP_KERNEL);
+	if (!spp)
+		return -ENOMEM;
+	spp->logical = logical;
+	attach_page_private(page, (void *)spp);
+	return 0;
+#endif
+}
+
+static void detach_scrub_page_private(struct page *page)
+{
+#ifdef CONFIG_64BIT
+	detach_page_private(page);
+	return;
+#else
+	struct scrub_page_private *spp;
+
+	spp = detach_page_private(page);
+	kfree(spp);
+	return;
+#endif
+}
+
+static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
+					     struct btrfs_device *dev,
+					     u64 logical, u64 physical,
+					     u64 physical_for_dev_replace,
+					     int mirror_num)
+{
+	struct scrub_block *sblock;
+
+	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+	if (!sblock)
+		return NULL;
+	refcount_set(&sblock->refs, 1);
+	sblock->sctx = sctx;
+	sblock->logical = logical;
+	sblock->physical = physical;
+	sblock->physical_for_dev_replace = physical_for_dev_replace;
+	sblock->dev = dev;
+	sblock->mirror_num = mirror_num;
+	sblock->no_io_error_seen = 1;
+	/*
+	 * Scrub_block::pages will be allocated at alloc_scrub_sector() when
+	 * the corresponding page is not allocated.
+	 */
+	return sblock;
+}
+
+/*
+ * Allocate a new scrub sector and attach it to @sblock.
+ *
+ * Will also allocate new pages for @sblock if needed.
+ */
+static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
+					       u64 logical, gfp_t gfp)
+{
+	const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
+	struct scrub_sector *ssector;
+
+	/* We must never have scrub_block exceed U32_MAX in size. */
+	ASSERT(logical - sblock->logical < U32_MAX);
+
+	ssector = kzalloc(sizeof(*ssector), gfp);
+	if (!ssector)
+		return NULL;
+
+	/* Allocate a new page if the slot is not allocated */
+	if (!sblock->pages[page_index]) {
+		int ret;
+
+		sblock->pages[page_index] = alloc_page(gfp);
+		if (!sblock->pages[page_index]) {
+			kfree(ssector);
+			return NULL;
+		}
+		ret = attach_scrub_page_private(sblock->pages[page_index],
+				sblock->logical + (page_index << PAGE_SHIFT));
+		if (ret < 0) {
+			kfree(ssector);
+			__free_page(sblock->pages[page_index]);
+			sblock->pages[page_index] = NULL;
+			return NULL;
+		}
+	}
+
+	atomic_set(&ssector->refs, 1);
+	ssector->sblock = sblock;
+	/* The sector to be added should not be used */
+	ASSERT(sblock->sectors[sblock->sector_count] == NULL);
+	ssector->offset = logical - sblock->logical;
+
+	/* The sector count must be smaller than the limit */
+	ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
+
+	sblock->sectors[sblock->sector_count] = ssector;
+	sblock->sector_count++;
+	sblock->len += sblock->sctx->fs_info->sectorsize;
+
+	return ssector;
+}
+
+static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
+{
+	struct scrub_block *sblock = ssector->sblock;
+	pgoff_t index;
+	/*
+	 * When calling this function, ssector must be already attached to the
+	 * parent sblock.
+	 */
+	ASSERT(sblock);
+
+	/* The range should be inside the sblock range */
+	ASSERT(ssector->offset < sblock->len);
+
+	index = ssector->offset >> PAGE_SHIFT;
+	ASSERT(index < SCRUB_MAX_PAGES);
+	ASSERT(sblock->pages[index]);
+	ASSERT(PagePrivate(sblock->pages[index]));
+	return sblock->pages[index];
+}
+
+static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
+{
+	struct scrub_block *sblock = ssector->sblock;
+
+	/*
+	 * When calling this function, ssector must be already attached to the
+	 * parent sblock.
+	 */
+	ASSERT(sblock);
+
+	/* The range should be inside the sblock range */
+	ASSERT(ssector->offset < sblock->len);
+
+	return offset_in_page(ssector->offset);
+}
+
+static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
+{
+	return page_address(scrub_sector_get_page(ssector)) +
+	       scrub_sector_get_page_offset(ssector);
+}
+
+static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
+				unsigned int len)
+{
+	return bio_add_page(bio, scrub_sector_get_page(ssector), len,
+			    scrub_sector_get_page_offset(ssector));
+}
+
 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
-				     struct scrub_block *sblocks_for_recheck);
+				     struct scrub_block *sblocks_for_recheck[]);
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 				struct scrub_block *sblock,
 				int retry_failed_mirror);
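The CONFIG_64BIT split above exists because page::private is only pointer-sized: a u64 logical bytenr fits directly into the pointer on 64-bit, but must be boxed on the heap on 32-bit. A standalone userspace sketch of the same technique, with a hypothetical mock_page type standing in for struct page:

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct mock_page { void *private; };	/* stands in for struct page */

#if UINTPTR_MAX == UINT64_MAX		/* the CONFIG_64BIT case */
static int attach_logical(struct mock_page *p, uint64_t logical)
{
	/* the u64 fits in the pointer, no allocation needed */
	p->private = (void *)(uintptr_t)logical;
	return 0;
}
static uint64_t detach_logical(struct mock_page *p)
{
	uint64_t logical = (uintptr_t)p->private;
	p->private = NULL;
	return logical;
}
#else					/* 32-bit: box the u64 on the heap */
struct boxed { uint64_t logical; };
static int attach_logical(struct mock_page *p, uint64_t logical)
{
	struct boxed *b = malloc(sizeof(*b));
	if (!b)
		return -1;
	b->logical = logical;
	p->private = b;
	return 0;
}
static uint64_t detach_logical(struct mock_page *p)
{
	struct boxed *b = p->private;
	uint64_t logical = b->logical;
	free(b);
	p->private = NULL;
	return logical;
}
#endif

int main(void)
{
	struct mock_page page = { 0 };

	attach_logical(&page, 13631488ULL);	/* arbitrary bytenr */
	printf("logical = %llu\n", (unsigned long long)detach_logical(&page));
	return 0;
}
```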
@@ -533,10 +711,8 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 	if (sctx->curr != -1) {
 		struct scrub_bio *sbio = sctx->bios[sctx->curr];
 
-		for (i = 0; i < sbio->sector_count; i++) {
-			WARN_ON(!sbio->sectors[i]->page);
+		for (i = 0; i < sbio->sector_count; i++)
 			scrub_block_put(sbio->sectors[i]->sblock);
-		}
 		bio_put(sbio->bio);
 	}
 
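Since a sector no longer owns a page, its memory is located purely from its byte offset inside the block: page index is offset >> PAGE_SHIFT, in-page offset is offset & (PAGE_SIZE - 1). A minimal model of the arithmetic the helpers above rely on (values are illustrative only):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	uint64_t sblock_logical = 1048576;		/* block start, arbitrary */
	uint64_t sector_logical = 1048576 + 20480;	/* 20KiB into the block */
	uint32_t offset = (uint32_t)(sector_logical - sblock_logical);

	size_t page_index = offset >> PAGE_SHIFT;	/* which pages[] slot */
	size_t page_off   = offset & (PAGE_SIZE - 1);	/* offset inside it */

	printf("page %zu, offset-in-page %zu\n", page_index, page_off);
	assert(page_index == 5 && page_off == 0);
	return 0;
}
```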
@@ -726,15 +902,22 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	int ret;
 
 	WARN_ON(sblock->sector_count < 1);
-	dev = sblock->sectors[0]->dev;
+	dev = sblock->dev;
 	fs_info = sblock->sctx->fs_info;
 
+	/* Super block error, no need to search extent tree. */
+	if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+		btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+			errstr, rcu_str_deref(dev->name),
+			sblock->physical);
+		return;
+	}
 	path = btrfs_alloc_path();
 	if (!path)
 		return;
 
-	swarn.physical = sblock->sectors[0]->physical;
-	swarn.logical = sblock->sectors[0]->logical;
+	swarn.physical = sblock->physical;
+	swarn.logical = sblock->logical;
 	swarn.errstr = errstr;
 	swarn.dev = NULL;
@@ -804,13 +987,14 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 {
 	struct scrub_ctx *sctx = sblock_to_check->sctx;
-	struct btrfs_device *dev;
+	struct btrfs_device *dev = sblock_to_check->dev;
 	struct btrfs_fs_info *fs_info;
 	u64 logical;
 	unsigned int failed_mirror_index;
 	unsigned int is_metadata;
 	unsigned int have_csum;
-	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+	/* One scrub_block for each mirror */
+	struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
 	struct scrub_block *sblock_bad;
 	int ret;
 	int mirror_index;
@@ -825,22 +1009,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	fs_info = sctx->fs_info;
 	if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
 		/*
-		 * if we find an error in a super block, we just report it.
+		 * If we find an error in a super block, we just report it.
 		 * They will get written with the next transaction commit
 		 * anyway
 		 */
+		scrub_print_warning("super block error", sblock_to_check);
 		spin_lock(&sctx->stat_lock);
 		++sctx->stat.super_errors;
 		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
 		return 0;
 	}
 
-	logical = sblock_to_check->sectors[0]->logical;
-	BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
-	failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
+	logical = sblock_to_check->logical;
+	ASSERT(sblock_to_check->mirror_num);
+	failed_mirror_index = sblock_to_check->mirror_num - 1;
 	is_metadata = !(sblock_to_check->sectors[0]->flags &
 			BTRFS_EXTENT_FLAG_DATA);
 	have_csum = sblock_to_check->sectors[0]->have_csum;
-	dev = sblock_to_check->sectors[0]->dev;
 
 	if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
 		return 0;
@@ -902,17 +1087,28 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	 * repaired area is verified in order to correctly maintain
 	 * the statistics.
 	 */
-
-	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
-				      sizeof(*sblocks_for_recheck), GFP_KERNEL);
-	if (!sblocks_for_recheck) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.malloc_errors++;
-		sctx->stat.read_errors++;
-		sctx->stat.uncorrectable_errors++;
-		spin_unlock(&sctx->stat_lock);
-		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
-		goto out;
+	for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+		/*
+		 * Note: the two members refs and outstanding_sectors are not
+		 * used in the blocks that are used for the recheck procedure.
+		 *
+		 * But alloc_scrub_block() will initialize sblock::ref anyway,
+		 * so we can use scrub_block_put() to clean them up.
+		 *
+		 * And here we don't setup the physical/dev for the sblock yet,
+		 * they will be correctly initialized in scrub_setup_recheck_block().
+		 */
+		sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
+							logical, 0, 0, mirror_index);
+		if (!sblocks_for_recheck[mirror_index]) {
+			spin_lock(&sctx->stat_lock);
+			sctx->stat.malloc_errors++;
+			sctx->stat.read_errors++;
+			sctx->stat.uncorrectable_errors++;
+			spin_unlock(&sctx->stat_lock);
+			btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+			goto out;
+		}
 	}
 
 	/* Setup the context, map the logical blocks and alloc the sectors */
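The kcalloc'd contiguous array becomes a NULL-initialized array of pointers, allocated one entry per mirror and unwound on any failure. A plain-C sketch of that pattern, with hypothetical types and malloc/free standing in for alloc_scrub_block()/scrub_block_put():

```c
#include <stdio.h>
#include <stdlib.h>

#define MAX_MIRRORS 3

struct block { int mirror_num; };

static struct block *alloc_block(int mirror_num)
{
	struct block *b = calloc(1, sizeof(*b));

	if (b)
		b->mirror_num = mirror_num;
	return b;
}

int main(void)
{
	struct block *blocks[MAX_MIRRORS] = { 0 };
	int i;

	for (i = 0; i < MAX_MIRRORS; i++) {
		blocks[i] = alloc_block(i);
		if (!blocks[i])
			goto out;	/* partial cleanup below still works */
	}
	printf("allocated %d per-mirror blocks\n", MAX_MIRRORS);
out:
	/* NULL entries are simply skipped, as in the scrub cleanup loop */
	for (i = 0; i < MAX_MIRRORS; i++)
		free(blocks[i]);	/* free(NULL) is a no-op */
	return 0;
}
```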
@@ -926,7 +1122,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		goto out;
 	}
 	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
-	sblock_bad = sblocks_for_recheck + failed_mirror_index;
+	sblock_bad = sblocks_for_recheck[failed_mirror_index];
 
 	/* build and submit the bios for the failed mirror, check checksums */
 	scrub_recheck_block(fs_info, sblock_bad, 1);
@@ -1011,22 +1207,22 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
 			if (mirror_index >= BTRFS_MAX_MIRRORS)
 				break;
-			if (!sblocks_for_recheck[mirror_index].sector_count)
+			if (!sblocks_for_recheck[mirror_index]->sector_count)
 				break;
 
-			sblock_other = sblocks_for_recheck + mirror_index;
+			sblock_other = sblocks_for_recheck[mirror_index];
 		} else {
 			struct scrub_recover *r = sblock_bad->sectors[0]->recover;
 			int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
 
 			if (mirror_index >= max_allowed)
 				break;
-			if (!sblocks_for_recheck[1].sector_count)
+			if (!sblocks_for_recheck[1]->sector_count)
 				break;
 
 			ASSERT(failed_mirror_index == 0);
-			sblock_other = sblocks_for_recheck + 1;
-			sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
+			sblock_other = sblocks_for_recheck[1];
+			sblock_other->mirror_num = 1 + mirror_index;
 		}
 
 		/* build and submit the bios, check checksums */
@@ -1097,12 +1293,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 			/* Try to find no-io-error sector in mirrors */
 			for (mirror_index = 0;
 			     mirror_index < BTRFS_MAX_MIRRORS &&
-			     sblocks_for_recheck[mirror_index].sector_count > 0;
+			     sblocks_for_recheck[mirror_index]->sector_count > 0;
 			     mirror_index++) {
-				if (!sblocks_for_recheck[mirror_index].
+				if (!sblocks_for_recheck[mirror_index]->
 				    sectors[sector_num]->io_error) {
-					sblock_other = sblocks_for_recheck +
-						       mirror_index;
+					sblock_other = sblocks_for_recheck[mirror_index];
 					break;
 				}
 			}
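The loop above scans the mirrors and picks the first one whose copy of the sector read without I/O error. A toy model of that selection, with a hypothetical sector_copy type:

```c
#include <stdio.h>

#define MAX_MIRRORS 3

struct sector_copy { int io_error; };

int main(void)
{
	/* mirror 0 failed to read, mirrors 1 and 2 are fine */
	struct sector_copy mirrors[MAX_MIRRORS] = { {1}, {0}, {0} };
	int good = -1;

	for (int m = 0; m < MAX_MIRRORS; m++) {
		if (!mirrors[m].io_error) {
			good = m;
			break;
		}
	}
	if (good >= 0)
		printf("repair from mirror %d\n", good);
	else
		printf("uncorrectable\n");
	return 0;
}
```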
@@ -1176,25 +1371,28 @@ did_not_correct_error:
 	}
 
 out:
-	if (sblocks_for_recheck) {
-		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
-		     mirror_index++) {
-			struct scrub_block *sblock = sblocks_for_recheck +
-						     mirror_index;
-			struct scrub_recover *recover;
-			int i;
-
-			for (i = 0; i < sblock->sector_count; i++) {
-				sblock->sectors[i]->sblock = NULL;
-				recover = sblock->sectors[i]->recover;
-				if (recover) {
-					scrub_put_recover(fs_info, recover);
-					sblock->sectors[i]->recover = NULL;
-				}
-				scrub_sector_put(sblock->sectors[i]);
+	for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+		struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
+		struct scrub_recover *recover;
+		int sector_index;
+
+		/* Not allocated, continue checking the next mirror */
+		if (!sblock)
+			continue;
+
+		for (sector_index = 0; sector_index < sblock->sector_count;
+		     sector_index++) {
+			/*
+			 * Here we just cleanup the recover, each sector will be
+			 * properly cleaned up by later scrub_block_put()
+			 */
+			recover = sblock->sectors[sector_index]->recover;
+			if (recover) {
+				scrub_put_recover(fs_info, recover);
+				sblock->sectors[sector_index]->recover = NULL;
 			}
 		}
-		kfree(sblocks_for_recheck);
+		scrub_block_put(sblock);
 	}
 
 	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
@@ -1244,12 +1442,12 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
 }
 
 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
-				     struct scrub_block *sblocks_for_recheck)
+				     struct scrub_block *sblocks_for_recheck[])
 {
 	struct scrub_ctx *sctx = original_sblock->sctx;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
+	u64 logical = original_sblock->logical;
 	u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
-	u64 logical = original_sblock->sectors[0]->logical;
 	u64 generation = original_sblock->sectors[0]->generation;
 	u64 flags = original_sblock->sectors[0]->flags;
 	u64 have_csum = original_sblock->sectors[0]->have_csum;
@@ -1264,11 +1462,6 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 	int nmirrors;
 	int ret;
 
-	/*
-	 * Note: the two members refs and outstanding_sectors are not used (and
-	 * not set) in the blocks that are used for the recheck procedure.
-	 */
-
 	while (length > 0) {
 		sublen = min_t(u64, length, fs_info->sectorsize);
 		mapped_length = sublen;
@@ -1307,24 +1500,19 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 			struct scrub_block *sblock;
 			struct scrub_sector *sector;
 
-			sblock = sblocks_for_recheck + mirror_index;
+			sblock = sblocks_for_recheck[mirror_index];
 			sblock->sctx = sctx;
 
-			sector = kzalloc(sizeof(*sector), GFP_NOFS);
+			sector = alloc_scrub_sector(sblock, logical, GFP_NOFS);
 			if (!sector) {
-leave_nomem:
 				spin_lock(&sctx->stat_lock);
 				sctx->stat.malloc_errors++;
 				spin_unlock(&sctx->stat_lock);
 				scrub_put_recover(fs_info, recover);
 				return -ENOMEM;
 			}
-			scrub_sector_get(sector);
-			sblock->sectors[sector_index] = sector;
-			sector->sblock = sblock;
 			sector->flags = flags;
 			sector->generation = generation;
-			sector->logical = logical;
 			sector->have_csum = have_csum;
 			if (have_csum)
 				memcpy(sector->csum,
@@ -1339,21 +1527,20 @@ leave_nomem:
 						      mirror_index,
 						      &stripe_index,
 						      &stripe_offset);
-			sector->physical = bioc->stripes[stripe_index].physical +
-					 stripe_offset;
-			sector->dev = bioc->stripes[stripe_index].dev;
+			/*
+			 * We're at the first sector, also populate @sblock
+			 * physical and dev.
+			 */
+			if (sector_index == 0) {
+				sblock->physical =
+					bioc->stripes[stripe_index].physical +
+					stripe_offset;
+				sblock->dev = bioc->stripes[stripe_index].dev;
+				sblock->physical_for_dev_replace =
+					original_sblock->physical_for_dev_replace;
+			}
 
 			BUG_ON(sector_index >= original_sblock->sector_count);
-			sector->physical_for_dev_replace =
-				original_sblock->sectors[sector_index]->
-				physical_for_dev_replace;
-			/* For missing devices, dev->bdev is NULL */
-			sector->mirror_num = mirror_index + 1;
-			sblock->sector_count++;
-			sector->page = alloc_page(GFP_NOFS);
-			if (!sector->page)
-				goto leave_nomem;
-
 			scrub_get_recover(recover);
 			sector->recover = recover;
 		}
@@ -1377,11 +1564,11 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 
-	bio->bi_iter.bi_sector = sector->logical >> 9;
+	bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
+				 SECTOR_SHIFT;
 	bio->bi_private = &done;
 	bio->bi_end_io = scrub_bio_wait_endio;
-	raid56_parity_recover(bio, sector->recover->bioc,
-			      sector->sblock->sectors[0]->mirror_num, false);
+	raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
 
 	wait_for_completion_io(&done);
 	return blk_status_to_errno(bio->bi_status);
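bi_iter.bi_sector is in 512-byte units, so the byte address (block logical plus sector offset) is shifted right by SECTOR_SHIFT (9) before being handed to the bio layer. A minimal check of that arithmetic with made-up numbers:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	uint64_t sblock_logical = 269484032;	/* arbitrary, 512-aligned */
	uint32_t sector_offset = 8192;		/* third 4K sector in the block */
	uint64_t bi_sector = (sblock_logical + sector_offset) >> SECTOR_SHIFT;

	/* same as dividing the byte address by 512 */
	assert(bi_sector == 526352ULL);
	printf("bi_sector = %llu\n", (unsigned long long)bi_sector);
	return 0;
}
```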
@@ -1395,17 +1582,16 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
 	int i;
 
 	/* All sectors in sblock belong to the same stripe on the same device. */
-	ASSERT(first_sector->dev);
-	if (!first_sector->dev->bdev)
+	ASSERT(sblock->dev);
+	if (!sblock->dev->bdev)
 		goto out;
 
-	bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
+	bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
 
 	for (i = 0; i < sblock->sector_count; i++) {
 		struct scrub_sector *sector = sblock->sectors[i];
 
-		WARN_ON(!sector->page);
-		bio_add_page(bio, sector->page, PAGE_SIZE, 0);
+		bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
 	}
 
 	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
@@ -1449,16 +1635,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 		struct bio bio;
 		struct bio_vec bvec;
 
-		if (sector->dev->bdev == NULL) {
+		if (sblock->dev->bdev == NULL) {
 			sector->io_error = 1;
 			sblock->no_io_error_seen = 0;
 			continue;
 		}
 
-		WARN_ON(!sector->page);
-		bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
-		bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
-		bio.bi_iter.bi_sector = sector->physical >> 9;
+		bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
+		bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
+		bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
+					SECTOR_SHIFT;
 
 		btrfsic_check_bio(&bio);
 		if (submit_bio_wait(&bio)) {
@@ -1475,7 +1661,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
 {
-	struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
+	struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
 	int ret;
 
 	ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -1521,30 +1707,29 @@ static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
 	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
 	const u32 sectorsize = fs_info->sectorsize;
 
-	BUG_ON(sector_bad->page == NULL);
-	BUG_ON(sector_good->page == NULL);
 	if (force_write || sblock_bad->header_error ||
 	    sblock_bad->checksum_error || sector_bad->io_error) {
 		struct bio bio;
 		struct bio_vec bvec;
 		int ret;
 
-		if (!sector_bad->dev->bdev) {
+		if (!sblock_bad->dev->bdev) {
 			btrfs_warn_rl(fs_info,
 				"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
 			return -EIO;
 		}
 
-		bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
-		bio.bi_iter.bi_sector = sector_bad->physical >> 9;
-		__bio_add_page(&bio, sector_good->page, sectorsize, 0);
+		bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
+		bio.bi_iter.bi_sector = (sblock_bad->physical +
+					 sector_bad->offset) >> SECTOR_SHIFT;
+		ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
 
 		btrfsic_check_bio(&bio);
 		ret = submit_bio_wait(&bio);
 		bio_uninit(&bio);
 
 		if (ret) {
-			btrfs_dev_stat_inc_and_print(sector_bad->dev,
+			btrfs_dev_stat_inc_and_print(sblock_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
 			atomic64_inc(&fs_info->dev_replace.num_write_errors);
 			return -EIO;
@@ -1577,11 +1762,11 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
 
 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
 {
+	const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
 	struct scrub_sector *sector = sblock->sectors[sector_num];
 
-	BUG_ON(sector->page == NULL);
 	if (sector->io_error)
-		clear_page(page_address(sector->page));
+		memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
 
 	return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
 }
@@ -1608,9 +1793,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
 	return ret;
 }
 
+static void scrub_block_get(struct scrub_block *sblock)
+{
+	refcount_inc(&sblock->refs);
+}
+
 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
 				      struct scrub_sector *sector)
 {
+	struct scrub_block *sblock = sector->sblock;
 	struct scrub_bio *sbio;
 	int ret;
 	const u32 sectorsize = sctx->fs_info->sectorsize;
@@ -1629,14 +1820,15 @@ again:
 	}
 	sbio = sctx->wr_curr_bio;
 	if (sbio->sector_count == 0) {
-		ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
+		ret = fill_writer_pointer_gap(sctx, sector->offset +
+					      sblock->physical_for_dev_replace);
 		if (ret) {
 			mutex_unlock(&sctx->wr_lock);
 			return ret;
 		}
 
-		sbio->physical = sector->physical_for_dev_replace;
-		sbio->logical = sector->logical;
+		sbio->physical = sblock->physical_for_dev_replace + sector->offset;
+		sbio->logical = sblock->logical + sector->offset;
 		sbio->dev = sctx->wr_tgtdev;
 		if (!sbio->bio) {
 			sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
@@ -1647,14 +1839,14 @@ again:
 		sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
 		sbio->status = 0;
 	} else if (sbio->physical + sbio->sector_count * sectorsize !=
-		   sector->physical_for_dev_replace ||
+		   sblock->physical_for_dev_replace + sector->offset ||
 		   sbio->logical + sbio->sector_count * sectorsize !=
-		   sector->logical) {
+		   sblock->logical + sector->offset) {
 		scrub_wr_submit(sctx);
 		goto again;
 	}
 
-	ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
+	ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
 	if (ret != sectorsize) {
 		if (sbio->sector_count < 1) {
 			bio_put(sbio->bio);
@@ -1668,6 +1860,13 @@ again:
 	sbio->sectors[sbio->sector_count] = sector;
 	scrub_sector_get(sector);
+	/*
+	 * Since ssector no longer holds a page, but uses sblock::pages, we
+	 * have to ensure the sblock had not been freed before our write bio
+	 * finished.
+	 */
+	scrub_block_get(sector->sblock);
+
 	sbio->sector_count++;
 	if (sbio->sector_count == sctx->sectors_per_bio)
 		scrub_wr_submit(sctx);
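The "else if ... scrub_wr_submit(); goto again;" path above implements batching: a sector joins the current write bio only if it is both physically and logically contiguous with what is already queued, otherwise the bio is flushed first. A userspace model of that decision, with a hypothetical batch struct:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE 4096u

struct batch {
	uint64_t physical;	/* start of the queued run */
	uint64_t logical;
	unsigned int count;	/* sectors queued so far */
};

static bool batch_accepts(const struct batch *b, uint64_t physical, uint64_t logical)
{
	if (b->count == 0)
		return true;
	return b->physical + (uint64_t)b->count * SECTORSIZE == physical &&
	       b->logical + (uint64_t)b->count * SECTORSIZE == logical;
}

int main(void)
{
	struct batch b = { 0 };
	uint64_t phys = 1 << 20, log = 16 << 20;

	for (int i = 0; i < 3; i++) {
		if (!batch_accepts(&b, phys, log)) {
			printf("flush at %u sectors\n", b.count);
			b.count = 0;	/* models scrub_wr_submit() */
		}
		if (b.count == 0) {
			b.physical = phys;
			b.logical = log;
		}
		b.count++;
		phys += SECTORSIZE;	/* next sector is contiguous */
		log += SECTORSIZE;
	}
	printf("queued %u contiguous sectors\n", b.count);
	return 0;
}
```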
@@ -1729,8 +1928,14 @@ static void scrub_wr_bio_end_io_worker(struct work_struct *work)
 		}
 	}
 
-	for (i = 0; i < sbio->sector_count; i++)
+	/*
+	 * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
+	 * endio we should put the sblock.
+	 */
+	for (i = 0; i < sbio->sector_count; i++) {
+		scrub_block_put(sbio->sectors[i]->sblock);
 		scrub_sector_put(sbio->sectors[i]);
+	}
 
 	bio_put(sbio->bio);
 	kfree(sbio);
@@ -1762,7 +1967,7 @@ static int scrub_checksum(struct scrub_block *sblock)
 	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		ret = scrub_checksum_tree_block(sblock);
 	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
-		(void)scrub_checksum_super(sblock);
+		ret = scrub_checksum_super(sblock);
 	else
 		WARN_ON(1);
 	if (ret)
@@ -1785,15 +1990,11 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 	if (!sector->have_csum)
 		return 0;
 
-	kaddr = page_address(sector->page);
+	kaddr = scrub_sector_get_kaddr(sector);
 
 	shash->tfm = fs_info->csum_shash;
 	crypto_shash_init(shash);
 
-	/*
-	 * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector
-	 * only contains one sector of data.
-	 */
 	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
 
 	if (memcmp(csum, sector->csum, fs_info->csum_size))
@@ -1826,7 +2027,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	ASSERT(sblock->sector_count == num_sectors);
 
 	sector = sblock->sectors[0];
-	kaddr = page_address(sector->page);
+	kaddr = scrub_sector_get_kaddr(sector);
 	h = (struct btrfs_header *)kaddr;
 	memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
 
@@ -1835,7 +2036,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	 * a) don't have an extent buffer and
 	 * b) the page is already kmapped
 	 */
-	if (sector->logical != btrfs_stack_header_bytenr(h))
+	if (sblock->logical != btrfs_stack_header_bytenr(h))
 		sblock->header_error = 1;
 
 	if (sector->generation != btrfs_stack_header_generation(h)) {
@@ -1856,7 +2057,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 			    sectorsize - BTRFS_CSUM_SIZE);
 
 	for (i = 1; i < num_sectors; i++) {
-		kaddr = page_address(sblock->sectors[i]->page);
+		kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
 		crypto_shash_update(shash, kaddr, sectorsize);
 	}
 
@@ -1881,10 +2082,10 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	BUG_ON(sblock->sector_count < 1);
 	sector = sblock->sectors[0];
-	kaddr = page_address(sector->page);
+	kaddr = scrub_sector_get_kaddr(sector);
 	s = (struct btrfs_super_block *)kaddr;
 
-	if (sector->logical != btrfs_super_bytenr(s))
+	if (sblock->logical != btrfs_super_bytenr(s))
 		++fail_cor;
 
 	if (sector->generation != btrfs_super_generation(s))
@@ -1901,31 +2102,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
 		++fail_cor;
 
-	if (fail_cor + fail_gen) {
-		/*
-		 * if we find an error in a super block, we just report it.
-		 * They will get written with the next transaction commit
-		 * anyway
-		 */
-		spin_lock(&sctx->stat_lock);
-		++sctx->stat.super_errors;
-		spin_unlock(&sctx->stat_lock);
-		if (fail_cor)
-			btrfs_dev_stat_inc_and_print(sector->dev,
-				BTRFS_DEV_STAT_CORRUPTION_ERRS);
-		else
-			btrfs_dev_stat_inc_and_print(sector->dev,
-				BTRFS_DEV_STAT_GENERATION_ERRS);
-	}
-
 	return fail_cor + fail_gen;
 }
 
-static void scrub_block_get(struct scrub_block *sblock)
-{
-	refcount_inc(&sblock->refs);
-}
-
 static void scrub_block_put(struct scrub_block *sblock)
 {
 	if (refcount_dec_and_test(&sblock->refs)) {
@@ -1936,6 +2115,12 @@ static void scrub_block_put(struct scrub_block *sblock)
 		for (i = 0; i < sblock->sector_count; i++)
 			scrub_sector_put(sblock->sectors[i]);
 
+		for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
+			if (sblock->pages[i]) {
+				detach_scrub_page_private(sblock->pages[i]);
+				__free_page(sblock->pages[i]);
+			}
+		}
 		kfree(sblock);
 	}
 }
@@ -1947,11 +2132,8 @@ static void scrub_sector_get(struct scrub_sector *sector)
 
 static void scrub_sector_put(struct scrub_sector *sector)
 {
-	if (atomic_dec_and_test(&sector->refs)) {
-		if (sector->page)
-			__free_page(sector->page);
+	if (atomic_dec_and_test(&sector->refs))
 		kfree(sector);
-	}
 }
 
 /*
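Page ownership moves from the sectors to the block, so scrub_block_put() now frees DIV_ROUND_UP(len, PAGE_SIZE) page slots, skipping holes. A standalone model of that cleanup loop, with malloc/free standing in for the page allocator:

```c
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096u
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define MAX_PAGES 16

int main(void)
{
	void *pages[MAX_PAGES] = { 0 };
	unsigned int len = 16384;	/* e.g. one 16K metadata block */

	pages[0] = malloc(PAGE_SIZE);	/* pretend only slot 0 was populated */

	/* only walk the slots the block can actually span */
	for (unsigned int i = 0; i < DIV_ROUND_UP(len, PAGE_SIZE); i++) {
		if (pages[i]) {
			free(pages[i]);
			pages[i] = NULL;
		}
	}
	printf("walked %u page slots\n", DIV_ROUND_UP(len, PAGE_SIZE));
	return 0;
}
```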
@@ -2056,9 +2238,9 @@ again:
 	}
 	sbio = sctx->bios[sctx->curr];
 	if (sbio->sector_count == 0) {
-		sbio->physical = sector->physical;
-		sbio->logical = sector->logical;
-		sbio->dev = sector->dev;
+		sbio->physical = sblock->physical + sector->offset;
+		sbio->logical = sblock->logical + sector->offset;
+		sbio->dev = sblock->dev;
 		if (!sbio->bio) {
 			sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
 					      REQ_OP_READ, GFP_NOFS);
@@ -2068,16 +2250,16 @@ again:
 		sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
 		sbio->status = 0;
 	} else if (sbio->physical + sbio->sector_count * sectorsize !=
-		   sector->physical ||
+		   sblock->physical + sector->offset ||
 		   sbio->logical + sbio->sector_count * sectorsize !=
-		   sector->logical ||
-		   sbio->dev != sector->dev) {
+		   sblock->logical + sector->offset ||
+		   sbio->dev != sblock->dev) {
 		scrub_submit(sctx);
 		goto again;
 	}
 
 	sbio->sectors[sbio->sector_count] = sector;
-	ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
+	ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
 	if (ret != sectorsize) {
 		if (sbio->sector_count < 1) {
 			bio_put(sbio->bio);
@@ -2102,6 +2284,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
 	struct scrub_block *sblock = bio->bi_private;
 	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
 
+	btrfs_bio_counter_dec(fs_info);
 	if (bio->bi_status)
 		sblock->no_io_error_seen = 0;
 
@@ -2118,8 +2301,8 @@ static void scrub_missing_raid56_worker(struct work_struct *work)
 	u64 logical;
 	struct btrfs_device *dev;
 
-	logical = sblock->sectors[0]->logical;
-	dev = sblock->sectors[0]->dev;
+	logical = sblock->logical;
+	dev = sblock->dev;
 
 	if (sblock->no_io_error_seen)
 		scrub_recheck_block_checksum(sblock);
@@ -2157,7 +2340,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
 	struct scrub_ctx *sctx = sblock->sctx;
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	u64 length = sblock->sector_count << fs_info->sectorsize_bits;
-	u64 logical = sblock->sectors[0]->logical;
+	u64 logical = sblock->logical;
 	struct btrfs_io_context *bioc = NULL;
 	struct bio *bio;
 	struct btrfs_raid_bio *rbio;
@@ -2193,17 +2376,16 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
 	for (i = 0; i < sblock->sector_count; i++) {
 		struct scrub_sector *sector = sblock->sectors[i];
 
-		/*
-		 * For now, our scrub is still one page per sector, so pgoff
-		 * is always 0.
-		 */
-		raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
+		raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
+				       scrub_sector_get_page_offset(sector),
+				       sector->offset + sector->sblock->logical);
 	}
 
 	INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
 	scrub_block_get(sblock);
 	scrub_pending_bio_inc(sctx);
 	raid56_submit_missing_rbio(rbio);
+	btrfs_put_bioc(bioc);
 	return;
 
 rbio_out:
@@ -2225,7 +2407,8 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
 	const u32 sectorsize = sctx->fs_info->sectorsize;
 	int index;
 
-	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+	sblock = alloc_scrub_block(sctx, dev, logical, physical,
+				   physical_for_dev_replace, mirror_num);
 	if (!sblock) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
@@ -2233,12 +2416,6 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
 		return -ENOMEM;
 	}
 
-	/* one ref inside this function, plus one for each page added to
-	 * a bio later on */
-	refcount_set(&sblock->refs, 1);
-	sblock->sctx = sctx;
-	sblock->no_io_error_seen = 1;
-
 	for (index = 0; len > 0; index++) {
 		struct scrub_sector *sector;
 		/*
@@ -2248,36 +2425,22 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
 		 */
 		u32 l = min(sectorsize, len);
 
-		sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+		sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
 		if (!sector) {
-leave_nomem:
 			spin_lock(&sctx->stat_lock);
 			sctx->stat.malloc_errors++;
 			spin_unlock(&sctx->stat_lock);
 			scrub_block_put(sblock);
 			return -ENOMEM;
 		}
-		ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
-		scrub_sector_get(sector);
-		sblock->sectors[index] = sector;
-		sector->sblock = sblock;
-		sector->dev = dev;
 		sector->flags = flags;
 		sector->generation = gen;
-		sector->logical = logical;
-		sector->physical = physical;
-		sector->physical_for_dev_replace = physical_for_dev_replace;
-		sector->mirror_num = mirror_num;
 		if (csum) {
 			sector->have_csum = 1;
 			memcpy(sector->csum, csum, sctx->fs_info->csum_size);
 		} else {
 			sector->have_csum = 0;
 		}
-		sblock->sector_count++;
-		sector->page = alloc_page(GFP_KERNEL);
-		if (!sector->page)
-			goto leave_nomem;
 		len -= l;
 		logical += l;
 		physical += l;
@@ -2423,8 +2586,9 @@ static void scrub_block_complete(struct scrub_block *sblock)
 	}
 
 	if (sblock->sparity && corrupted && !sblock->data_corrected) {
-		u64 start = sblock->sectors[0]->logical;
-		u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
+		u64 start = sblock->logical;
+		u64 end = sblock->logical +
+			  sblock->sectors[sblock->sector_count - 1]->offset +
 			  sblock->sctx->fs_info->sectorsize;
 
 		ASSERT(end - start <= U32_MAX);
@@ -2508,11 +2672,17 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
 	u8 csum[BTRFS_CSUM_SIZE];
 	u32 blocksize;
 
+	/*
+	 * Block size determines how many scrub_block will be allocated.  Here
+	 * we use BTRFS_STRIPE_LEN (64KiB) as default limit, so we won't
+	 * allocate too many scrub_block, while still won't cause too large
+	 * bios for large extents.
+	 */
 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
 			blocksize = map->stripe_len;
 		else
-			blocksize = sctx->fs_info->sectorsize;
+			blocksize = BTRFS_STRIPE_LEN;
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.data_extents_scrubbed++;
 		sctx->stat.data_bytes_scrubbed += len;
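The effect of raising the default data blocksize from sectorsize to BTRFS_STRIPE_LEN is simply far fewer scrub_block allocations per extent. A quick illustration with the common 4K-sectorsize case (numbers are examples, not measurements):

```c
#include <stdio.h>

int main(void)
{
	unsigned int extent_len = 128u << 20;	/* a 128MiB data extent */
	unsigned int sectorsize = 4096;
	unsigned int stripe_len = 64 * 1024;	/* BTRFS_STRIPE_LEN */

	printf("blocks at sectorsize:       %u\n", extent_len / sectorsize);
	printf("blocks at BTRFS_STRIPE_LEN: %u\n", extent_len / stripe_len);
	return 0;
}
```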
@@ -2578,7 +2748,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity,
 
 	ASSERT(IS_ALIGNED(len, sectorsize));
 
-	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+	sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
 	if (!sblock) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
@@ -2586,51 +2756,32 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity,
 		return -ENOMEM;
 	}
 
-	/* one ref inside this function, plus one for each page added to
-	 * a bio later on */
-	refcount_set(&sblock->refs, 1);
-	sblock->sctx = sctx;
-	sblock->no_io_error_seen = 1;
 	sblock->sparity = sparity;
 	scrub_parity_get(sparity);
 
 	for (index = 0; len > 0; index++) {
 		struct scrub_sector *sector;
 
-		sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+		sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
 		if (!sector) {
-leave_nomem:
 			spin_lock(&sctx->stat_lock);
 			sctx->stat.malloc_errors++;
 			spin_unlock(&sctx->stat_lock);
 			scrub_block_put(sblock);
 			return -ENOMEM;
 		}
-		ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
-		/* For scrub block */
-		scrub_sector_get(sector);
 		sblock->sectors[index] = sector;
 		/* For scrub parity */
 		scrub_sector_get(sector);
 		list_add_tail(&sector->list, &sparity->sectors_list);
-		sector->sblock = sblock;
-		sector->dev = dev;
 		sector->flags = flags;
 		sector->generation = gen;
-		sector->logical = logical;
-		sector->physical = physical;
-		sector->mirror_num = mirror_num;
 		if (csum) {
 			sector->have_csum = 1;
 			memcpy(sector->csum, csum, sctx->fs_info->csum_size);
 		} else {
 			sector->have_csum = 0;
 		}
-		sblock->sector_count++;
-		sector->page = alloc_page(GFP_KERNEL);
-		if (!sector->page)
-			goto leave_nomem;
-
 
 		/* Iterate over the stripe range in sectorsize steps */
 		len -= sectorsize;
@@ -2774,6 +2925,7 @@ static void scrub_parity_bio_endio_worker(struct work_struct *work)
 						    work);
 	struct scrub_ctx *sctx = sparity->sctx;
 
+	btrfs_bio_counter_dec(sctx->fs_info);
 	scrub_free_parity(sparity);
 	scrub_pending_bio_dec(sctx);
 }
@@ -2824,6 +2976,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 					      sparity->scrub_dev,
 					      &sparity->dbitmap,
 					      sparity->nsectors);
+	btrfs_put_bioc(bioc);
 	if (!rbio)
 		goto rbio_out;
 
@@ -2835,7 +2988,6 @@ rbio_out:
 	bio_put(bio);
 bioc_out:
 	btrfs_bio_counter_dec(fs_info);
-	btrfs_put_bioc(bioc);
 	bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
 		  sparity->nsectors);
 	spin_lock(&sctx->stat_lock);
@@ -3077,7 +3229,7 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
 
 		ret = btrfs_lookup_csums_range(csum_root, extent_start,
 					       extent_start + extent_size - 1,
-					       &sctx->csum_list, 1, false);
+					       &sctx->csum_list, 1, false);
 		if (ret) {
 			scrub_parity_mark_sectors_error(sparity, extent_start,
 							extent_size);
@@ -3266,7 +3418,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
 		}
 
 		/* Block group removed? */
 		spin_lock(&bg->lock);
-		if (bg->removed) {
+		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
 			spin_unlock(&bg->lock);
 			ret = 0;
 			break;
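The bg->removed / cache->to_copy booleans become numbered bits in a runtime_flags word, queried with test_bit(). A minimal non-atomic model of the same pattern (the kernel's test_bit()/set_bit() are additionally atomic):

```c
#include <stdbool.h>
#include <stdio.h>

enum {
	BLOCK_GROUP_FLAG_REMOVED,	/* bit numbers, as in the kernel enum */
	BLOCK_GROUP_FLAG_TO_COPY,
};

static bool test_flag(unsigned long flags, int bit)
{
	return flags & (1UL << bit);
}

int main(void)
{
	unsigned long runtime_flags = 0;

	runtime_flags |= 1UL << BLOCK_GROUP_FLAG_TO_COPY;	/* models set_bit() */

	printf("removed: %d, to_copy: %d\n",
	       test_flag(runtime_flags, BLOCK_GROUP_FLAG_REMOVED),
	       test_flag(runtime_flags, BLOCK_GROUP_FLAG_TO_COPY));
	return 0;
}
```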
@@ -3303,7 +3455,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
 		if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
 			ret = btrfs_lookup_csums_range(csum_root, cur_logical,
 					cur_logical + scrub_len - 1,
-					&sctx->csum_list, 1);
+					&sctx->csum_list, 1, false);
 			if (ret)
 				break;
 		}
@@ -3606,7 +3758,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 		 * kthread or relocation.
 		 */
 		spin_lock(&bg->lock);
-		if (!bg->removed)
+		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
 			ret = -EINVAL;
 		spin_unlock(&bg->lock);
 
@@ -3764,13 +3916,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		}
 
 		if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
-			spin_lock(&cache->lock);
-			if (!cache->to_copy) {
+			if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
 				spin_unlock(&cache->lock);
 				btrfs_put_block_group(cache);
 				goto skip;
 			}
-			spin_unlock(&cache->lock);
 		}
 
 		/*
@@ -3782,7 +3932,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		 * repair extents.
 		 */
 		spin_lock(&cache->lock);
-		if (cache->removed) {
+		if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
 			spin_unlock(&cache->lock);
 			btrfs_put_block_group(cache);
 			goto skip;
@@ -3942,8 +4092,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		 * balance is triggered or it becomes used and unused again.
 		 */
 		spin_lock(&cache->lock);
-		if (!cache->removed && !cache->ro && cache->reserved == 0 &&
-		    cache->used == 0) {
+		if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
+		    !cache->ro && cache->reserved == 0 && cache->used == 0) {
 			spin_unlock(&cache->lock);
 			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
 				btrfs_discard_queue_work(&fs_info->discard_ctl,
@@ -4102,36 +4252,21 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	int ret;
 	struct btrfs_device *dev;
 	unsigned int nofs_flag;
+	bool need_commit = false;
 
 	if (btrfs_fs_closing(fs_info))
 		return -EAGAIN;
 
-	if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
-		/*
-		 * in this case scrub is unable to calculate the checksum
-		 * the way scrub is implemented. Do not handle this
-		 * situation at all because it won't ever happen.
-		 */
-		btrfs_err(fs_info,
-			   "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
-		       fs_info->nodesize,
-		       BTRFS_STRIPE_LEN);
-		return -EINVAL;
-	}
+	/* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
+	ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
 
-	if (fs_info->nodesize >
-	    SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
-	    fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
-		/*
-		 * Would exhaust the array bounds of sectorv member in
-		 * struct scrub_block
-		 */
-		btrfs_err(fs_info,
-"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
-		       fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
-		       fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
-		return -EINVAL;
-	}
+	/*
+	 * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
+	 * value (max nodesize / min sectorsize), thus nodesize should always
+	 * be fine.
+	 */
+	ASSERT(fs_info->nodesize <=
+	       SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
 
 	/* Allocate outside of device_list_mutex */
 	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
@@ -4205,6 +4340,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	 */
 	nofs_flag = memalloc_nofs_save();
 	if (!is_dev_replace) {
+		u64 old_super_errors;
+
+		spin_lock(&sctx->stat_lock);
+		old_super_errors = sctx->stat.super_errors;
+		spin_unlock(&sctx->stat_lock);
+
 		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
 		/*
 		 * by holding device list mutex, we can
@@ -4213,6 +4354,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		mutex_lock(&fs_info->fs_devices->device_list_mutex);
 		ret = scrub_supers(sctx, dev);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+		spin_lock(&sctx->stat_lock);
+		/*
+		 * Super block errors found, but we can not commit transaction
+		 * at current context, since btrfs_commit_transaction() needs
+		 * to pause the current running scrub (held by ourselves).
+		 */
+		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+			need_commit = true;
+		spin_unlock(&sctx->stat_lock);
 	}
 
 	if (!ret)
@@ -4239,6 +4390,25 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	scrub_workers_put(fs_info);
 	scrub_put_ctx(sctx);
 
+	/*
+	 * We found some super block errors before, now try to force a
+	 * transaction commit, as scrub has finished.
+	 */
+	if (need_commit) {
+		struct btrfs_trans_handle *trans;
+
+		trans = btrfs_start_transaction(fs_info->tree_root, 0);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			btrfs_err(fs_info,
+	"scrub: failed to start transaction to fix super block errors: %d", ret);
+			return ret;
+		}
+		ret = btrfs_commit_transaction(trans);
+		if (ret < 0)
+			btrfs_err(fs_info,
+	"scrub: failed to commit transaction to fix super block errors: %d", ret);
+	}
 	return ret;
 out:
 	scrub_workers_put(fs_info);
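The need_commit logic above snapshots the super_errors counter before scrub_supers(), compares afterwards, and defers the expensive fix-up (a transaction commit) until scrub has finished and its locks are dropped. A hypothetical userspace model of that snapshot-and-defer flow:

```c
#include <stdbool.h>
#include <stdio.h>

struct scrub_stats { unsigned long super_errors; };

/* stand-in for scrub_supers(): pretend one bad super block copy was found */
static void scrub_supers_mock(struct scrub_stats *stat)
{
	stat->super_errors++;
}

int main(void)
{
	struct scrub_stats stat = { 0 };
	bool readonly = false;
	bool need_commit = false;

	unsigned long old_super_errors = stat.super_errors;

	scrub_supers_mock(&stat);
	/* new errors appeared and we are allowed to write: remember for later */
	if (stat.super_errors > old_super_errors && !readonly)
		need_commit = true;

	/* ... the rest of the scrub runs here ... */

	if (need_commit)
		printf("would start and commit a transaction to rewrite supers\n");
	return 0;
}
```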