Diffstat (limited to 'fs/btrfs/compression.c')
-rw-r--r--	fs/btrfs/compression.c	685
1 file changed, 407 insertions(+), 278 deletions(-)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7869ad12bc6e..32da97c3c19d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -9,6 +9,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/kthread.h>
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/string.h>
@@ -28,6 +29,7 @@
 #include "compression.h"
 #include "extent_io.h"
 #include "extent_map.h"
+#include "subpage.h"
 #include "zoned.h"
 
 static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -172,16 +174,17 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
 		/* Hash through the page sector by sector */
 		for (pg_offset = 0; pg_offset < bytes_left;
 		     pg_offset += sectorsize) {
-			kaddr = page_address(page);
+			kaddr = kmap_atomic(page);
 			crypto_shash_digest(shash, kaddr + pg_offset,
 					    sectorsize, csum);
+			kunmap_atomic(kaddr);
 
 			if (memcmp(&csum, cb_sum, csum_size) != 0) {
 				btrfs_print_data_csum_error(inode, disk_start,
 						csum, cb_sum, cb->mirror_num);
-				if (btrfs_io_bio(bio)->device)
+				if (btrfs_bio(bio)->device)
 					btrfs_dev_stat_inc_and_print(
-						btrfs_io_bio(bio)->device,
+						btrfs_bio(bio)->device,
 						BTRFS_DEV_STAT_CORRUPTION_ERRS);
 				return -EIO;
 			}
@@ -192,6 +195,87 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
 	return 0;
 }
 
+/*
+ * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
+ *
+ * Return true if there is no pending bio nor io.
+ * Return false otherwise.
+ */
+static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+	unsigned int bi_size = 0;
+	bool last_io = false;
+	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
+
+	/*
+	 * At endio time, bi_iter.bi_size doesn't represent the real bio size.
+	 * Thus here we have to iterate through all segments to grab correct
+	 * bio size.
+	 */
+	bio_for_each_segment_all(bvec, bio, iter_all)
+		bi_size += bvec->bv_len;
+
+	if (bio->bi_status)
+		cb->errors = 1;
+
+	ASSERT(bi_size && bi_size <= cb->compressed_len);
+	last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
+					&cb->pending_sectors);
+	/*
+	 * Here we must wake up the possible error handler after all other
+	 * operations on @cb finished, or we can race with
+	 * finish_compressed_bio_*() which may free @cb.
+	 */
+	wake_up_var(cb);
+
+	return last_io;
+}
+
+static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
+{
+	unsigned int index;
+	struct page *page;
+
+	/* Release the compressed pages */
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		put_page(page);
+	}
+
+	/* Do io completion on the original bio */
+	if (cb->errors) {
+		bio_io_error(cb->orig_bio);
+	} else {
+		struct bio_vec *bvec;
+		struct bvec_iter_all iter_all;
+
+		ASSERT(bio);
+		ASSERT(!bio->bi_status);
+		/*
+		 * We have verified the checksum already, set page checked so
+		 * the end_io handlers know about it
+		 */
+		ASSERT(!bio_flagged(bio, BIO_CLONED));
+		bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
+			u64 bvec_start = page_offset(bvec->bv_page) +
+					 bvec->bv_offset;
+
+			btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
+					bvec->bv_page, bvec_start,
+					bvec->bv_len);
+		}
+
+		bio_endio(cb->orig_bio);
+	}
+
+	/* Finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+}
+
 /* when we finish reading compressed pages from the disk, we
  * decompress them and then run the bio end_io routines on the
  * decompressed pages (in the inode address space).
@@ -206,25 +290,17 @@ static void end_compressed_bio_read(struct bio *bio)
 {
 	struct compressed_bio *cb = bio->bi_private;
 	struct inode *inode;
-	struct page *page;
-	unsigned int index;
-	unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
+	unsigned int mirror = btrfs_bio(bio)->mirror_num;
 	int ret = 0;
 
-	if (bio->bi_status)
-		cb->errors = 1;
-
-	/* if there are more bios still pending for this compressed
-	 * extent, just exit
-	 */
-	if (!refcount_dec_and_test(&cb->pending_bios))
+	if (!dec_and_test_compressed_bio(cb, bio))
 		goto out;
 
 	/*
 	 * Record the correct mirror_num in cb->orig_bio so that
 	 * read-repair can work properly.
	 */
-	btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
+	btrfs_bio(cb->orig_bio)->mirror_num = mirror;
 	cb->mirror_num = mirror;
 
 	/*
@@ -248,36 +324,7 @@ static void end_compressed_bio_read(struct bio *bio)
 csum_failed:
 	if (ret)
 		cb->errors = 1;
-
-	/* release the compressed pages */
-	index = 0;
-	for (index = 0; index < cb->nr_pages; index++) {
-		page = cb->compressed_pages[index];
-		page->mapping = NULL;
-		put_page(page);
-	}
-
-	/* do io completion on the original bio */
-	if (cb->errors) {
-		bio_io_error(cb->orig_bio);
-	} else {
-		struct bio_vec *bvec;
-		struct bvec_iter_all iter_all;
-
-		/*
-		 * we have verified the checksum already, set page
-		 * checked so the end_io handlers know about it
-		 */
-		ASSERT(!bio_flagged(bio, BIO_CLONED));
-		bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
-			SetPageChecked(bvec->bv_page);
-
-		bio_endio(cb->orig_bio);
-	}
-
-	/* finally free the cb struct */
-	kfree(cb->compressed_pages);
-	kfree(cb);
+	finish_compressed_bio_read(cb, bio);
 out:
 	bio_put(bio);
 }
@@ -289,6 +336,7 @@ out:
 static noinline void end_compressed_writeback(struct inode *inode,
 					      const struct compressed_bio *cb)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	unsigned long index = cb->start >> PAGE_SHIFT;
 	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
 	struct page *pages[16];
@@ -311,7 +359,8 @@ static noinline void end_compressed_writeback(struct inode *inode,
 		for (i = 0; i < ret; i++) {
 			if (cb->errors)
 				SetPageError(pages[i]);
-			end_page_writeback(pages[i]);
+			btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+							 cb->start, cb->len);
 			put_page(pages[i]);
 		}
 		nr_pages -= ret;
@@ -320,60 +369,127 @@ static noinline void end_compressed_writeback(struct inode *inode,
 	/* the inode may be gone now */
 }
 
-/*
- * do the cleanup once all the compressed pages hit the disk.
- * This will clear writeback on the file pages and free the compressed
- * pages.
- *
- * This also calls the writeback end hooks for the file pages so that
- * metadata and checksums can be updated in the file.
- */
-static void end_compressed_bio_write(struct bio *bio)
+static void finish_compressed_bio_write(struct compressed_bio *cb)
 {
-	struct compressed_bio *cb = bio->bi_private;
-	struct inode *inode;
-	struct page *page;
+	struct inode *inode = cb->inode;
 	unsigned int index;
 
-	if (bio->bi_status)
-		cb->errors = 1;
-
-	/* if there are more bios still pending for this compressed
-	 * extent, just exit
-	 */
-	if (!refcount_dec_and_test(&cb->pending_bios))
-		goto out;
-
-	/* ok, we're the last bio for this extent, step one is to
-	 * call back into the FS and do all the end_io operations
+	/*
+	 * Ok, we're the last bio for this extent, step one is to call back
+	 * into the FS and do all the end_io operations.
	 */
-	inode = cb->inode;
-	btrfs_record_physical_zoned(inode, cb->start, bio);
 	btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
 			cb->start, cb->start + cb->len - 1,
 			!cb->errors);
 
 	end_compressed_writeback(inode, cb);
-	/* note, our inode could be gone now */
+	/* Note, our inode could be gone now */
 
 	/*
-	 * release the compressed pages, these came from alloc_page and
+	 * Release the compressed pages, these came from alloc_page and
 	 * are not attached to the inode at all
 	 */
-	index = 0;
 	for (index = 0; index < cb->nr_pages; index++) {
-		page = cb->compressed_pages[index];
+		struct page *page = cb->compressed_pages[index];
+
 		page->mapping = NULL;
 		put_page(page);
 	}
 
-	/* finally free the cb struct */
+	/* Finally free the cb struct */
 	kfree(cb->compressed_pages);
 	kfree(cb);
+}
+
+/*
+ * Do the cleanup once all the compressed pages hit the disk.  This will clear
+ * writeback on the file pages and free the compressed pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that metadata
+ * and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio)
+{
+	struct compressed_bio *cb = bio->bi_private;
+
+	if (!dec_and_test_compressed_bio(cb, bio))
+		goto out;
+
+	btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+
+	finish_compressed_bio_write(cb);
 out:
 	bio_put(bio);
 }
 
+static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
+					  struct compressed_bio *cb,
+					  struct bio *bio, int mirror_num)
+{
+	blk_status_t ret;
+
+	ASSERT(bio->bi_iter.bi_size);
+	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+	if (ret)
+		return ret;
+	ret = btrfs_map_bio(fs_info, bio, mirror_num);
+	return ret;
+}
+
+/*
+ * Allocate a compressed_bio, which will be used to read/write on-disk
+ * (aka, compressed) data.
+ *
+ * @cb:                 The compressed_bio structure, which records all the needed
+ *                      information to bind the compressed data to the uncompressed
+ *                      page cache.
+ * @disk_bytenr:        The logical bytenr where the compressed data will be read
+ *                      from or written to.
+ * @endio_func:         The endio function to call after the IO for compressed data
+ *                      is finished.
+ * @next_stripe_start:  Return value of the logical bytenr where the next stripe
+ *                      starts.  Let the caller know to only fill the bio up to
+ *                      the stripe boundary.
+ */
+static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
+					unsigned int opf, bio_end_io_t endio_func,
+					u64 *next_stripe_start)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+	struct btrfs_io_geometry geom;
+	struct extent_map *em;
+	struct bio *bio;
+	int ret;
+
+	bio = btrfs_bio_alloc(BIO_MAX_VECS);
+
+	bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+	bio->bi_opf = opf;
+	bio->bi_private = cb;
+	bio->bi_end_io = endio_func;
+
+	em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
+	if (IS_ERR(em)) {
+		bio_put(bio);
+		return ERR_CAST(em);
+	}
+
+	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+		bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
+
+	ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
+	free_extent_map(em);
+	if (ret < 0) {
+		bio_put(bio);
+		return ERR_PTR(ret);
+	}
+	*next_stripe_start = disk_bytenr + geom.len;
+
+	return bio;
+}
+
 /*
  * worker function to build and submit bios for previously compressed pages.
  * The corresponding pages in the inode should be marked for writeback
@@ -394,20 +510,19 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct bio *bio = NULL;
 	struct compressed_bio *cb;
-	unsigned long bytes_left;
-	int pg_index = 0;
-	struct page *page;
-	u64 first_byte = disk_start;
+	u64 cur_disk_bytenr = disk_start;
+	u64 next_stripe_start;
 	blk_status_t ret;
 	int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
 	const bool use_append = btrfs_use_zone_append(inode, disk_start);
 	const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
 
-	WARN_ON(!PAGE_ALIGNED(start));
+	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+	       IS_ALIGNED(len, fs_info->sectorsize));
 	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
 	if (!cb)
 		return BLK_STS_RESOURCE;
-	refcount_set(&cb->pending_bios, 0);
+	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
 	cb->errors = 0;
 	cb->inode = &inode->vfs_inode;
 	cb->start = start;
@@ -418,118 +533,100 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 	cb->orig_bio = NULL;
 	cb->nr_pages = nr_pages;
 
-	bio = btrfs_bio_alloc(first_byte);
-	bio->bi_opf = bio_op | write_flags;
-	bio->bi_private = cb;
-	bio->bi_end_io = end_compressed_bio_write;
-
-	if (use_append) {
-		struct btrfs_device *device;
-
-		device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
-		if (IS_ERR(device)) {
-			kfree(cb);
-			bio_put(bio);
-			return BLK_STS_NOTSUPP;
+	while (cur_disk_bytenr < disk_start + compressed_len) {
+		u64 offset = cur_disk_bytenr - disk_start;
+		unsigned int index = offset >> PAGE_SHIFT;
+		unsigned int real_size;
+		unsigned int added;
+		struct page *page = compressed_pages[index];
+		bool submit = false;
+
+		/* Allocate new bio if submitted or not yet allocated */
+		if (!bio) {
+			bio = alloc_compressed_bio(cb, cur_disk_bytenr,
+				bio_op | write_flags, end_compressed_bio_write,
+				&next_stripe_start);
+			if (IS_ERR(bio)) {
+				ret = errno_to_blk_status(PTR_ERR(bio));
+				bio = NULL;
+				goto finish_cb;
+			}
 		}
-
-		bio_set_dev(bio, device->bdev);
-	}
-
-	if (blkcg_css) {
-		bio->bi_opf |= REQ_CGROUP_PUNT;
-		kthread_associate_blkcg(blkcg_css);
-	}
-	refcount_set(&cb->pending_bios, 1);
-
-	/* create and submit bios for the compressed pages */
-	bytes_left = compressed_len;
-	for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
-		int submit = 0;
-		int len = 0;
-
-		page = compressed_pages[pg_index];
-		page->mapping = inode->vfs_inode.i_mapping;
-		if (bio->bi_iter.bi_size)
-			submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
-							  0);
-
 		/*
-		 * Page can only be added to bio if the current bio fits in
-		 * stripe.
+		 * We should never reach next_stripe_start, as we will submit
+		 * the bio immediately when we reach the boundary.
 		 */
-		if (!submit) {
-			if (pg_index == 0 && use_append)
-				len = bio_add_zone_append_page(bio, page,
-							       PAGE_SIZE, 0);
-			else
-				len = bio_add_page(bio, page, PAGE_SIZE, 0);
-		}
-
-		page->mapping = NULL;
-		if (submit || len < PAGE_SIZE) {
-			/*
-			 * inc the count before we submit the bio so
-			 * we know the end IO handler won't happen before
-			 * we inc the count.  Otherwise, the cb might get
-			 * freed before we're done setting it up
-			 */
-			refcount_inc(&cb->pending_bios);
-			ret = btrfs_bio_wq_end_io(fs_info, bio,
-						  BTRFS_WQ_ENDIO_DATA);
-			BUG_ON(ret); /* -ENOMEM */
+		ASSERT(cur_disk_bytenr != next_stripe_start);
+		/*
+		 * We have various limits on the real write size:
+		 * - stripe boundary
+		 * - page boundary
+		 * - compressed length boundary
+		 */
+		real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
+		real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+		real_size = min_t(u64, real_size, compressed_len - offset);
+		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+
+		if (use_append)
+			added = bio_add_zone_append_page(bio, page, real_size,
+					offset_in_page(offset));
+		else
+			added = bio_add_page(bio, page, real_size,
					offset_in_page(offset));
+		/* Reached zoned boundary */
+		if (added == 0)
+			submit = true;
+
+		cur_disk_bytenr += added;
+		/* Reached stripe boundary */
+		if (cur_disk_bytenr == next_stripe_start)
+			submit = true;
+
+		/* Finished the range */
+		if (cur_disk_bytenr == disk_start + compressed_len)
+			submit = true;
+
+		if (submit) {
 			if (!skip_sum) {
 				ret = btrfs_csum_one_bio(inode, bio, start, 1);
-				BUG_ON(ret); /* -ENOMEM */
-			}
-
-			ret = btrfs_map_bio(fs_info, bio, 0);
-			if (ret) {
-				bio->bi_status = ret;
-				bio_endio(bio);
+				if (ret)
+					goto finish_cb;
 			}
 
-			bio = btrfs_bio_alloc(first_byte);
-			bio->bi_opf = bio_op | write_flags;
-			bio->bi_private = cb;
-			bio->bi_end_io = end_compressed_bio_write;
-			if (blkcg_css)
-				bio->bi_opf |= REQ_CGROUP_PUNT;
-			/*
-			 * Use bio_add_page() to ensure the bio has at least one
-			 * page.
-			 */
-			bio_add_page(bio, page, PAGE_SIZE, 0);
+			ret = submit_compressed_bio(fs_info, cb, bio, 0);
+			if (ret)
+				goto finish_cb;
+			bio = NULL;
 		}
-		if (bytes_left < PAGE_SIZE) {
-			btrfs_info(fs_info,
-					"bytes left %lu compress len %u nr %u",
-			       bytes_left, cb->compressed_len, cb->nr_pages);
-		}
-		bytes_left -= PAGE_SIZE;
-		first_byte += PAGE_SIZE;
 		cond_resched();
 	}
 
+	if (blkcg_css)
+		kthread_associate_blkcg(NULL);
 
-	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
-	BUG_ON(ret); /* -ENOMEM */
-
-	if (!skip_sum) {
-		ret = btrfs_csum_one_bio(inode, bio, start, 1);
-		BUG_ON(ret); /* -ENOMEM */
-	}
+	return 0;
 
-	ret = btrfs_map_bio(fs_info, bio, 0);
-	if (ret) {
+finish_cb:
+	if (bio) {
 		bio->bi_status = ret;
 		bio_endio(bio);
 	}
+	/* Last byte of @cb is submitted, endio will free @cb */
+	if (cur_disk_bytenr == disk_start + compressed_len)
+		return ret;
 
-	if (blkcg_css)
-		kthread_associate_blkcg(NULL);
-
-	return 0;
+	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+			   (disk_start + compressed_len - cur_disk_bytenr) >>
+			   fs_info->sectorsize_bits);
+	/*
+	 * Even with the previous bio ended, we should still have IO not yet
+	 * submitted, thus we need to finish it manually.
+	 */
+	ASSERT(refcount_read(&cb->pending_sectors));
+	/* Now we are the only one referring to @cb and can finish it safely. */
+	finish_compressed_bio_write(cb);
+	return ret;
 }
 
 static u64 bio_end_offset(struct bio *bio)
@@ -539,25 +636,33 @@
 	return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
 }
 
+/*
+ * Add extra pages in the same compressed file extent so that we don't need to
+ * re-read the same extent again and again.
+ *
+ * NOTE: this won't work well for subpage, as for subpage read, we lock the
+ * full page then submit a bio for each compressed/regular extent.
+ *
+ * This means, if several sectors in the same page point to the same
+ * on-disk compressed data, we will re-read the same extent many times and
+ * this function can only help for the next page.
+ */
 static noinline int add_ra_bio_pages(struct inode *inode,
 				     u64 compressed_end,
 				     struct compressed_bio *cb)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	unsigned long end_index;
-	unsigned long pg_index;
-	u64 last_offset;
+	u64 cur = bio_end_offset(cb->orig_bio);
 	u64 isize = i_size_read(inode);
 	int ret;
 	struct page *page;
-	unsigned long nr_pages = 0;
 	struct extent_map *em;
 	struct address_space *mapping = inode->i_mapping;
 	struct extent_map_tree *em_tree;
 	struct extent_io_tree *tree;
-	u64 end;
-	int misses = 0;
+	int sectors_missed = 0;
 
-	last_offset = bio_end_offset(cb->orig_bio);
 	em_tree = &BTRFS_I(inode)->extent_tree;
 	tree = &BTRFS_I(inode)->io_tree;
 
@@ -576,18 +681,29 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
-	while (last_offset < compressed_end) {
-		pg_index = last_offset >> PAGE_SHIFT;
+	while (cur < compressed_end) {
+		u64 page_end;
+		u64 pg_index = cur >> PAGE_SHIFT;
+		u32 add_size;
 
 		if (pg_index > end_index)
 			break;
 
 		page = xa_load(&mapping->i_pages, pg_index);
 		if (page && !xa_is_value(page)) {
-			misses++;
-			if (misses > 4)
+			sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+					  fs_info->sectorsize_bits;
+
+			/* Beyond threshold, no need to continue */
+			if (sectors_missed > 4)
 				break;
-			goto next;
+
+			/*
+			 * Jump to next page start as we already have page for
+			 * current offset.
+			 */
+			cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+			continue;
 		}
 
 		page = __page_cache_alloc(mapping_gfp_constraint(mapping,
@@ -597,14 +713,11 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 
 		if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
 			put_page(page);
-			goto next;
+			/* There is already a page, skip to page end */
+			cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+			continue;
 		}
 
-		/*
-		 * at this point, we have a locked page in the page cache
-		 * for these bytes in the file.  But, we have to make
-		 * sure they map to this compressed extent on disk.
-		 */
 		ret = set_page_extent_mapped(page);
 		if (ret < 0) {
 			unlock_page(page);
@@ -612,18 +725,22 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			break;
 		}
 
-		end = last_offset + PAGE_SIZE - 1;
-		lock_extent(tree, last_offset, end);
+		page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+		lock_extent(tree, cur, page_end);
 		read_lock(&em_tree->lock);
-		em = lookup_extent_mapping(em_tree, last_offset,
-					   PAGE_SIZE);
+		em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
 		read_unlock(&em_tree->lock);
 
-		if (!em || last_offset < em->start ||
-		    (last_offset + PAGE_SIZE > extent_map_end(em)) ||
+		/*
+		 * At this point, we have a locked page in the page cache for
+		 * these bytes in the file.  But, we have to make sure they map
+		 * to this compressed extent on disk.
+		 */
+		if (!em || cur < em->start ||
+		    (cur + fs_info->sectorsize > extent_map_end(em)) ||
 		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
 			free_extent_map(em);
-			unlock_extent(tree, last_offset, end);
+			unlock_extent(tree, cur, page_end);
 			unlock_page(page);
 			put_page(page);
 			break;
@@ -641,20 +758,23 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			}
 		}
 
-		ret = bio_add_page(cb->orig_bio, page,
-				   PAGE_SIZE, 0);
-
-		if (ret == PAGE_SIZE) {
-			nr_pages++;
-			put_page(page);
-		} else {
-			unlock_extent(tree, last_offset, end);
+		add_size = min(em->start + em->len, page_end + 1) - cur;
+		ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+		if (ret != add_size) {
+			unlock_extent(tree, cur, page_end);
 			unlock_page(page);
 			put_page(page);
 			break;
 		}
-next:
-		last_offset += PAGE_SIZE;
+		/*
+		 * If it's subpage, we also need to increase its
+		 * subpage::readers number, as at endio we will decrease
+		 * subpage::readers and unlock the page.
+		 */
+		if (fs_info->sectorsize < PAGE_SIZE)
+			btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+		put_page(page);
+		cur += add_size;
 	}
 	return 0;
 }
@@ -679,9 +799,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	unsigned int compressed_len;
 	unsigned int nr_pages;
 	unsigned int pg_index;
-	struct page *page;
-	struct bio *comp_bio;
-	u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+	struct bio *comp_bio = NULL;
+	const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+	u64 cur_disk_byte = disk_bytenr;
+	u64 next_stripe_start;
 	u64 file_offset;
 	u64 em_len;
 	u64 em_start;
@@ -708,7 +829,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	if (!cb)
 		goto out;
 
-	refcount_set(&cb->pending_bios, 0);
+	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
 	cb->errors = 0;
 	cb->inode = inode;
 	cb->mirror_num = mirror_num;
@@ -748,86 +869,74 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	/* include any pages we added in add_ra-bio_pages */
 	cb->len = bio->bi_iter.bi_size;
 
-	comp_bio = btrfs_bio_alloc(cur_disk_byte);
-	comp_bio->bi_opf = REQ_OP_READ;
-	comp_bio->bi_private = cb;
-	comp_bio->bi_end_io = end_compressed_bio_read;
-	refcount_set(&cb->pending_bios, 1);
-
-	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
-		u32 pg_len = PAGE_SIZE;
-		int submit = 0;
+	while (cur_disk_byte < disk_bytenr + compressed_len) {
+		u64 offset = cur_disk_byte - disk_bytenr;
+		unsigned int index = offset >> PAGE_SHIFT;
+		unsigned int real_size;
+		unsigned int added;
+		struct page *page = cb->compressed_pages[index];
+		bool submit = false;
+
+		/* Allocate new bio if submitted or not yet allocated */
+		if (!comp_bio) {
+			comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
+					REQ_OP_READ, end_compressed_bio_read,
+					&next_stripe_start);
+			if (IS_ERR(comp_bio)) {
+				ret = errno_to_blk_status(PTR_ERR(comp_bio));
+				comp_bio = NULL;
+				goto finish_cb;
+			}
+		}
+		/*
+		 * We should never reach next_stripe_start, as we will submit
+		 * comp_bio immediately when we reach the boundary.
+		 */
+		ASSERT(cur_disk_byte != next_stripe_start);
+		/*
+		 * We have various limits on the real read size:
+		 * - stripe boundary
+		 * - page boundary
+		 * - compressed length boundary
+		 */
+		real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
+		real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+		real_size = min_t(u64, real_size, compressed_len - offset);
+		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+		added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
 		/*
-		 * To handle subpage case, we need to make sure the bio only
-		 * covers the range we need.
-		 *
-		 * If we're at the last page, truncate the length to only cover
-		 * the remaining part.
+		 * The maximum compressed extent is smaller than the bio size
+		 * limit, thus bio_add_page() should always succeed.
 		 */
-		if (pg_index == nr_pages - 1)
-			pg_len = min_t(u32, PAGE_SIZE,
-					compressed_len - pg_index * PAGE_SIZE);
+		ASSERT(added == real_size);
+		cur_disk_byte += added;
 
-		page = cb->compressed_pages[pg_index];
-		page->mapping = inode->i_mapping;
-		page->index = em_start >> PAGE_SHIFT;
+		/* Reached stripe boundary, need to submit */
+		if (cur_disk_byte == next_stripe_start)
+			submit = true;
 
-		if (comp_bio->bi_iter.bi_size)
-			submit = btrfs_bio_fits_in_stripe(page, pg_len,
-							  comp_bio, 0);
+		/* Has finished the range, need to submit */
+		if (cur_disk_byte == disk_bytenr + compressed_len)
+			submit = true;
 
-		page->mapping = NULL;
-		if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
+		if (submit) {
 			unsigned int nr_sectors;
 
-			ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
-						  BTRFS_WQ_ENDIO_DATA);
-			BUG_ON(ret); /* -ENOMEM */
-
-			/*
-			 * inc the count before we submit the bio so
-			 * we know the end IO handler won't happen before
-			 * we inc the count.  Otherwise, the cb might get
-			 * freed before we're done setting it up
-			 */
-			refcount_inc(&cb->pending_bios);
-
 			ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
-			BUG_ON(ret); /* -ENOMEM */
+			if (ret)
+				goto finish_cb;
 
 			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
 						  fs_info->sectorsize);
 			sums += fs_info->csum_size * nr_sectors;
 
-			ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
-			if (ret) {
-				comp_bio->bi_status = ret;
-				bio_endio(comp_bio);
-			}
-
-			comp_bio = btrfs_bio_alloc(cur_disk_byte);
-			comp_bio->bi_opf = REQ_OP_READ;
-			comp_bio->bi_private = cb;
-			comp_bio->bi_end_io = end_compressed_bio_read;
-
-			bio_add_page(comp_bio, page, pg_len, 0);
+			ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
+			if (ret)
+				goto finish_cb;
+			comp_bio = NULL;
 		}
-		cur_disk_byte += pg_len;
 	}
-
-	ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
-	BUG_ON(ret); /* -ENOMEM */
-
-	ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
-	BUG_ON(ret); /* -ENOMEM */
-
-	ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
-	if (ret) {
-		comp_bio->bi_status = ret;
-		bio_endio(comp_bio);
-	}
-
 	return 0;
 
 fail2:
@@ -842,6 +951,26 @@ fail1:
 out:
 	free_extent_map(em);
 	return ret;
+finish_cb:
+	if (comp_bio) {
+		comp_bio->bi_status = ret;
+		bio_endio(comp_bio);
+	}
+	/* All bytes of @cb are submitted, endio will free @cb */
+	if (cur_disk_byte == disk_bytenr + compressed_len)
+		return ret;
+
+	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+			   (disk_bytenr + compressed_len - cur_disk_byte) >>
+			   fs_info->sectorsize_bits);
+	/*
+	 * Even with the previous bio ended, we should still have IO not yet
+	 * submitted, thus we need to finish @cb manually.
+	 */
+	ASSERT(refcount_read(&cb->pending_sectors));
+	/* Now we are the only one referring to @cb, and we can finish it safely. */
+	finish_compressed_bio_read(cb, NULL);
+	return ret;
 }
 
 /*
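The core of this change is the switch from per-bio accounting (pending_bios, bumped with refcount_inc() before each submission) to per-sector accounting (pending_sectors, primed once, up front, to compressed_len >> sectorsize_bits). Below is a minimal, single-threaded userspace sketch of that lifetime protocol. It mirrors the kernel names (dec_and_test_compressed_bio(), finish_compressed_bio_*()) but models refcount_t with a plain counter and elides wake_up_var()/wait_var_event(), so treat it as an illustrative analogy rather than kernel code.

	/*
	 * Sketch of the pending_sectors lifetime protocol.  Userspace analogy
	 * only: no concurrency, no refcount_t, no wait/wake machinery.
	 */
	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define SECTORSIZE_BITS 12              /* assume 4K sectors */

	struct compressed_bio {
		unsigned int pending_sectors;   /* sectors not yet completed */
		bool errors;
	};

	/* Analogous to dec_and_test_compressed_bio(): true only for the last io */
	static bool dec_and_test(struct compressed_bio *cb, unsigned int bio_bytes,
				 bool bio_failed)
	{
		if (bio_failed)
			cb->errors = true;
		cb->pending_sectors -= bio_bytes >> SECTORSIZE_BITS;
		/* kernel: wake_up_var(cb) here, so a failed submit path can wait */
		return cb->pending_sectors == 0;
	}

	int main(void)
	{
		unsigned int compressed_len = 3 << SECTORSIZE_BITS; /* 3 sectors */
		struct compressed_bio *cb = calloc(1, sizeof(*cb));

		/* Primed up front to cover the whole extent, before any submit */
		cb->pending_sectors = compressed_len >> SECTORSIZE_BITS;

		/* Two bios complete: first 2 sectors, then the final 1 sector */
		bool last = dec_and_test(cb, 2 << SECTORSIZE_BITS, false);
		printf("after bio 1: last=%d pending=%u\n", last, cb->pending_sectors);

		last = dec_and_test(cb, 1 << SECTORSIZE_BITS, false);
		printf("after bio 2: last=%d pending=%u\n", last, cb->pending_sectors);

		if (last)
			free(cb);       /* finish_compressed_bio_*() analogue */
		return 0;
	}

Because the count covers the whole extent before the first bio is submitted, an endio can never observe zero early, which is what made the old refcount_inc-before-submit dance necessary. It also gives the finish_cb error paths above a precise target: they compute how many sectors were never submitted ((disk_start + compressed_len - cur_disk_bytenr) >> sectorsize_bits), wait_var_event() until only those remain, and then free @cb by hand as the sole remaining owner.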