Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--	fs/btrfs/extent_io.c	848
1 file changed, 226 insertions, 622 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a1adadd5d25d..a91d5ad27984 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -98,33 +98,16 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)   */  struct btrfs_bio_ctrl {  	struct btrfs_bio *bbio; -	int mirror_num;  	enum btrfs_compression_type compress_type;  	u32 len_to_oe_boundary;  	blk_opf_t opf;  	btrfs_bio_end_io_t end_io_func;  	struct writeback_control *wbc; - -	/* -	 * This is for metadata read, to provide the extra needed verification -	 * info.  This has to be provided for submit_one_bio(), as -	 * submit_one_bio() can submit a bio if it ends at stripe boundary.  If -	 * no such parent_check is provided, the metadata can hit false alert at -	 * endio time. -	 */ -	struct btrfs_tree_parent_check *parent_check; - -	/* -	 * Tell writepage not to lock the state bits for this range, it still -	 * does the unlocking. -	 */ -	bool extent_locked;  };  static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)  {  	struct btrfs_bio *bbio = bio_ctrl->bbio; -	int mirror_num = bio_ctrl->mirror_num;  	if (!bbio)  		return; @@ -132,25 +115,11 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)  	/* Caller should ensure the bio has at least some range added */  	ASSERT(bbio->bio.bi_iter.bi_size); -	if (!is_data_inode(&bbio->inode->vfs_inode)) { -		if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) { -			/* -			 * For metadata read, we should have the parent_check, -			 * and copy it to bbio for metadata verification. -			 */ -			ASSERT(bio_ctrl->parent_check); -			memcpy(&bbio->parent_check, -			       bio_ctrl->parent_check, -			       sizeof(struct btrfs_tree_parent_check)); -		} -		bbio->bio.bi_opf |= REQ_META; -	} -  	if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ &&  	    bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) -		btrfs_submit_compressed_read(bbio, mirror_num); +		btrfs_submit_compressed_read(bbio);  	else -		btrfs_submit_bio(bbio, mirror_num); +		btrfs_submit_bio(bbio, 0);  	/* The bbio is owned by the end_io handler now */  	bio_ctrl->bbio = NULL; @@ -248,8 +217,6 @@ static int process_one_page(struct btrfs_fs_info *fs_info,  	if (page_ops & PAGE_SET_ORDERED)  		btrfs_page_clamp_set_ordered(fs_info, page, start, len); -	if (page_ops & PAGE_SET_ERROR) -		btrfs_page_clamp_set_error(fs_info, page, start, len);  	if (page_ops & PAGE_START_WRITEBACK) {  		btrfs_page_clamp_clear_dirty(fs_info, page, start, len);  		btrfs_page_clamp_set_writeback(fs_info, page, start, len); @@ -295,9 +262,6 @@ static int __process_pages_contig(struct address_space *mapping,  		ASSERT(processed_end && *processed_end == start);  	} -	if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) -		mapping_set_error(mapping, -EIO); -  	folio_batch_init(&fbatch);  	while (index <= end_index) {  		int found_folios; @@ -506,6 +470,15 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,  			       start, end, page_ops, NULL);  } +static bool btrfs_verify_page(struct page *page, u64 start) +{ +	if (!fsverity_active(page->mapping->host) || +	    PageUptodate(page) || +	    start >= i_size_read(page->mapping->host)) +		return true; +	return fsverity_verify_page(page); +} +  static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)  {  	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -513,20 +486,10 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)  	ASSERT(page_offset(page) <= start &&  	       start 
+ len <= page_offset(page) + PAGE_SIZE); -	if (uptodate) { -		if (fsverity_active(page->mapping->host) && -		    !PageError(page) && -		    !PageUptodate(page) && -		    start < i_size_read(page->mapping->host) && -		    !fsverity_verify_page(page)) { -			btrfs_page_set_error(fs_info, page, start, len); -		} else { -			btrfs_page_set_uptodate(fs_info, page, start, len); -		} -	} else { +	if (uptodate && btrfs_verify_page(page, start)) +		btrfs_page_set_uptodate(fs_info, page, start, len); +	else  		btrfs_page_clear_uptodate(fs_info, page, start, len); -		btrfs_page_set_error(fs_info, page, start, len); -	}  	if (!btrfs_is_subpage(fs_info, page))  		unlock_page(page); @@ -554,7 +517,6 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)  		len = end + 1 - start;  		btrfs_page_clear_uptodate(fs_info, page, start, len); -		btrfs_page_set_error(fs_info, page, start, len);  		ret = err < 0 ? err : -EIO;  		mapping_set_error(page->mapping, ret);  	} @@ -574,8 +536,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio)  	struct bio *bio = &bbio->bio;  	int error = blk_status_to_errno(bio->bi_status);  	struct bio_vec *bvec; -	u64 start; -	u64 end;  	struct bvec_iter_all iter_all;  	ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -584,6 +544,8 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio)  		struct inode *inode = page->mapping->host;  		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);  		const u32 sectorsize = fs_info->sectorsize; +		u64 start = page_offset(page) + bvec->bv_offset; +		u32 len = bvec->bv_len;  		/* Our read/write should always be sector aligned. */  		if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) @@ -595,12 +557,12 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio)  		"incomplete page write with offset %u and length %u",  				   bvec->bv_offset, bvec->bv_len); -		start = page_offset(page) + bvec->bv_offset; -		end = start + bvec->bv_len - 1; - -		end_extent_writepage(page, error, start, end); - -		btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); +		btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); +		if (error) { +			btrfs_page_clear_uptodate(fs_info, page, start, len); +			mapping_set_error(page->mapping, error); +		} +		btrfs_page_clear_writeback(fs_info, page, start, len);  	}  	bio_put(bio); @@ -686,35 +648,6 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)  }  /* - * Find extent buffer for a givne bytenr. - * - * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking - * in endio context. - */ -static struct extent_buffer *find_extent_buffer_readpage( -		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) -{ -	struct extent_buffer *eb; - -	/* -	 * For regular sectorsize, we can use page->private to grab extent -	 * buffer -	 */ -	if (fs_info->nodesize >= PAGE_SIZE) { -		ASSERT(PagePrivate(page) && page->private); -		return (struct extent_buffer *)page->private; -	} - -	/* For subpage case, we need to lookup buffer radix tree */ -	rcu_read_lock(); -	eb = radix_tree_lookup(&fs_info->buffer_radix, -			       bytenr >> fs_info->sectorsize_bits); -	rcu_read_unlock(); -	ASSERT(eb); -	return eb; -} - -/*   * after a readpage IO is done, we need to:   * clear the uptodate bits on error   * set the uptodate bits if things worked @@ -735,7 +668,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)  	 * larger than UINT_MAX, u32 here is enough.  	 
*/  	u32 bio_offset = 0; -	int mirror;  	struct bvec_iter_all iter_all;  	ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -775,11 +707,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)  		end = start + bvec->bv_len - 1;  		len = bvec->bv_len; -		mirror = bbio->mirror_num; -		if (uptodate && !is_data_inode(inode) && -		    btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) -			uptodate = false; -  		if (likely(uptodate)) {  			loff_t i_size = i_size_read(inode);  			pgoff_t end_index = i_size >> PAGE_SHIFT; @@ -800,19 +727,12 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)  				zero_user_segment(page, zero_start,  						  offset_in_page(end) + 1);  			} -		} else if (!is_data_inode(inode)) { -			struct extent_buffer *eb; - -			eb = find_extent_buffer_readpage(fs_info, page, start); -			set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); -			eb->read_mirror = mirror; -			atomic_dec(&eb->io_pages);  		}  		/* Update page status and unlock. */  		end_page_read(page, uptodate, start, len);  		endio_readpage_release_extent(&processed, BTRFS_I(inode), -					      start, end, PageUptodate(page)); +					      start, end, uptodate);  		ASSERT(bio_offset + len > bio_offset);  		bio_offset += len; @@ -906,13 +826,8 @@ static void alloc_new_bio(struct btrfs_inode *inode,  	bio_ctrl->bbio = bbio;  	bio_ctrl->len_to_oe_boundary = U32_MAX; -	/* -	 * Limit the extent to the ordered boundary for Zone Append. -	 * Compressed bios aren't submitted directly, so it doesn't apply to -	 * them. -	 */ -	if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && -	    btrfs_use_zone_append(bbio)) { +	/* Limit data write bios to the ordered boundary. */ +	if (bio_ctrl->wbc) {  		struct btrfs_ordered_extent *ordered;  		ordered = btrfs_lookup_ordered_extent(inode, file_offset); @@ -920,11 +835,9 @@ static void alloc_new_bio(struct btrfs_inode *inode,  			bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,  					ordered->file_offset +  					ordered->disk_num_bytes - file_offset); -			btrfs_put_ordered_extent(ordered); +			bbio->ordered = ordered;  		} -	} -	if (bio_ctrl->wbc) {  		/*  		 * Pick the last added device to support cgroup writeback.  
For  		 * multi-device file systems this means blk-cgroup policies have @@ -1125,7 +1038,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,  	ret = set_page_extent_mapped(page);  	if (ret < 0) {  		unlock_extent(tree, start, end, NULL); -		btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);  		unlock_page(page);  		return ret;  	} @@ -1329,11 +1241,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,  		}  		ret = btrfs_run_delalloc_range(inode, page, delalloc_start,  				delalloc_end, &page_started, &nr_written, wbc); -		if (ret) { -			btrfs_page_set_error(inode->root->fs_info, page, -					     page_offset(page), PAGE_SIZE); +		if (ret)  			return ret; -		} +  		/*  		 * delalloc_end is already one less than the total length, so  		 * we don't subtract one from PAGE_SIZE @@ -1438,7 +1348,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,  	struct extent_map *em;  	int ret = 0;  	int nr = 0; -	bool compressed;  	ret = btrfs_writepage_cow_fixup(page);  	if (ret) { @@ -1448,12 +1357,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,  		return 1;  	} -	/* -	 * we don't want to touch the inode after unlocking the page, -	 * so we update the mapping writeback index now -	 */ -	bio_ctrl->wbc->nr_to_write--; -  	bio_ctrl->end_io_func = end_bio_extent_writepage;  	while (cur <= end) {  		u64 disk_bytenr; @@ -1486,7 +1389,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,  		em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);  		if (IS_ERR(em)) { -			btrfs_page_set_error(fs_info, page, cur, end - cur + 1);  			ret = PTR_ERR_OR_ZERO(em);  			goto out_error;  		} @@ -1497,10 +1399,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,  		ASSERT(cur < end);  		ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));  		ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); +  		block_start = em->block_start; -		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);  		disk_bytenr = em->block_start + extent_offset; +		ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); +		ASSERT(block_start != EXTENT_MAP_HOLE); +		ASSERT(block_start != EXTENT_MAP_INLINE); +  		/*  		 * Note that em_end from extent_map_end() and dirty_range_end from  		 * find_next_dirty_byte() are all exclusive @@ -1509,22 +1415,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,  		free_extent_map(em);  		em = NULL; -		/* -		 * compressed and inline extents are written through other -		 * paths in the FS -		 */ -		if (compressed || block_start == EXTENT_MAP_HOLE || -		    block_start == EXTENT_MAP_INLINE) { -			if (compressed) -				nr++; -			else -				btrfs_writepage_endio_finish_ordered(inode, -						page, cur, cur + iosize - 1, true); -			btrfs_page_clear_dirty(fs_info, page, cur, iosize); -			cur += iosize; -			continue; -		} -  		btrfs_set_range_writeback(inode, cur, cur + iosize - 1);  		if (!PageWriteback(page)) {  			btrfs_err(inode->root->fs_info, @@ -1572,7 +1462,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl  {  	struct folio *folio = page_folio(page);  	struct inode *inode = page->mapping->host; -	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);  	const u64 page_start = page_offset(page);  	const u64 page_end = page_start + PAGE_SIZE - 1;  	int ret; @@ -1585,9 +1474,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl  	
WARN_ON(!PageLocked(page)); -	btrfs_page_clear_error(btrfs_sb(inode->i_sb), page, -			       page_offset(page), PAGE_SIZE); -  	pg_offset = offset_in_page(i_size);  	if (page->index > end_index ||  	   (page->index == end_index && !pg_offset)) { @@ -1600,77 +1486,30 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl  		memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);  	ret = set_page_extent_mapped(page); -	if (ret < 0) { -		SetPageError(page); +	if (ret < 0)  		goto done; -	} -	if (!bio_ctrl->extent_locked) { -		ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); -		if (ret == 1) -			return 0; -		if (ret) -			goto done; -	} +	ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); +	if (ret == 1) +		return 0; +	if (ret) +		goto done;  	ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr);  	if (ret == 1)  		return 0; +	bio_ctrl->wbc->nr_to_write--; +  done:  	if (nr == 0) {  		/* make sure the mapping tag for page dirty gets cleared */  		set_page_writeback(page);  		end_page_writeback(page);  	} -	/* -	 * Here we used to have a check for PageError() and then set @ret and -	 * call end_extent_writepage(). -	 * -	 * But in fact setting @ret here will cause different error paths -	 * between subpage and regular sectorsize. -	 * -	 * For regular page size, we never submit current page, but only add -	 * current page to current bio. -	 * The bio submission can only happen in next page. -	 * Thus if we hit the PageError() branch, @ret is already set to -	 * non-zero value and will not get updated for regular sectorsize. -	 * -	 * But for subpage case, it's possible we submit part of current page, -	 * thus can get PageError() set by submitted bio of the same page, -	 * while our @ret is still 0. -	 * -	 * So here we unify the behavior and don't set @ret. -	 * Error can still be properly passed to higher layer as page will -	 * be set error, here we just don't handle the IO failure. -	 * -	 * NOTE: This is just a hotfix for subpage. -	 * The root fix will be properly ending ordered extent when we hit -	 * an error during writeback. -	 * -	 * But that needs a bigger refactoring, as we not only need to grab the -	 * submitted OE, but also need to know exactly at which bytenr we hit -	 * the error. -	 * Currently the full page based __extent_writepage_io() is not -	 * capable of that. -	 */ -	if (PageError(page)) +	if (ret)  		end_extent_writepage(page, ret, page_start, page_end); -	if (bio_ctrl->extent_locked) { -		struct writeback_control *wbc = bio_ctrl->wbc; - -		/* -		 * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), -		 * the page can either be locked by lock_page() or -		 * process_one_page(). -		 * Let btrfs_page_unlock_writer() handle both cases. -		 */ -		ASSERT(wbc); -		btrfs_page_unlock_writer(fs_info, page, wbc->range_start, -					 wbc->range_end + 1 - wbc->range_start); -	} else { -		unlock_page(page); -	} +	unlock_page(page);  	ASSERT(ret <= 0);  	return ret;  } @@ -1681,52 +1520,26 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)  		       TASK_UNINTERRUPTIBLE);  } -static void end_extent_buffer_writeback(struct extent_buffer *eb) -{ -	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); -	smp_mb__after_atomic(); -	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); -} -  /*   * Lock extent buffer status and pages for writeback.   * - * May try to flush write bio if we can't get the lock. - * - * Return  0 if the extent buffer doesn't need to be submitted. - *           (E.g. 
the extent buffer is not dirty) - * Return >0 is the extent buffer is submitted to bio. - * Return <0 if something went wrong, no page is locked. + * Return %false if the extent buffer doesn't need to be submitted (e.g. the + * extent buffer is not dirty) + * Return %true is the extent buffer is submitted to bio.   */ -static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, -			  struct btrfs_bio_ctrl *bio_ctrl) +static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, +			  struct writeback_control *wbc)  {  	struct btrfs_fs_info *fs_info = eb->fs_info; -	int i, num_pages; -	int flush = 0; -	int ret = 0; +	bool ret = false; -	if (!btrfs_try_tree_write_lock(eb)) { -		submit_write_bio(bio_ctrl, 0); -		flush = 1; -		btrfs_tree_lock(eb); -	} - -	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { +	btrfs_tree_lock(eb); +	while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {  		btrfs_tree_unlock(eb); -		if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL) -			return 0; -		if (!flush) { -			submit_write_bio(bio_ctrl, 0); -			flush = 1; -		} -		while (1) { -			wait_on_extent_buffer_writeback(eb); -			btrfs_tree_lock(eb); -			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) -				break; -			btrfs_tree_unlock(eb); -		} +		if (wbc->sync_mode != WB_SYNC_ALL) +			return false; +		wait_on_extent_buffer_writeback(eb); +		btrfs_tree_lock(eb);  	}  	/* @@ -1742,45 +1555,19 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb  		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,  					 -eb->len,  					 fs_info->dirty_metadata_batch); -		ret = 1; +		ret = true;  	} else {  		spin_unlock(&eb->refs_lock);  	} -  	btrfs_tree_unlock(eb); - -	/* -	 * Either we don't need to submit any tree block, or we're submitting -	 * subpage eb. -	 * Subpage metadata doesn't use page locking at all, so we can skip -	 * the page locking. -	 */ -	if (!ret || fs_info->nodesize < PAGE_SIZE) -		return ret; - -	num_pages = num_extent_pages(eb); -	for (i = 0; i < num_pages; i++) { -		struct page *p = eb->pages[i]; - -		if (!trylock_page(p)) { -			if (!flush) { -				submit_write_bio(bio_ctrl, 0); -				flush = 1; -			} -			lock_page(p); -		} -	} -  	return ret;  } -static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) +static void set_btree_ioerr(struct extent_buffer *eb)  {  	struct btrfs_fs_info *fs_info = eb->fs_info; -	btrfs_page_set_error(fs_info, page, eb->start, eb->len); -	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) -		return; +	set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);  	/*  	 * A read may stumble upon this buffer later, make sure that it gets an @@ -1794,7 +1581,7 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)  	 * return a 0 because we are readonly if we don't modify the err seq for  	 * the superblock.  	 */ -	mapping_set_error(page->mapping, -EIO); +	mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO);  	/*  	 * If writeback for a btree extent that doesn't belong to a log tree @@ -1869,101 +1656,34 @@ static struct extent_buffer *find_extent_buffer_nolock(  	return NULL;  } -/* - * The endio function for subpage extent buffer write. - * - * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() - * after all extent buffers in the page has finished their writeback. 
- */ -static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio) +static void extent_buffer_write_end_io(struct btrfs_bio *bbio)  { -	struct bio *bio = &bbio->bio; -	struct btrfs_fs_info *fs_info; -	struct bio_vec *bvec; +	struct extent_buffer *eb = bbio->private; +	struct btrfs_fs_info *fs_info = eb->fs_info; +	bool uptodate = !bbio->bio.bi_status;  	struct bvec_iter_all iter_all; +	struct bio_vec *bvec; +	u32 bio_offset = 0; -	fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); -	ASSERT(fs_info->nodesize < PAGE_SIZE); +	if (!uptodate) +		set_btree_ioerr(eb); -	ASSERT(!bio_flagged(bio, BIO_CLONED)); -	bio_for_each_segment_all(bvec, bio, iter_all) { +	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { +		u64 start = eb->start + bio_offset;  		struct page *page = bvec->bv_page; -		u64 bvec_start = page_offset(page) + bvec->bv_offset; -		u64 bvec_end = bvec_start + bvec->bv_len - 1; -		u64 cur_bytenr = bvec_start; - -		ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize)); - -		/* Iterate through all extent buffers in the range */ -		while (cur_bytenr <= bvec_end) { -			struct extent_buffer *eb; -			int done; - -			/* -			 * Here we can't use find_extent_buffer(), as it may -			 * try to lock eb->refs_lock, which is not safe in endio -			 * context. -			 */ -			eb = find_extent_buffer_nolock(fs_info, cur_bytenr); -			ASSERT(eb); - -			cur_bytenr = eb->start + eb->len; - -			ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)); -			done = atomic_dec_and_test(&eb->io_pages); -			ASSERT(done); - -			if (bio->bi_status || -			    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { -				ClearPageUptodate(page); -				set_btree_ioerr(page, eb); -			} +		u32 len = bvec->bv_len; -			btrfs_subpage_clear_writeback(fs_info, page, eb->start, -						      eb->len); -			end_extent_buffer_writeback(eb); -			/* -			 * free_extent_buffer() will grab spinlock which is not -			 * safe in endio context. Thus here we manually dec -			 * the ref. 
-			 */ -			atomic_dec(&eb->refs); -		} +		if (!uptodate) +			btrfs_page_clear_uptodate(fs_info, page, start, len); +		btrfs_page_clear_writeback(fs_info, page, start, len); +		bio_offset += len;  	} -	bio_put(bio); -} -static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio) -{ -	struct bio *bio = &bbio->bio; -	struct bio_vec *bvec; -	struct extent_buffer *eb; -	int done; -	struct bvec_iter_all iter_all; - -	ASSERT(!bio_flagged(bio, BIO_CLONED)); -	bio_for_each_segment_all(bvec, bio, iter_all) { -		struct page *page = bvec->bv_page; - -		eb = (struct extent_buffer *)page->private; -		BUG_ON(!eb); -		done = atomic_dec_and_test(&eb->io_pages); - -		if (bio->bi_status || -		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { -			ClearPageUptodate(page); -			set_btree_ioerr(page, eb); -		} - -		end_page_writeback(page); - -		if (!done) -			continue; - -		end_extent_buffer_writeback(eb); -	} +	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); +	smp_mb__after_atomic(); +	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); -	bio_put(bio); +	bio_put(&bbio->bio);  }  static void prepare_eb_write(struct extent_buffer *eb) @@ -1973,7 +1693,6 @@ static void prepare_eb_write(struct extent_buffer *eb)  	unsigned long end;  	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); -	atomic_set(&eb->io_pages, num_extent_pages(eb));  	/* Set btree blocks beyond nritems with 0 to avoid stale content */  	nritems = btrfs_header_nritems(eb); @@ -1995,63 +1714,49 @@ static void prepare_eb_write(struct extent_buffer *eb)  	}  } -/* - * Unlike the work in write_one_eb(), we rely completely on extent locking. - * Page locking is only utilized at minimum to keep the VMM code happy. - */ -static void write_one_subpage_eb(struct extent_buffer *eb, -				 struct btrfs_bio_ctrl *bio_ctrl) -{ -	struct btrfs_fs_info *fs_info = eb->fs_info; -	struct page *page = eb->pages[0]; -	bool no_dirty_ebs = false; - -	prepare_eb_write(eb); - -	/* clear_page_dirty_for_io() in subpage helper needs page locked */ -	lock_page(page); -	btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); - -	/* Check if this is the last dirty bit to update nr_written */ -	no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page, -							  eb->start, eb->len); -	if (no_dirty_ebs) -		clear_page_dirty_for_io(page); - -	bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; - -	submit_extent_page(bio_ctrl, eb->start, page, eb->len, -			   eb->start - page_offset(page)); -	unlock_page(page); -	/* -	 * Submission finished without problem, if no range of the page is -	 * dirty anymore, we have submitted a page.  Update nr_written in wbc. 
-	 */ -	if (no_dirty_ebs) -		bio_ctrl->wbc->nr_to_write--; -} -  static noinline_for_stack void write_one_eb(struct extent_buffer *eb, -			struct btrfs_bio_ctrl *bio_ctrl) +					    struct writeback_control *wbc)  { -	u64 disk_bytenr = eb->start; -	int i, num_pages; +	struct btrfs_fs_info *fs_info = eb->fs_info; +	struct btrfs_bio *bbio;  	prepare_eb_write(eb); -	bio_ctrl->end_io_func = end_bio_extent_buffer_writepage; - -	num_pages = num_extent_pages(eb); -	for (i = 0; i < num_pages; i++) { -		struct page *p = eb->pages[i]; - -		clear_page_dirty_for_io(p); -		set_page_writeback(p); -		submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0); -		disk_bytenr += PAGE_SIZE; -		bio_ctrl->wbc->nr_to_write--; +	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, +			       REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), +			       eb->fs_info, extent_buffer_write_end_io, eb); +	bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; +	bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); +	wbc_init_bio(wbc, &bbio->bio); +	bbio->inode = BTRFS_I(eb->fs_info->btree_inode); +	bbio->file_offset = eb->start; +	if (fs_info->nodesize < PAGE_SIZE) { +		struct page *p = eb->pages[0]; + +		lock_page(p); +		btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len); +		if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start, +						       eb->len)) { +			clear_page_dirty_for_io(p); +			wbc->nr_to_write--; +		} +		__bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p)); +		wbc_account_cgroup_owner(wbc, p, eb->len);  		unlock_page(p); +	} else { +		for (int i = 0; i < num_extent_pages(eb); i++) { +			struct page *p = eb->pages[i]; + +			lock_page(p); +			clear_page_dirty_for_io(p); +			set_page_writeback(p); +			__bio_add_page(&bbio->bio, p, PAGE_SIZE, 0); +			wbc_account_cgroup_owner(wbc, p, PAGE_SIZE); +			wbc->nr_to_write--; +			unlock_page(p); +		}  	} +	btrfs_submit_bio(bbio, 0);  }  /* @@ -2068,14 +1773,13 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,   * Return >=0 for the number of submitted extent buffers.   * Return <0 for fatal error.   */ -static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) +static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)  {  	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);  	int submitted = 0;  	u64 page_start = page_offset(page);  	int bit_start = 0;  	int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; -	int ret;  	/* Lock and write each dirty extent buffers in the range */  	while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { @@ -2121,25 +1825,13 @@ static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)  		if (!eb)  			continue; -		ret = lock_extent_buffer_for_io(eb, bio_ctrl); -		if (ret == 0) { -			free_extent_buffer(eb); -			continue; +		if (lock_extent_buffer_for_io(eb, wbc)) { +			write_one_eb(eb, wbc); +			submitted++;  		} -		if (ret < 0) { -			free_extent_buffer(eb); -			goto cleanup; -		} -		write_one_subpage_eb(eb, bio_ctrl);  		free_extent_buffer(eb); -		submitted++;  	}  	return submitted; - -cleanup: -	/* We hit error, end bio for the submitted extent buffers */ -	submit_write_bio(bio_ctrl, ret); -	return ret;  }  /* @@ -2162,7 +1854,7 @@ cleanup:   * previous call.   * Return <0 for fatal error.   
*/ -static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, +static int submit_eb_page(struct page *page, struct writeback_control *wbc,  			  struct extent_buffer **eb_context)  {  	struct address_space *mapping = page->mapping; @@ -2174,7 +1866,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,  		return 0;  	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) -		return submit_eb_subpage(page, bio_ctrl); +		return submit_eb_subpage(page, wbc);  	spin_lock(&mapping->private_lock);  	if (!PagePrivate(page)) { @@ -2207,8 +1899,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,  		 * If for_sync, this hole will be filled with  		 * trasnsaction commit.  		 */ -		if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL && -		    !bio_ctrl->wbc->for_sync) +		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)  			ret = -EAGAIN;  		else  			ret = 0; @@ -2218,13 +1909,12 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,  	*eb_context = eb; -	ret = lock_extent_buffer_for_io(eb, bio_ctrl); -	if (ret <= 0) { +	if (!lock_extent_buffer_for_io(eb, wbc)) {  		btrfs_revert_meta_write_pointer(cache, eb);  		if (cache)  			btrfs_put_block_group(cache);  		free_extent_buffer(eb); -		return ret; +		return 0;  	}  	if (cache) {  		/* @@ -2233,7 +1923,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl,  		btrfs_schedule_zone_finish_bg(cache, eb);  		btrfs_put_block_group(cache);  	} -	write_one_eb(eb, bio_ctrl); +	write_one_eb(eb, wbc);  	free_extent_buffer(eb);  	return 1;  } @@ -2242,11 +1932,6 @@ int btree_write_cache_pages(struct address_space *mapping,  				   struct writeback_control *wbc)  {  	struct extent_buffer *eb_context = NULL; -	struct btrfs_bio_ctrl bio_ctrl = { -		.wbc = wbc, -		.opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), -		.extent_locked = 0, -	};  	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;  	int ret = 0;  	int done = 0; @@ -2288,7 +1973,7 @@ retry:  		for (i = 0; i < nr_folios; i++) {  			struct folio *folio = fbatch.folios[i]; -			ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context); +			ret = submit_eb_page(&folio->page, wbc, &eb_context);  			if (ret == 0)  				continue;  			if (ret < 0) { @@ -2349,8 +2034,6 @@ retry:  		ret = 0;  	if (!ret && BTRFS_FS_ERROR(fs_info))  		ret = -EROFS; -	submit_write_bio(&bio_ctrl, ret); -  	btrfs_zoned_meta_io_unlock(fs_info);  	return ret;  } @@ -2520,38 +2203,31 @@ retry:   * already been ran (aka, ordered extent inserted) and all pages are still   * locked.   
*/ -int extent_write_locked_range(struct inode *inode, u64 start, u64 end) +int extent_write_locked_range(struct inode *inode, u64 start, u64 end, +			      struct writeback_control *wbc)  {  	bool found_error = false;  	int first_error = 0;  	int ret = 0;  	struct address_space *mapping = inode->i_mapping; -	struct page *page; +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +	const u32 sectorsize = fs_info->sectorsize; +	loff_t i_size = i_size_read(inode);  	u64 cur = start; -	unsigned long nr_pages; -	const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; -	struct writeback_control wbc_writepages = { -		.sync_mode	= WB_SYNC_ALL, -		.range_start	= start, -		.range_end	= end + 1, -		.no_cgroup_owner = 1, -	};  	struct btrfs_bio_ctrl bio_ctrl = { -		.wbc = &wbc_writepages, -		/* We're called from an async helper function */ -		.opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT | -			wbc_to_write_flags(&wbc_writepages), -		.extent_locked = 1, +		.wbc = wbc, +		.opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),  	}; +	if (wbc->no_cgroup_owner) +		bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT; +  	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); -	nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> -		   PAGE_SHIFT; -	wbc_writepages.nr_to_write = nr_pages * 2; -	wbc_attach_fdatawrite_inode(&wbc_writepages, inode);  	while (cur <= end) {  		u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); +		struct page *page; +		int nr = 0;  		page = find_get_page(mapping, cur >> PAGE_SHIFT);  		/* @@ -2562,19 +2238,31 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)  		ASSERT(PageLocked(page));  		ASSERT(PageDirty(page));  		clear_page_dirty_for_io(page); -		ret = __extent_writepage(page, &bio_ctrl); -		ASSERT(ret <= 0); + +		ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl, +					    i_size, &nr); +		if (ret == 1) +			goto next_page; + +		/* Make sure the mapping tag for page dirty gets cleared. */ +		if (nr == 0) { +			set_page_writeback(page); +			end_page_writeback(page); +		} +		if (ret) +			end_extent_writepage(page, ret, cur, cur_end); +		btrfs_page_unlock_writer(fs_info, page, cur, cur_end + 1 - cur);  		if (ret < 0) {  			found_error = true;  			first_error = ret;  		} +next_page:  		put_page(page);  		cur = cur_end + 1;  	}  	submit_write_bio(&bio_ctrl, found_error ? ret : 0); -	wbc_detach_inode(&wbc_writepages);  	if (found_error)  		return first_error;  	return ret; @@ -2588,7 +2276,6 @@ int extent_writepages(struct address_space *mapping,  	struct btrfs_bio_ctrl bio_ctrl = {  		.wbc = wbc,  		.opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), -		.extent_locked = 0,  	};  	/* @@ -2679,8 +2366,7 @@ static int try_release_extent_state(struct extent_io_tree *tree,  		 * The delalloc new bit will be cleared by ordered extent  		 * completion.  		 */ -		ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, -					 mask, NULL); +		ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);  		/* if clear_extent_bit failed for enomem reasons,  		 * we can't allow the release to continue. 
@@ -3421,10 +3107,9 @@ static void __free_extent_buffer(struct extent_buffer *eb)  	kmem_cache_free(extent_buffer_cache, eb);  } -int extent_buffer_under_io(const struct extent_buffer *eb) +static int extent_buffer_under_io(const struct extent_buffer *eb)  { -	return (atomic_read(&eb->io_pages) || -		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || +	return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||  		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));  } @@ -3557,11 +3242,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,  	init_rwsem(&eb->lock);  	btrfs_leak_debug_add_eb(eb); -	INIT_LIST_HEAD(&eb->release_list);  	spin_lock_init(&eb->refs_lock);  	atomic_set(&eb->refs, 1); -	atomic_set(&eb->io_pages, 0);  	ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); @@ -3678,9 +3361,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)  	 * adequately protected by the refcount, but the TREE_REF bit and  	 * its corresponding reference are not. To protect against this  	 * class of races, we call check_buffer_tree_ref from the codepaths -	 * which trigger io after they set eb->io_pages. Note that once io is -	 * initiated, TREE_REF can no longer be cleared, so that is the -	 * moment at which any such race is best fixed. +	 * which trigger io. Note that once io is initiated, TREE_REF can no +	 * longer be cleared, so that is the moment at which any such race is +	 * best fixed.  	 */  	refs = atomic_read(&eb->refs);  	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -3939,7 +3622,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,  		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));  		eb->pages[i] = p; -		if (!PageUptodate(p)) +		if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len))  			uptodate = 0;  		/* @@ -4142,13 +3825,12 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,  			continue;  		lock_page(page);  		btree_clear_page_dirty(page); -		ClearPageError(page);  		unlock_page(page);  	}  	WARN_ON(atomic_read(&eb->refs) == 0);  } -bool set_extent_buffer_dirty(struct extent_buffer *eb) +void set_extent_buffer_dirty(struct extent_buffer *eb)  {  	int i;  	int num_pages; @@ -4183,13 +3865,14 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)  					     eb->start, eb->len);  		if (subpage)  			unlock_page(eb->pages[0]); +		percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, +					 eb->len, +					 eb->fs_info->dirty_metadata_batch);  	}  #ifdef CONFIG_BTRFS_DEBUG  	for (i = 0; i < num_pages; i++)  		ASSERT(PageDirty(eb->pages[i]));  #endif - -	return was_dirty;  }  void clear_extent_buffer_uptodate(struct extent_buffer *eb) @@ -4242,84 +3925,54 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)  	}  } -static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, -				      int mirror_num, -				      struct btrfs_tree_parent_check *check) +static void extent_buffer_read_end_io(struct btrfs_bio *bbio)  { +	struct extent_buffer *eb = bbio->private;  	struct btrfs_fs_info *fs_info = eb->fs_info; -	struct extent_io_tree *io_tree; -	struct page *page = eb->pages[0]; -	struct extent_state *cached_state = NULL; -	struct btrfs_bio_ctrl bio_ctrl = { -		.opf = REQ_OP_READ, -		.mirror_num = mirror_num, -		.parent_check = check, -	}; -	int ret; +	bool uptodate = !bbio->bio.bi_status; +	struct bvec_iter_all iter_all; +	struct bio_vec *bvec; +	u32 bio_offset = 0; -	ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); -	ASSERT(PagePrivate(page)); -	ASSERT(check); -	
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; +	eb->read_mirror = bbio->mirror_num; -	if (wait == WAIT_NONE) { -		if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, -				     &cached_state)) -			return -EAGAIN; -	} else { -		ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, -				  &cached_state); -		if (ret < 0) -			return ret; -	} +	if (uptodate && +	    btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) +		uptodate = false; -	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || -	    PageUptodate(page) || -	    btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { -		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -		unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, -			      &cached_state); -		return 0; +	if (uptodate) { +		set_extent_buffer_uptodate(eb); +	} else { +		clear_extent_buffer_uptodate(eb); +		set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);  	} -	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); -	eb->read_mirror = 0; -	atomic_set(&eb->io_pages, 1); -	check_buffer_tree_ref(eb); -	bio_ctrl.end_io_func = end_bio_extent_readpage; +	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { +		u64 start = eb->start + bio_offset; +		struct page *page = bvec->bv_page; +		u32 len = bvec->bv_len; -	btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); +		if (uptodate) +			btrfs_page_set_uptodate(fs_info, page, start, len); +		else +			btrfs_page_clear_uptodate(fs_info, page, start, len); -	btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); -	submit_extent_page(&bio_ctrl, eb->start, page, eb->len, -			   eb->start - page_offset(page)); -	submit_one_bio(&bio_ctrl); -	if (wait != WAIT_COMPLETE) { -		free_extent_state(cached_state); -		return 0; +		bio_offset += len;  	} -	wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, -			EXTENT_LOCKED, &cached_state); -	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) -		return -EIO; -	return 0; +	clear_bit(EXTENT_BUFFER_READING, &eb->bflags); +	smp_mb__after_atomic(); +	wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); +	free_extent_buffer(eb); + +	bio_put(&bbio->bio);  }  int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,  			     struct btrfs_tree_parent_check *check)  { -	int i; -	struct page *page; -	int locked_pages = 0; -	int all_uptodate = 1; -	int num_pages; -	unsigned long num_reads = 0; -	struct btrfs_bio_ctrl bio_ctrl = { -		.opf = REQ_OP_READ, -		.mirror_num = mirror_num, -		.parent_check = check, -	}; +	int num_pages = num_extent_pages(eb), i; +	struct btrfs_bio *bbio;  	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))  		return 0; @@ -4332,87 +3985,39 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,  	if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))  		return -EIO; -	if (eb->fs_info->nodesize < PAGE_SIZE) -		return read_extent_buffer_subpage(eb, wait, mirror_num, check); - -	num_pages = num_extent_pages(eb); -	for (i = 0; i < num_pages; i++) { -		page = eb->pages[i]; -		if (wait == WAIT_NONE) { -			/* -			 * WAIT_NONE is only utilized by readahead. If we can't -			 * acquire the lock atomically it means either the eb -			 * is being read out or under modification. -			 * Either way the eb will be or has been cached, -			 * readahead can exit safely. 
-			 */ -			if (!trylock_page(page)) -				goto unlock_exit; -		} else { -			lock_page(page); -		} -		locked_pages++; -	} -	/* -	 * We need to firstly lock all pages to make sure that -	 * the uptodate bit of our pages won't be affected by -	 * clear_extent_buffer_uptodate(). -	 */ -	for (i = 0; i < num_pages; i++) { -		page = eb->pages[i]; -		if (!PageUptodate(page)) { -			num_reads++; -			all_uptodate = 0; -		} -	} - -	if (all_uptodate) { -		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -		goto unlock_exit; -	} +	/* Someone else is already reading the buffer, just wait for it. */ +	if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) +		goto done;  	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);  	eb->read_mirror = 0; -	atomic_set(&eb->io_pages, num_reads); -	/* -	 * It is possible for release_folio to clear the TREE_REF bit before we -	 * set io_pages. See check_buffer_tree_ref for a more detailed comment. -	 */  	check_buffer_tree_ref(eb); -	bio_ctrl.end_io_func = end_bio_extent_readpage; -	for (i = 0; i < num_pages; i++) { -		page = eb->pages[i]; - -		if (!PageUptodate(page)) { -			ClearPageError(page); -			submit_extent_page(&bio_ctrl, page_offset(page), page, -					   PAGE_SIZE, 0); -		} else { -			unlock_page(page); -		} +	atomic_inc(&eb->refs); + +	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, +			       REQ_OP_READ | REQ_META, eb->fs_info, +			       extent_buffer_read_end_io, eb); +	bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; +	bbio->inode = BTRFS_I(eb->fs_info->btree_inode); +	bbio->file_offset = eb->start; +	memcpy(&bbio->parent_check, check, sizeof(*check)); +	if (eb->fs_info->nodesize < PAGE_SIZE) { +		__bio_add_page(&bbio->bio, eb->pages[0], eb->len, +			       eb->start - page_offset(eb->pages[0])); +	} else { +		for (i = 0; i < num_pages; i++) +			__bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);  	} +	btrfs_submit_bio(bbio, mirror_num); -	submit_one_bio(&bio_ctrl); - -	if (wait != WAIT_COMPLETE) -		return 0; - -	for (i = 0; i < num_pages; i++) { -		page = eb->pages[i]; -		wait_on_page_locked(page); -		if (!PageUptodate(page)) +done: +	if (wait == WAIT_COMPLETE) { +		wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); +		if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))  			return -EIO;  	}  	return 0; - -unlock_exit: -	while (locked_pages > 0) { -		locked_pages--; -		page = eb->pages[locked_pages]; -		unlock_page(page); -	} -	return 0;  }  static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, @@ -4561,18 +4166,17 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,  	 * looked up.  We don't want to complain in this case, as the page was  	 * valid before, we just didn't write it out.  Instead we want to catch  	 * the case where we didn't actually read the block properly, which -	 * would have !PageUptodate && !PageError, as we clear PageError before -	 * reading. +	 * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR.  	 
*/ -	if (fs_info->nodesize < PAGE_SIZE) { -		bool uptodate, error; +	if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) +		return; -		uptodate = btrfs_subpage_test_uptodate(fs_info, page, -						       eb->start, eb->len); -		error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); -		WARN_ON(!uptodate && !error); +	if (fs_info->nodesize < PAGE_SIZE) { +		if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, +							 eb->start, eb->len))) +			btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len);  	} else { -		WARN_ON(!PageUptodate(page) && !PageError(page)); +		WARN_ON(!PageUptodate(page));  	}  }  |