diff options
Diffstat (limited to 'fs/btrfs/inode.c')
| -rw-r--r-- | fs/btrfs/inode.c | 609 | 
1 files changed, 370 insertions, 239 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 57d070025c7a..dbbb67293e34 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,6 +70,7 @@  #include "verity.h"  #include "super.h"  #include "orphan.h" +#include "backref.h"  struct btrfs_iget_args {  	u64 ino; @@ -100,6 +101,18 @@ struct btrfs_rename_ctx {  	u64 index;  }; +/* + * Used by data_reloc_print_warning_inode() to pass needed info for filename + * resolution and output of error message. + */ +struct data_reloc_warn { +	struct btrfs_path path; +	struct btrfs_fs_info *fs_info; +	u64 extent_item_size; +	u64 logical; +	int mirror_num; +}; +  static const struct inode_operations btrfs_dir_inode_operations;  static const struct inode_operations btrfs_symlink_inode_operations;  static const struct inode_operations btrfs_special_inode_operations; @@ -122,12 +135,198 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,  				       u64 ram_bytes, int compress_type,  				       int type); +static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, +					  u64 root, void *warn_ctx) +{ +	struct data_reloc_warn *warn = warn_ctx; +	struct btrfs_fs_info *fs_info = warn->fs_info; +	struct extent_buffer *eb; +	struct btrfs_inode_item *inode_item; +	struct inode_fs_paths *ipath = NULL; +	struct btrfs_root *local_root; +	struct btrfs_key key; +	unsigned int nofs_flag; +	u32 nlink; +	int ret; + +	local_root = btrfs_get_fs_root(fs_info, root, true); +	if (IS_ERR(local_root)) { +		ret = PTR_ERR(local_root); +		goto err; +	} + +	/* This makes the path point to (inum INODE_ITEM ioff). */ +	key.objectid = inum; +	key.type = BTRFS_INODE_ITEM_KEY; +	key.offset = 0; + +	ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0); +	if (ret) { +		btrfs_put_root(local_root); +		btrfs_release_path(&warn->path); +		goto err; +	} + +	eb = warn->path.nodes[0]; +	inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item); +	nlink = btrfs_inode_nlink(eb, inode_item); +	btrfs_release_path(&warn->path); + +	nofs_flag = memalloc_nofs_save(); +	ipath = init_ipath(4096, local_root, &warn->path); +	memalloc_nofs_restore(nofs_flag); +	if (IS_ERR(ipath)) { +		btrfs_put_root(local_root); +		ret = PTR_ERR(ipath); +		ipath = NULL; +		/* +		 * -ENOMEM, not a critical error, just output an generic error +		 * without filename. +		 */ +		btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu", +			   warn->logical, warn->mirror_num, root, inum, offset); +		return ret; +	} +	ret = paths_from_inode(inum, ipath); +	if (ret < 0) +		goto err; + +	/* +	 * We deliberately ignore the bit ipath might have been too small to +	 * hold all of the paths here +	 */ +	for (int i = 0; i < ipath->fspath->elem_cnt; i++) { +		btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)", +			   warn->logical, warn->mirror_num, root, inum, offset, +			   fs_info->sectorsize, nlink, +			   (char *)(unsigned long)ipath->fspath->val[i]); +	} + +	btrfs_put_root(local_root); +	free_ipath(ipath); +	return 0; + +err: +	btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", +		   warn->logical, warn->mirror_num, root, inum, offset, ret); + +	free_ipath(ipath); +	return ret; +} + +/* + * Do extra user-friendly error output (e.g. lookup all the affected files). + * + * Return true if we succeeded doing the backref lookup. + * Return false if such lookup failed, and has to fallback to the old error message. + */ +static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off, +				   const u8 *csum, const u8 *csum_expected, +				   int mirror_num) +{ +	struct btrfs_fs_info *fs_info = inode->root->fs_info; +	struct btrfs_path path = { 0 }; +	struct btrfs_key found_key = { 0 }; +	struct extent_buffer *eb; +	struct btrfs_extent_item *ei; +	const u32 csum_size = fs_info->csum_size; +	u64 logical; +	u64 flags; +	u32 item_size; +	int ret; + +	mutex_lock(&fs_info->reloc_mutex); +	logical = btrfs_get_reloc_bg_bytenr(fs_info); +	mutex_unlock(&fs_info->reloc_mutex); + +	if (logical == U64_MAX) { +		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); +		btrfs_warn_rl(fs_info, +"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +			inode->root->root_key.objectid, btrfs_ino(inode), file_off, +			CSUM_FMT_VALUE(csum_size, csum), +			CSUM_FMT_VALUE(csum_size, csum_expected), +			mirror_num); +		return; +	} + +	logical += file_off; +	btrfs_warn_rl(fs_info, +"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +			inode->root->root_key.objectid, +			btrfs_ino(inode), file_off, logical, +			CSUM_FMT_VALUE(csum_size, csum), +			CSUM_FMT_VALUE(csum_size, csum_expected), +			mirror_num); + +	ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); +	if (ret < 0) { +		btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", +			     logical, ret); +		return; +	} +	eb = path.nodes[0]; +	ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item); +	item_size = btrfs_item_size(eb, path.slots[0]); +	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { +		unsigned long ptr = 0; +		u64 ref_root; +		u8 ref_level; + +		while (true) { +			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, +						      item_size, &ref_root, +						      &ref_level); +			if (ret < 0) { +				btrfs_warn_rl(fs_info, +				"failed to resolve tree backref for logical %llu: %d", +					      logical, ret); +				break; +			} +			if (ret > 0) +				break; + +			btrfs_warn_rl(fs_info, +"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu", +				logical, mirror_num, +				(ref_level ? "node" : "leaf"), +				ref_level, ref_root); +		} +		btrfs_release_path(&path); +	} else { +		struct btrfs_backref_walk_ctx ctx = { 0 }; +		struct data_reloc_warn reloc_warn = { 0 }; + +		btrfs_release_path(&path); + +		ctx.bytenr = found_key.objectid; +		ctx.extent_item_pos = logical - found_key.objectid; +		ctx.fs_info = fs_info; + +		reloc_warn.logical = logical; +		reloc_warn.extent_item_size = found_key.offset; +		reloc_warn.mirror_num = mirror_num; +		reloc_warn.fs_info = fs_info; + +		iterate_extent_inodes(&ctx, true, +				      data_reloc_print_warning_inode, &reloc_warn); +	} +} +  static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,  		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)  {  	struct btrfs_root *root = inode->root;  	const u32 csum_size = root->fs_info->csum_size; +	/* For data reloc tree, it's better to do a backref lookup instead. */ +	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) +		return print_data_reloc_error(inode, logical_start, csum, +					      csum_expected, mirror_num); +  	/* Output without objectid, which is more meaningful */  	if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {  		btrfs_warn_rl(root->fs_info, @@ -636,6 +835,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)  {  	struct btrfs_inode *inode = async_chunk->inode;  	struct btrfs_fs_info *fs_info = inode->root->fs_info; +	struct address_space *mapping = inode->vfs_inode.i_mapping;  	u64 blocksize = fs_info->sectorsize;  	u64 start = async_chunk->start;  	u64 end = async_chunk->end; @@ -750,7 +950,7 @@ again:  		/* Compression level is applied here and only here */  		ret = btrfs_compress_pages(  			compress_type | (fs_info->compress_level << 4), -					   inode->vfs_inode.i_mapping, start, +					   mapping, start,  					   pages,  					   &nr_pages,  					   &total_in, @@ -793,9 +993,9 @@ cont:  			unsigned long clear_flags = EXTENT_DELALLOC |  				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |  				EXTENT_DO_ACCOUNTING; -			unsigned long page_error_op; -			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; +			if (ret < 0) +				mapping_set_error(mapping, -EIO);  			/*  			 * inline extent creation worked or returned error, @@ -812,7 +1012,6 @@ cont:  						     clear_flags,  						     PAGE_UNLOCK |  						     PAGE_START_WRITEBACK | -						     page_error_op |  						     PAGE_END_WRITEBACK);  			/* @@ -934,6 +1133,12 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,  	unsigned long nr_written = 0;  	int page_started = 0;  	int ret; +	struct writeback_control wbc = { +		.sync_mode		= WB_SYNC_ALL, +		.range_start		= start, +		.range_end		= end, +		.no_cgroup_owner	= 1, +	};  	/*  	 * Call cow_file_range() to run the delalloc range directly, since we @@ -954,8 +1159,6 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,  			const u64 page_start = page_offset(locked_page);  			const u64 page_end = page_start + PAGE_SIZE - 1; -			btrfs_page_set_error(inode->root->fs_info, locked_page, -					     page_start, PAGE_SIZE);  			set_page_writeback(locked_page);  			end_page_writeback(locked_page);  			end_extent_writepage(locked_page, ret, page_start, page_end); @@ -965,7 +1168,10 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,  	}  	/* All pages will be unlocked, including @locked_page */ -	return extent_write_locked_range(&inode->vfs_inode, start, end); +	wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); +	ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc); +	wbc_detach_inode(&wbc); +	return ret;  }  static int submit_one_async_extent(struct btrfs_inode *inode, @@ -976,6 +1182,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,  	struct extent_io_tree *io_tree = &inode->io_tree;  	struct btrfs_root *root = inode->root;  	struct btrfs_fs_info *fs_info = root->fs_info; +	struct btrfs_ordered_extent *ordered;  	struct btrfs_key ins;  	struct page *locked_page = NULL;  	struct extent_map *em; @@ -1037,7 +1244,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,  	}  	free_extent_map(em); -	ret = btrfs_add_ordered_extent(inode, start,		/* file_offset */ +	ordered = btrfs_alloc_ordered_extent(inode, start,	/* file_offset */  				       async_extent->ram_size,	/* num_bytes */  				       async_extent->ram_size,	/* ram_bytes */  				       ins.objectid,		/* disk_bytenr */ @@ -1045,8 +1252,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode,  				       0,			/* offset */  				       1 << BTRFS_ORDERED_COMPRESSED,  				       async_extent->compress_type); -	if (ret) { +	if (IS_ERR(ordered)) {  		btrfs_drop_extent_map_range(inode, start, end, false); +		ret = PTR_ERR(ordered);  		goto out_free_reserve;  	}  	btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1055,11 +1263,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,  	extent_clear_unlock_delalloc(inode, start, end,  			NULL, EXTENT_LOCKED | EXTENT_DELALLOC,  			PAGE_UNLOCK | PAGE_START_WRITEBACK); - -	btrfs_submit_compressed_write(inode, start,	/* file_offset */ -			    async_extent->ram_size,	/* num_bytes */ -			    ins.objectid,		/* disk_bytenr */ -			    ins.offset,			/* compressed_len */ +	btrfs_submit_compressed_write(ordered,  			    async_extent->pages,	/* compressed_pages */  			    async_extent->nr_pages,  			    async_chunk->write_flags, true); @@ -1074,12 +1278,13 @@ out_free_reserve:  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);  	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);  out_free: +	mapping_set_error(inode->vfs_inode.i_mapping, -EIO);  	extent_clear_unlock_delalloc(inode, start, end,  				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |  				     EXTENT_DELALLOC_NEW |  				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,  				     PAGE_UNLOCK | PAGE_START_WRITEBACK | -				     PAGE_END_WRITEBACK | PAGE_SET_ERROR); +				     PAGE_END_WRITEBACK);  	free_async_extent_pages(async_extent);  	goto done;  } @@ -1287,6 +1492,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  		min_alloc_size = fs_info->sectorsize;  	while (num_bytes > 0) { +		struct btrfs_ordered_extent *ordered; +  		cur_alloc_size = num_bytes;  		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,  					   min_alloc_size, 0, alloc_hint, @@ -1311,16 +1518,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  		}  		free_extent_map(em); -		ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size, -					       ins.objectid, cur_alloc_size, 0, -					       1 << BTRFS_ORDERED_REGULAR, -					       BTRFS_COMPRESS_NONE); -		if (ret) +		ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, +					ram_size, ins.objectid, cur_alloc_size, +					0, 1 << BTRFS_ORDERED_REGULAR, +					BTRFS_COMPRESS_NONE); +		if (IS_ERR(ordered)) { +			ret = PTR_ERR(ordered);  			goto out_drop_extent_cache; +		}  		if (btrfs_is_data_reloc_root(root)) { -			ret = btrfs_reloc_clone_csums(inode, start, -						      cur_alloc_size); +			ret = btrfs_reloc_clone_csums(ordered); +  			/*  			 * Only drop cache here, and process as normal.  			 * @@ -1337,6 +1546,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  							    start + ram_size - 1,  							    false);  		} +		btrfs_put_ordered_extent(ordered);  		btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1494,7 +1704,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)  	 * ->inode could be NULL if async_chunk_start has failed to compress,  	 * in which case we don't have anything to submit, yet we need to  	 * always adjust ->async_delalloc_pages as its paired with the init -	 * happening in cow_file_range_async +	 * happening in run_delalloc_compressed  	 */  	if (async_chunk->inode)  		submit_compressed_extents(async_chunk); @@ -1521,58 +1731,36 @@ static noinline void async_cow_free(struct btrfs_work *work)  		kvfree(async_cow);  } -static int cow_file_range_async(struct btrfs_inode *inode, -				struct writeback_control *wbc, -				struct page *locked_page, -				u64 start, u64 end, int *page_started, -				unsigned long *nr_written) +static bool run_delalloc_compressed(struct btrfs_inode *inode, +				    struct writeback_control *wbc, +				    struct page *locked_page, +				    u64 start, u64 end, int *page_started, +				    unsigned long *nr_written)  {  	struct btrfs_fs_info *fs_info = inode->root->fs_info;  	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);  	struct async_cow *ctx;  	struct async_chunk *async_chunk;  	unsigned long nr_pages; -	u64 cur_end;  	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);  	int i; -	bool should_compress;  	unsigned nofs_flag;  	const blk_opf_t write_flags = wbc_to_write_flags(wbc); -	unlock_extent(&inode->io_tree, start, end, NULL); - -	if (inode->flags & BTRFS_INODE_NOCOMPRESS && -	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { -		num_chunks = 1; -		should_compress = false; -	} else { -		should_compress = true; -	} -  	nofs_flag = memalloc_nofs_save();  	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);  	memalloc_nofs_restore(nofs_flag); +	if (!ctx) +		return false; -	if (!ctx) { -		unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | -			EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | -			EXTENT_DO_ACCOUNTING; -		unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | -					 PAGE_END_WRITEBACK | PAGE_SET_ERROR; - -		extent_clear_unlock_delalloc(inode, start, end, locked_page, -					     clear_bits, page_ops); -		return -ENOMEM; -	} +	unlock_extent(&inode->io_tree, start, end, NULL); +	set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);  	async_chunk = ctx->chunks;  	atomic_set(&ctx->num_chunks, num_chunks);  	for (i = 0; i < num_chunks; i++) { -		if (should_compress) -			cur_end = min(end, start + SZ_512K - 1); -		else -			cur_end = end; +		u64 cur_end = min(end, start + SZ_512K - 1);  		/*  		 * igrab is called higher up in the call chain, take only the @@ -1633,13 +1821,14 @@ static int cow_file_range_async(struct btrfs_inode *inode,  		start = cur_end + 1;  	}  	*page_started = 1; -	return 0; +	return true;  }  static noinline int run_delalloc_zoned(struct btrfs_inode *inode,  				       struct page *locked_page, u64 start,  				       u64 end, int *page_started, -				       unsigned long *nr_written) +				       unsigned long *nr_written, +				       struct writeback_control *wbc)  {  	u64 done_offset = end;  	int ret; @@ -1671,8 +1860,8 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,  			account_page_redirty(locked_page);  		}  		locked_page_done = true; -		extent_write_locked_range(&inode->vfs_inode, start, done_offset); - +		extent_write_locked_range(&inode->vfs_inode, start, done_offset, +					  wbc);  		start = done_offset + 1;  	} @@ -1864,7 +2053,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,  	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),  				    key->offset - args->extent_offset, -				    args->disk_bytenr, false, path); +				    args->disk_bytenr, args->strict, path);  	WARN_ON_ONCE(ret > 0 && is_freespace_inode);  	if (ret != 0)  		goto out; @@ -1947,6 +2136,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,  	nocow_args.writeback_path = true;  	while (1) { +		struct btrfs_ordered_extent *ordered;  		struct btrfs_key found_key;  		struct btrfs_file_extent_item *fi;  		struct extent_buffer *leaf; @@ -1954,6 +2144,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,  		u64 ram_bytes;  		u64 nocow_end;  		int extent_type; +		bool is_prealloc;  		nocow = false; @@ -2092,8 +2283,8 @@ out_check:  		}  		nocow_end = cur_offset + nocow_args.num_bytes - 1; - -		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { +		is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; +		if (is_prealloc) {  			u64 orig_start = found_key.offset - nocow_args.extent_offset;  			struct extent_map *em; @@ -2109,29 +2300,22 @@ out_check:  				goto error;  			}  			free_extent_map(em); -			ret = btrfs_add_ordered_extent(inode, -					cur_offset, nocow_args.num_bytes, -					nocow_args.num_bytes, -					nocow_args.disk_bytenr, -					nocow_args.num_bytes, 0, -					1 << BTRFS_ORDERED_PREALLOC, -					BTRFS_COMPRESS_NONE); -			if (ret) { +		} + +		ordered = btrfs_alloc_ordered_extent(inode, cur_offset, +				nocow_args.num_bytes, nocow_args.num_bytes, +				nocow_args.disk_bytenr, nocow_args.num_bytes, 0, +				is_prealloc +				? (1 << BTRFS_ORDERED_PREALLOC) +				: (1 << BTRFS_ORDERED_NOCOW), +				BTRFS_COMPRESS_NONE); +		if (IS_ERR(ordered)) { +			if (is_prealloc) {  				btrfs_drop_extent_map_range(inode, cur_offset,  							    nocow_end, false); -				goto error;  			} -		} else { -			ret = btrfs_add_ordered_extent(inode, cur_offset, -						       nocow_args.num_bytes, -						       nocow_args.num_bytes, -						       nocow_args.disk_bytenr, -						       nocow_args.num_bytes, -						       0, -						       1 << BTRFS_ORDERED_NOCOW, -						       BTRFS_COMPRESS_NONE); -			if (ret) -				goto error; +			ret = PTR_ERR(ordered); +			goto error;  		}  		if (nocow) { @@ -2145,8 +2329,8 @@ out_check:  			 * extent_clear_unlock_delalloc() in error handler  			 * from freeing metadata of created ordered extent.  			 */ -			ret = btrfs_reloc_clone_csums(inode, cur_offset, -						      nocow_args.num_bytes); +			ret = btrfs_reloc_clone_csums(ordered); +		btrfs_put_ordered_extent(ordered);  		extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,  					     locked_page, EXTENT_LOCKED | @@ -2214,7 +2398,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page  		u64 start, u64 end, int *page_started, unsigned long *nr_written,  		struct writeback_control *wbc)  { -	int ret; +	int ret = 0;  	const bool zoned = btrfs_is_zoned(inode->root->fs_info);  	/* @@ -2235,19 +2419,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page  		ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));  		ret = run_delalloc_nocow(inode, locked_page, start, end,  					 page_started, nr_written); -	} else if (!btrfs_inode_can_compress(inode) || -		   !inode_need_compress(inode, start, end)) { -		if (zoned) -			ret = run_delalloc_zoned(inode, locked_page, start, end, -						 page_started, nr_written); -		else -			ret = cow_file_range(inode, locked_page, start, end, -					     page_started, nr_written, 1, NULL); -	} else { -		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); -		ret = cow_file_range_async(inode, wbc, locked_page, start, end, -					   page_started, nr_written); +		goto out;  	} + +	if (btrfs_inode_can_compress(inode) && +	    inode_need_compress(inode, start, end) && +	    run_delalloc_compressed(inode, wbc, locked_page, start, +				    end, page_started, nr_written)) +		goto out; + +	if (zoned) +		ret = run_delalloc_zoned(inode, locked_page, start, end, +					 page_started, nr_written, wbc); +	else +		ret = cow_file_range(inode, locked_page, start, end, +				     page_started, nr_written, 1, NULL); + +out:  	ASSERT(ret <= 0);  	if (ret)  		btrfs_cleanup_ordered_extents(inode, locked_page, start, @@ -2515,125 +2703,42 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,  	}  } -/* - * Split off the first pre bytes from the extent_map at [start, start + len] - * - * This function is intended to be used only for extract_ordered_extent(). - */ -static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre) -{ -	struct extent_map_tree *em_tree = &inode->extent_tree; -	struct extent_map *em; -	struct extent_map *split_pre = NULL; -	struct extent_map *split_mid = NULL; -	int ret = 0; -	unsigned long flags; - -	ASSERT(pre != 0); -	ASSERT(pre < len); - -	split_pre = alloc_extent_map(); -	if (!split_pre) -		return -ENOMEM; -	split_mid = alloc_extent_map(); -	if (!split_mid) { -		ret = -ENOMEM; -		goto out_free_pre; -	} - -	lock_extent(&inode->io_tree, start, start + len - 1, NULL); -	write_lock(&em_tree->lock); -	em = lookup_extent_mapping(em_tree, start, len); -	if (!em) { -		ret = -EIO; -		goto out_unlock; -	} - -	ASSERT(em->len == len); -	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); -	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); -	ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); -	ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); -	ASSERT(!list_empty(&em->list)); - -	flags = em->flags; -	clear_bit(EXTENT_FLAG_PINNED, &em->flags); - -	/* First, replace the em with a new extent_map starting from * em->start */ -	split_pre->start = em->start; -	split_pre->len = pre; -	split_pre->orig_start = split_pre->start; -	split_pre->block_start = em->block_start; -	split_pre->block_len = split_pre->len; -	split_pre->orig_block_len = split_pre->block_len; -	split_pre->ram_bytes = split_pre->len; -	split_pre->flags = flags; -	split_pre->compress_type = em->compress_type; -	split_pre->generation = em->generation; - -	replace_extent_mapping(em_tree, em, split_pre, 1); - -	/* -	 * Now we only have an extent_map at: -	 *     [em->start, em->start + pre] -	 */ - -	/* Insert the middle extent_map. */ -	split_mid->start = em->start + pre; -	split_mid->len = em->len - pre; -	split_mid->orig_start = split_mid->start; -	split_mid->block_start = em->block_start + pre; -	split_mid->block_len = split_mid->len; -	split_mid->orig_block_len = split_mid->block_len; -	split_mid->ram_bytes = split_mid->len; -	split_mid->flags = flags; -	split_mid->compress_type = em->compress_type; -	split_mid->generation = em->generation; -	add_extent_mapping(em_tree, split_mid, 1); - -	/* Once for us */ -	free_extent_map(em); -	/* Once for the tree */ -	free_extent_map(em); - -out_unlock: -	write_unlock(&em_tree->lock); -	unlock_extent(&inode->io_tree, start, start + len - 1, NULL); -	free_extent_map(split_mid); -out_free_pre: -	free_extent_map(split_pre); -	return ret; -} - -int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, -				 struct btrfs_ordered_extent *ordered) +static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, +					struct btrfs_ordered_extent *ordered)  {  	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;  	u64 len = bbio->bio.bi_iter.bi_size; -	struct btrfs_inode *inode = bbio->inode; -	u64 ordered_len = ordered->num_bytes; -	int ret = 0; +	struct btrfs_ordered_extent *new; +	int ret;  	/* Must always be called for the beginning of an ordered extent. */  	if (WARN_ON_ONCE(start != ordered->disk_bytenr))  		return -EINVAL;  	/* No need to split if the ordered extent covers the entire bio. */ -	if (ordered->disk_num_bytes == len) +	if (ordered->disk_num_bytes == len) { +		refcount_inc(&ordered->refs); +		bbio->ordered = ordered;  		return 0; - -	ret = btrfs_split_ordered_extent(ordered, len); -	if (ret) -		return ret; +	}  	/*  	 * Don't split the extent_map for NOCOW extents, as we're writing into  	 * a pre-existing one.  	 */ -	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) -		return 0; +	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { +		ret = split_extent_map(bbio->inode, bbio->file_offset, +				       ordered->num_bytes, len, +				       ordered->disk_bytenr); +		if (ret) +			return ret; +	} -	return split_extent_map(inode, bbio->file_offset, ordered_len, len); +	new = btrfs_split_ordered_extent(ordered, len); +	if (IS_ERR(new)) +		return PTR_ERR(new); +	bbio->ordered = new; +	return 0;  }  /* @@ -2651,7 +2756,7 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,  		trans->adding_csums = true;  		if (!csum_root)  			csum_root = btrfs_csum_root(trans->fs_info, -						    sum->bytenr); +						    sum->logical);  		ret = btrfs_csum_file_blocks(trans, csum_root, sum);  		trans->adding_csums = false;  		if (ret) @@ -2689,8 +2794,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,  		ret = set_extent_bit(&inode->io_tree, search_start,  				     search_start + em_len - 1, -				     EXTENT_DELALLOC_NEW, cached_state, -				     GFP_NOFS); +				     EXTENT_DELALLOC_NEW, cached_state);  next:  		search_start = extent_map_end(em);  		free_extent_map(em); @@ -2723,8 +2827,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,  			return ret;  	} -	return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, -				   cached_state); +	return set_extent_bit(&inode->io_tree, start, end, +			      EXTENT_DELALLOC | extra_bits, cached_state);  }  /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2847,7 +2951,6 @@ out_page:  		mapping_set_error(page->mapping, ret);  		end_extent_writepage(page, ret, page_start, page_end);  		clear_page_dirty_for_io(page); -		SetPageError(page);  	}  	btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);  	unlock_page(page); @@ -3068,7 +3171,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,   * an ordered extent if the range of bytes in the file it covers are   * fully written.   */ -int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)  {  	struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);  	struct btrfs_root *root = inode->root; @@ -3103,12 +3206,9 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  		goto out;  	} -	/* A valid ->physical implies a write on a sequential zone. */ -	if (ordered_extent->physical != (u64)-1) { -		btrfs_rewrite_logical_zoned(ordered_extent); +	if (btrfs_is_zoned(fs_info))  		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,  					ordered_extent->disk_num_bytes); -	}  	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {  		truncated = true; @@ -3276,6 +3376,14 @@ out:  	return ret;  } +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) +{ +	if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && +	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) +		btrfs_finish_ordered_zoned(ordered); +	return btrfs_finish_one_ordered(ordered); +} +  void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,  					  struct page *page, u64 start,  					  u64 end, bool uptodate) @@ -4223,7 +4331,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)  	}  	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), -			0); +				false);  	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),  				 &fname.disk_name); @@ -4798,7 +4906,7 @@ again:  	if (only_release_metadata)  		set_extent_bit(&inode->io_tree, block_start, block_end, -			       EXTENT_NORESERVE, NULL, GFP_NOFS); +			       EXTENT_NORESERVE, NULL);  out_unlock:  	if (ret) { @@ -7261,7 +7369,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,  static int btrfs_get_blocks_direct_write(struct extent_map **map,  					 struct inode *inode,  					 struct btrfs_dio_data *dio_data, -					 u64 start, u64 len, +					 u64 start, u64 *lenp,  					 unsigned int iomap_flags)  {  	const bool nowait = (iomap_flags & IOMAP_NOWAIT); @@ -7272,6 +7380,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,  	struct btrfs_block_group *bg;  	bool can_nocow = false;  	bool space_reserved = false; +	u64 len = *lenp;  	u64 prev_len;  	int ret = 0; @@ -7342,15 +7451,19 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,  		free_extent_map(em);  		*map = NULL; -		if (nowait) -			return -EAGAIN; +		if (nowait) { +			ret = -EAGAIN; +			goto out; +		}  		/*  		 * If we could not allocate data space before locking the file  		 * range and we can't do a NOCOW write, then we have to fail.  		 */ -		if (!dio_data->data_space_reserved) -			return -ENOSPC; +		if (!dio_data->data_space_reserved) { +			ret = -ENOSPC; +			goto out; +		}  		/*  		 * We have to COW and we have already reserved data space before, @@ -7391,6 +7504,7 @@ out:  		btrfs_delalloc_release_extents(BTRFS_I(inode), len);  		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);  	} +	*lenp = len;  	return ret;  } @@ -7567,7 +7681,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,  	if (write) {  		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, -						    start, len, flags); +						    start, &len, flags);  		if (ret < 0)  			goto unlock_err;  		unlock_extents = true; @@ -7661,8 +7775,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,  		pos += submitted;  		length -= submitted;  		if (write) -			btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, -						       pos, length, false); +			btrfs_finish_ordered_extent(dio_data->ordered, NULL, +						    pos, length, false);  		else  			unlock_extent(&BTRFS_I(inode)->io_tree, pos,  				      pos + length - 1, NULL); @@ -7692,12 +7806,14 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio)  			   dip->file_offset, dip->bytes, bio->bi_status);  	} -	if (btrfs_op(bio) == BTRFS_MAP_WRITE) -		btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, -					       dip->bytes, !bio->bi_status); -	else +	if (btrfs_op(bio) == BTRFS_MAP_WRITE) { +		btrfs_finish_ordered_extent(bbio->ordered, NULL, +					    dip->file_offset, dip->bytes, +					    !bio->bi_status); +	} else {  		unlock_extent(&inode->io_tree, dip->file_offset,  			      dip->file_offset + dip->bytes - 1, NULL); +	}  	bbio->bio.bi_private = bbio->private;  	iomap_dio_bio_end_io(bio); @@ -7733,7 +7849,8 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,  		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);  		if (ret) { -			btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); +			bbio->bio.bi_status = errno_to_blk_status(ret); +			btrfs_dio_end_io(bbio);  			return;  		}  	} @@ -8227,7 +8344,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)  	int ret;  	struct btrfs_trans_handle *trans;  	u64 mask = fs_info->sectorsize - 1; -	u64 min_size = btrfs_calc_metadata_size(fs_info, 1); +	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);  	if (!skip_writeback) {  		ret = btrfs_wait_ordered_range(&inode->vfs_inode, @@ -8284,7 +8401,15 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)  	/* Migrate the slack space for the truncate to our reserve */  	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,  				      min_size, false); -	BUG_ON(ret); +	/* +	 * We have reserved 2 metadata units when we started the transaction and +	 * min_size matches 1 unit, so this should never fail, but if it does, +	 * it's not critical we just fail truncation. +	 */ +	if (WARN_ON(ret)) { +		btrfs_end_transaction(trans); +		goto out; +	}  	trans->block_rsv = rsv; @@ -8332,7 +8457,14 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)  		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);  		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,  					      rsv, min_size, false); -		BUG_ON(ret);	/* shouldn't happen */ +		/* +		 * We have reserved 2 metadata units when we started the +		 * transaction and min_size matches 1 unit, so this should never +		 * fail, but if it does, it's not critical we just fail truncation. +		 */ +		if (WARN_ON(ret)) +			break; +  		trans->block_rsv = rsv;  	} @@ -8459,7 +8591,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	ei->io_tree.inode = ei;  	extent_io_tree_init(fs_info, &ei->file_extent_tree,  			    IO_TREE_INODE_FILE_EXTENT); -	atomic_set(&ei->sync_writers, 0);  	mutex_init(&ei->log_mutex);  	btrfs_ordered_inode_tree_init(&ei->ordered_tree);  	INIT_LIST_HEAD(&ei->delalloc_inodes); @@ -8630,7 +8761,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap,  	inode_bytes = inode_get_bytes(inode);  	spin_unlock(&BTRFS_I(inode)->lock);  	stat->blocks = (ALIGN(inode_bytes, blocksize) + -			ALIGN(delalloc_bytes, blocksize)) >> 9; +			ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;  	return 0;  } @@ -8786,9 +8917,9 @@ static int btrfs_rename_exchange(struct inode *old_dir,  	if (old_dentry->d_parent != new_dentry->d_parent) {  		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), -				BTRFS_I(old_inode), 1); +					BTRFS_I(old_inode), true);  		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), -				BTRFS_I(new_inode), 1); +					BTRFS_I(new_inode), true);  	}  	/* src is a subvolume */ @@ -9054,7 +9185,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,  	if (old_dentry->d_parent != new_dentry->d_parent)  		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), -				BTRFS_I(old_inode), 1); +					BTRFS_I(old_inode), true);  	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {  		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); @@ -10161,6 +10292,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,  	struct extent_io_tree *io_tree = &inode->io_tree;  	struct extent_changeset *data_reserved = NULL;  	struct extent_state *cached_state = NULL; +	struct btrfs_ordered_extent *ordered;  	int compression;  	size_t orig_count;  	u64 start, end; @@ -10337,14 +10469,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,  	}  	free_extent_map(em); -	ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes, +	ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,  				       ins.objectid, ins.offset,  				       encoded->unencoded_offset,  				       (1 << BTRFS_ORDERED_ENCODED) |  				       (1 << BTRFS_ORDERED_COMPRESSED),  				       compression); -	if (ret) { +	if (IS_ERR(ordered)) {  		btrfs_drop_extent_map_range(inode, start, end, false); +		ret = PTR_ERR(ordered);  		goto out_free_reserved;  	}  	btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -10356,8 +10489,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,  	btrfs_delalloc_release_extents(inode, num_bytes); -	btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, -					  ins.offset, pages, nr_pages, 0, false); +	btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);  	ret = orig_count;  	goto out; @@ -10894,7 +11026,6 @@ static const struct address_space_operations btrfs_aops = {  	.read_folio	= btrfs_read_folio,  	.writepages	= btrfs_writepages,  	.readahead	= btrfs_readahead, -	.direct_IO	= noop_direct_IO,  	.invalidate_folio = btrfs_invalidate_folio,  	.release_folio	= btrfs_release_folio,  	.migrate_folio	= btrfs_migrate_folio,  |