Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--  fs/btrfs/inode.c  1461
1 file changed, 317 insertions, 1144 deletions
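Throughout this patch, long positional argument lists (disk_bytenr, disk_num_bytes, ram_bytes, offset, ...) are collapsed into a single struct btrfs_file_extent that is filled once and handed to btrfs_create_io_em() and btrfs_alloc_ordered_extent(). Below is a minimal sketch of that structure as it can be inferred from the assignments in the hunks that follow; the canonical definition lives elsewhere in the btrfs headers and may differ, and the helper at the end is hypothetical, added only to illustrate the io_start arithmetic used in can_nocow_file_extent():

/*
 * Sketch only: fields inferred from their usage in this diff,
 * not the canonical definition.
 */
struct btrfs_file_extent {
	u64 disk_bytenr;	/* physical start of the extent on disk */
	u64 disk_num_bytes;	/* size of the on-disk extent */
	u64 num_bytes;		/* bytes of the file range this extent covers */
	u64 ram_bytes;		/* uncompressed size of the extent */
	u64 offset;		/* offset into the (uncompressed) extent */
	u8 compression;		/* BTRFS_COMPRESS_* type */
};

/*
 * Hypothetical helper for illustration: the physical byte a NOCOW write
 * targets, matching "io_start = disk_bytenr + offset" as computed in
 * can_nocow_file_extent() and at the btrfs_inc_nocow_writers() call sites
 * below.
 */
static inline u64 file_extent_io_start(const struct btrfs_file_extent *fe)
{
	return fe->disk_bytenr + fe->offset;
}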
| diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 753db965f7c0..01eab6955647 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,31 +70,13 @@  #include "orphan.h"  #include "backref.h"  #include "raid-stripe-tree.h" +#include "fiemap.h"  struct btrfs_iget_args {  	u64 ino;  	struct btrfs_root *root;  }; -struct btrfs_dio_data { -	ssize_t submitted; -	struct extent_changeset *data_reserved; -	struct btrfs_ordered_extent *ordered; -	bool data_space_reserved; -	bool nocow_done; -}; - -struct btrfs_dio_private { -	/* Range of I/O */ -	u64 file_offset; -	u32 bytes; - -	/* This must be last */ -	struct btrfs_bio bbio; -}; - -static struct bio_set btrfs_dio_bioset; -  struct btrfs_rename_ctx {  	/* Output field. Stores the index number of the old directory entry. */  	u64 index; @@ -137,11 +119,6 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,  				     struct page *locked_page, u64 start,  				     u64 end, struct writeback_control *wbc,  				     bool pages_dirty); -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, -				       u64 len, u64 orig_start, u64 block_start, -				       u64 block_len, u64 orig_block_len, -				       u64 ram_bytes, int compress_type, -				       int type);  static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,  					  u64 root, void *warn_ctx) @@ -877,7 +854,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,  	if (btrfs_test_opt(fs_info, COMPRESS) ||  	    inode->flags & BTRFS_INODE_COMPRESS ||  	    inode->prop_compress) -		return btrfs_compress_heuristic(&inode->vfs_inode, start, end); +		return btrfs_compress_heuristic(inode, start, end);  	return 0;  } @@ -890,6 +867,26 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,  		btrfs_add_inode_defrag(NULL, inode, small_write);  } +static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ +	unsigned long end_index = end >> PAGE_SHIFT; +	struct page *page; +	int ret = 0; + +	for (unsigned long index = start >> PAGE_SHIFT; +	     index <= end_index; index++) { +		page = find_get_page(inode->i_mapping, index); +		if (unlikely(!page)) { +			if (!ret) +				ret = -ENOENT; +			continue; +		} +		clear_page_dirty_for_io(page); +		put_page(page); +	} +	return ret; +} +  /*   * Work queue call back to started compression on a file and pages.   * @@ -931,7 +928,16 @@ static void compress_file_range(struct btrfs_work *work)  	 * Otherwise applications with the file mmap'd can wander in and change  	 * the page contents while we are compressing them.  	 */ -	extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); +	ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + +	/* +	 * All the folios should have been locked thus no failure. +	 * +	 * And even if some folios are missing, btrfs_compress_folios() +	 * would handle them correctly, so here just do an ASSERT() check for +	 * early logic errors. 
+	 */ +	ASSERT(ret == 0);  	/*  	 * We need to save i_size before now because it could change in between @@ -1152,6 +1158,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,  	struct btrfs_root *root = inode->root;  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_ordered_extent *ordered; +	struct btrfs_file_extent file_extent;  	struct btrfs_key ins;  	struct page *locked_page = NULL;  	struct extent_state *cached = NULL; @@ -1198,29 +1205,22 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,  	lock_extent(io_tree, start, end, &cached);  	/* Here we're doing allocation and writeback of the compressed pages */ -	em = create_io_em(inode, start, -			  async_extent->ram_size,	/* len */ -			  start,			/* orig_start */ -			  ins.objectid,			/* block_start */ -			  ins.offset,			/* block_len */ -			  ins.offset,			/* orig_block_len */ -			  async_extent->ram_size,	/* ram_bytes */ -			  async_extent->compress_type, -			  BTRFS_ORDERED_COMPRESSED); +	file_extent.disk_bytenr = ins.objectid; +	file_extent.disk_num_bytes = ins.offset; +	file_extent.ram_bytes = async_extent->ram_size; +	file_extent.num_bytes = async_extent->ram_size; +	file_extent.offset = 0; +	file_extent.compression = async_extent->compress_type; + +	em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);  	if (IS_ERR(em)) {  		ret = PTR_ERR(em);  		goto out_free_reserve;  	}  	free_extent_map(em); -	ordered = btrfs_alloc_ordered_extent(inode, start,	/* file_offset */ -				       async_extent->ram_size,	/* num_bytes */ -				       async_extent->ram_size,	/* ram_bytes */ -				       ins.objectid,		/* disk_bytenr */ -				       ins.offset,		/* disk_num_bytes */ -				       0,			/* offset */ -				       1 << BTRFS_ORDERED_COMPRESSED, -				       async_extent->compress_type); +	ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, +					     1 << BTRFS_ORDERED_COMPRESSED);  	if (IS_ERR(ordered)) {  		btrfs_drop_extent_map_range(inode, start, end, false);  		ret = PTR_ERR(ordered); @@ -1264,8 +1264,8 @@ out_free_reserve:  	kfree(async_extent);  } -static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, -				      u64 num_bytes) +u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, +				     u64 num_bytes)  {  	struct extent_map_tree *em_tree = &inode->extent_tree;  	struct extent_map *em; @@ -1279,15 +1279,15 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,  		 * first block in this inode and use that as a hint.  If that  		 * block is also bogus then just don't worry about it.  		 
*/ -		if (em->block_start >= EXTENT_MAP_LAST_BYTE) { +		if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {  			free_extent_map(em);  			em = search_extent_mapping(em_tree, 0, 0); -			if (em && em->block_start < EXTENT_MAP_LAST_BYTE) -				alloc_hint = em->block_start; +			if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) +				alloc_hint = extent_map_block_start(em);  			if (em)  				free_extent_map(em);  		} else { -			alloc_hint = em->block_start; +			alloc_hint = extent_map_block_start(em);  			free_extent_map(em);  		}  	} @@ -1375,7 +1375,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  		}  	} -	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); +	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);  	/*  	 * Relocation relies on the relocated extents to have exactly the same @@ -1395,6 +1395,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  	while (num_bytes > 0) {  		struct btrfs_ordered_extent *ordered; +		struct btrfs_file_extent file_extent;  		cur_alloc_size = num_bytes;  		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, @@ -1431,18 +1432,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  		extent_reserved = true;  		ram_size = ins.offset; +		file_extent.disk_bytenr = ins.objectid; +		file_extent.disk_num_bytes = ins.offset; +		file_extent.num_bytes = ins.offset; +		file_extent.ram_bytes = ins.offset; +		file_extent.offset = 0; +		file_extent.compression = BTRFS_COMPRESS_NONE;  		lock_extent(&inode->io_tree, start, start + ram_size - 1,  			    &cached); -		em = create_io_em(inode, start, ins.offset, /* len */ -				  start, /* orig_start */ -				  ins.objectid, /* block_start */ -				  ins.offset, /* block_len */ -				  ins.offset, /* orig_block_len */ -				  ram_size, /* ram_bytes */ -				  BTRFS_COMPRESS_NONE, /* compress_type */ -				  BTRFS_ORDERED_REGULAR /* type */); +		em = btrfs_create_io_em(inode, start, &file_extent, +					BTRFS_ORDERED_REGULAR);  		if (IS_ERR(em)) {  			unlock_extent(&inode->io_tree, start,  				      start + ram_size - 1, &cached); @@ -1451,10 +1452,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,  		}  		free_extent_map(em); -		ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, -					ram_size, ins.objectid, cur_alloc_size, -					0, 1 << BTRFS_ORDERED_REGULAR, -					BTRFS_COMPRESS_NONE); +		ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, +						     1 << BTRFS_ORDERED_REGULAR);  		if (IS_ERR(ordered)) {  			unlock_extent(&inode->io_tree, start,  				      start + ram_size - 1, &cached); @@ -1617,10 +1616,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_  	u64 alloc_hint = 0;  	if (do_free) { -		struct async_chunk *async_chunk;  		struct async_cow *async_cow; -		async_chunk = container_of(work, struct async_chunk, work);  		btrfs_add_delayed_iput(async_chunk->inode);  		if (async_chunk->blkcg_css)  			css_put(async_chunk->blkcg_css); @@ -1850,13 +1847,11 @@ struct can_nocow_file_extent_args {  	 */  	bool free_path; -	/* Output fields. Only set when can_nocow_file_extent() returns 1. */ - -	u64 disk_bytenr; -	u64 disk_num_bytes; -	u64 extent_offset; -	/* Number of bytes that can be written to in NOCOW mode. */ -	u64 num_bytes; +	/* +	 * Output fields. Only set when can_nocow_file_extent() returns 1. +	 * The expected file extent for the NOCOW write. 
+	 */ +	struct btrfs_file_extent file_extent;  };  /* @@ -1878,6 +1873,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,  	struct btrfs_root *root = inode->root;  	struct btrfs_file_extent_item *fi;  	struct btrfs_root *csum_root; +	u64 io_start;  	u64 extent_end;  	u8 extent_type;  	int can_nocow = 0; @@ -1890,11 +1886,6 @@ static int can_nocow_file_extent(struct btrfs_path *path,  	if (extent_type == BTRFS_FILE_EXTENT_INLINE)  		goto out; -	/* Can't access these fields unless we know it's not an inline extent. */ -	args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); -	args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); -	args->extent_offset = btrfs_file_extent_offset(leaf, fi); -  	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&  	    extent_type == BTRFS_FILE_EXTENT_REG)  		goto out; @@ -1910,7 +1901,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,  		goto out;  	/* An explicit hole, must COW. */ -	if (args->disk_bytenr == 0) +	if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)  		goto out;  	/* Compressed/encrypted/encoded extents must be COWed. */ @@ -1921,6 +1912,12 @@ static int can_nocow_file_extent(struct btrfs_path *path,  	extent_end = btrfs_file_extent_end(path); +	args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); +	args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); +	args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); +	args->file_extent.offset = btrfs_file_extent_offset(leaf, fi); +	args->file_extent.compression = btrfs_file_extent_compression(leaf, fi); +  	/*  	 * The following checks can be expensive, as they need to take other  	 * locks and do btree or rbtree searches, so release the path to avoid @@ -1929,8 +1926,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,  	btrfs_release_path(path);  	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), -				    key->offset - args->extent_offset, -				    args->disk_bytenr, args->strict, path); +				    key->offset - args->file_extent.offset, +				    args->file_extent.disk_bytenr, args->strict, path);  	WARN_ON_ONCE(ret > 0 && is_freespace_inode);  	if (ret != 0)  		goto out; @@ -1951,18 +1948,18 @@ static int can_nocow_file_extent(struct btrfs_path *path,  	    atomic_read(&root->snapshot_force_cow))  		goto out; -	args->disk_bytenr += args->extent_offset; -	args->disk_bytenr += args->start - key->offset; -	args->num_bytes = min(args->end + 1, extent_end) - args->start; +	args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; +	args->file_extent.offset += args->start - key->offset; +	io_start = args->file_extent.disk_bytenr + args->file_extent.offset;  	/*  	 * Force COW if csums exist in the range. This ensures that csums for a  	 * given extent are either valid or do not exist.  	 
*/ -	csum_root = btrfs_csum_root(root->fs_info, args->disk_bytenr); -	ret = btrfs_lookup_csums_list(csum_root, args->disk_bytenr, -				      args->disk_bytenr + args->num_bytes - 1, +	csum_root = btrfs_csum_root(root->fs_info, io_start); +	ret = btrfs_lookup_csums_list(csum_root, io_start, +				      io_start + args->file_extent.num_bytes - 1,  				      NULL, nowait);  	WARN_ON_ONCE(ret > 0 && is_freespace_inode);  	if (ret != 0) @@ -2021,7 +2018,6 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,  		struct extent_buffer *leaf;  		struct extent_state *cached_state = NULL;  		u64 extent_end; -		u64 ram_bytes;  		u64 nocow_end;  		int extent_type;  		bool is_prealloc; @@ -2100,7 +2096,6 @@ next_slot:  			ret = -EUCLEAN;  			goto error;  		} -		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);  		extent_end = btrfs_file_extent_end(path);  		/* @@ -2120,7 +2115,9 @@ next_slot:  			goto must_cow;  		ret = 0; -		nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); +		nocow_bg = btrfs_inc_nocow_writers(fs_info, +				nocow_args.file_extent.disk_bytenr + +				nocow_args.file_extent.offset);  		if (!nocow_bg) {  must_cow:  			/* @@ -2156,21 +2153,16 @@ must_cow:  			}  		} -		nocow_end = cur_offset + nocow_args.num_bytes - 1; +		nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;  		lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);  		is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;  		if (is_prealloc) { -			u64 orig_start = found_key.offset - nocow_args.extent_offset;  			struct extent_map *em; -			em = create_io_em(inode, cur_offset, nocow_args.num_bytes, -					  orig_start, -					  nocow_args.disk_bytenr, /* block_start */ -					  nocow_args.num_bytes, /* block_len */ -					  nocow_args.disk_num_bytes, /* orig_block_len */ -					  ram_bytes, BTRFS_COMPRESS_NONE, -					  BTRFS_ORDERED_PREALLOC); +			em = btrfs_create_io_em(inode, cur_offset, +						&nocow_args.file_extent, +						BTRFS_ORDERED_PREALLOC);  			if (IS_ERR(em)) {  				unlock_extent(&inode->io_tree, cur_offset,  					      nocow_end, &cached_state); @@ -2182,12 +2174,10 @@ must_cow:  		}  		ordered = btrfs_alloc_ordered_extent(inode, cur_offset, -				nocow_args.num_bytes, nocow_args.num_bytes, -				nocow_args.disk_bytenr, nocow_args.num_bytes, 0, +				&nocow_args.file_extent,  				is_prealloc  				? (1 << BTRFS_ORDERED_PREALLOC) -				: (1 << BTRFS_ORDERED_NOCOW), -				BTRFS_COMPRESS_NONE); +				: (1 << BTRFS_ORDERED_NOCOW));  		btrfs_dec_nocow_writers(nocow_bg);  		if (IS_ERR(ordered)) {  			if (is_prealloc) { @@ -2601,44 +2591,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,  	}  } -static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, -					struct btrfs_ordered_extent *ordered) -{ -	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; -	u64 len = bbio->bio.bi_iter.bi_size; -	struct btrfs_ordered_extent *new; -	int ret; - -	/* Must always be called for the beginning of an ordered extent. */ -	if (WARN_ON_ONCE(start != ordered->disk_bytenr)) -		return -EINVAL; - -	/* No need to split if the ordered extent covers the entire bio. */ -	if (ordered->disk_num_bytes == len) { -		refcount_inc(&ordered->refs); -		bbio->ordered = ordered; -		return 0; -	} - -	/* -	 * Don't split the extent_map for NOCOW extents, as we're writing into -	 * a pre-existing one. 
-	 */ -	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { -		ret = split_extent_map(bbio->inode, bbio->file_offset, -				       ordered->num_bytes, len, -				       ordered->disk_bytenr); -		if (ret) -			return ret; -	} - -	new = btrfs_split_ordered_extent(ordered, len); -	if (IS_ERR(new)) -		return PTR_ERR(new); -	bbio->ordered = new; -	return 0; -} -  /*   * given a list of ordered sums record them in the inode.  This happens   * at IO completion time based on sums calculated at bio submission time. @@ -2681,7 +2633,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,  		if (IS_ERR(em))  			return PTR_ERR(em); -		if (em->block_start != EXTENT_MAP_HOLE) +		if (em->disk_bytenr != EXTENT_MAP_HOLE)  			goto next;  		em_len = em->len; @@ -3037,10 +2989,8 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,  	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,  						   oe->disk_num_bytes);  	btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); -	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { +	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))  		num_bytes = oe->truncated_len; -		ram_bytes = num_bytes; -	}  	btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);  	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);  	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); @@ -3056,7 +3006,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,  			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||  			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); -	return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), +	return insert_reserved_file_extent(trans, oe->inode,  					   oe->file_offset, &stack_fi,  					   update_inode_bytes, oe->qgroup_rsv);  } @@ -3068,7 +3018,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,   */  int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)  { -	struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); +	struct btrfs_inode *inode = ordered_extent->inode;  	struct btrfs_root *root = inode->root;  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_trans_handle *trans = NULL; @@ -3302,7 +3252,7 @@ out:  int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)  { -	if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) && +	if (btrfs_is_zoned(ordered->inode->root->fs_info) &&  	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&  	    list_empty(&ordered->bioc_list))  		btrfs_finish_ordered_zoned(ordered); @@ -3596,7 +3546,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)  		found_key.objectid = found_key.offset;  		found_key.type = BTRFS_INODE_ITEM_KEY;  		found_key.offset = 0; -		inode = btrfs_iget(fs_info->sb, last_objectid, root); +		inode = btrfs_iget(last_objectid, root);  		if (IS_ERR(inode)) {  			ret = PTR_ERR(inode);  			inode = NULL; @@ -3781,6 +3731,30 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,  	return 1;  } +static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) +{ +	struct btrfs_fs_info *fs_info = inode->root->fs_info; + +	if (WARN_ON_ONCE(inode->file_extent_tree)) +		return 0; +	if (btrfs_fs_incompat(fs_info, NO_HOLES)) +		return 0; +	if (!S_ISREG(inode->vfs_inode.i_mode)) +		return 0; +	if (btrfs_is_free_space_inode(inode)) +		return 0; + +	inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); +	if (!inode->file_extent_tree) +		return -ENOMEM; + +	
extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); +	/* Lockdep class is set only for the file extent tree. */ +	lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); + +	return 0; +} +  /*   * read an inode from the btree into the in-memory inode   */ @@ -3800,6 +3774,10 @@ static int btrfs_read_locked_inode(struct inode *inode,  	bool filled = false;  	int first_xattr_slot; +	ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); +	if (ret) +		return ret; +  	ret = btrfs_fill_inode(inode, &rdev);  	if (!ret)  		filled = true; @@ -3810,7 +3788,7 @@ static int btrfs_read_locked_inode(struct inode *inode,  			return -ENOMEM;  	} -	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); +	btrfs_get_inode_key(BTRFS_I(inode), &location);  	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);  	if (ret) { @@ -3856,7 +3834,9 @@ static int btrfs_read_locked_inode(struct inode *inode,  	inode->i_rdev = 0;  	rdev = btrfs_inode_rdev(leaf, inode_item); -	BTRFS_I(inode)->index_cnt = (u64)-1; +	if (S_ISDIR(inode->i_mode)) +		BTRFS_I(inode)->index_cnt = (u64)-1; +  	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),  				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); @@ -4038,13 +4018,15 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,  	struct btrfs_inode_item *inode_item;  	struct btrfs_path *path;  	struct extent_buffer *leaf; +	struct btrfs_key key;  	int ret;  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; -	ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1); +	btrfs_get_inode_key(inode, &key); +	ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);  	if (ret) {  		if (ret > 0)  			ret = -ENOENT; @@ -4308,7 +4290,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,  	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {  		objectid = btrfs_root_id(inode->root);  	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { -		objectid = inode->location.objectid; +		objectid = inode->ref_root_id;  	} else {  		WARN_ON(1);  		fscrypt_free_filename(&fname); @@ -4539,11 +4521,6 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)  		ret = PTR_ERR(trans);  		goto out_release;  	} -	ret = btrfs_record_root_in_trans(trans, root); -	if (ret) { -		btrfs_abort_transaction(trans, ret); -		goto out_end_trans; -	}  	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);  	qgroup_reserved = 0;  	trans->block_rsv = &block_rsv; @@ -4967,11 +4944,9 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)  			}  			hole_em->start = cur_offset;  			hole_em->len = hole_size; -			hole_em->orig_start = cur_offset; -			hole_em->block_start = EXTENT_MAP_HOLE; -			hole_em->block_len = 0; -			hole_em->orig_block_len = 0; +			hole_em->disk_bytenr = EXTENT_MAP_HOLE; +			hole_em->disk_num_bytes = 0;  			hole_em->ram_bytes = hole_size;  			hole_em->generation = btrfs_get_fs_generation(fs_info); @@ -5049,7 +5024,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)  		struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);  		if (btrfs_is_zoned(fs_info)) { -			ret = btrfs_wait_ordered_range(inode, +			ret = btrfs_wait_ordered_range(BTRFS_I(inode),  					ALIGN(newsize, fs_info->sectorsize),  					(u64)-1);  			if (ret) @@ -5079,7 +5054,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)  			 * wait for disk_i_size to be stable and then update the  			 * in-memory size to 
match.  			 */ -			err = btrfs_wait_ordered_range(inode, 0, (u64)-1); +			err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);  			if (err)  				return err;  			i_size_write(inode, BTRFS_I(inode)->disk_i_size); @@ -5493,59 +5468,52 @@ out:  	return err;  } -static void inode_tree_add(struct btrfs_inode *inode) +static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)  {  	struct btrfs_root *root = inode->root; -	struct btrfs_inode *entry; -	struct rb_node **p; -	struct rb_node *parent; -	struct rb_node *new = &inode->rb_node; -	u64 ino = btrfs_ino(inode); +	struct btrfs_inode *existing; +	const u64 ino = btrfs_ino(inode); +	int ret;  	if (inode_unhashed(&inode->vfs_inode)) -		return; -	parent = NULL; -	spin_lock(&root->inode_lock); -	p = &root->inode_tree.rb_node; -	while (*p) { -		parent = *p; -		entry = rb_entry(parent, struct btrfs_inode, rb_node); +		return 0; -		if (ino < btrfs_ino(entry)) -			p = &parent->rb_left; -		else if (ino > btrfs_ino(entry)) -			p = &parent->rb_right; -		else { -			WARN_ON(!(entry->vfs_inode.i_state & -				  (I_WILL_FREE | I_FREEING))); -			rb_replace_node(parent, new, &root->inode_tree); -			RB_CLEAR_NODE(parent); -			spin_unlock(&root->inode_lock); -			return; -		} +	if (prealloc) { +		ret = xa_reserve(&root->inodes, ino, GFP_NOFS); +		if (ret) +			return ret;  	} -	rb_link_node(new, parent, p); -	rb_insert_color(new, &root->inode_tree); -	spin_unlock(&root->inode_lock); + +	existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC); + +	if (xa_is_err(existing)) { +		ret = xa_err(existing); +		ASSERT(ret != -EINVAL); +		ASSERT(ret != -ENOMEM); +		return ret; +	} else if (existing) { +		WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); +	} + +	return 0;  } -static void inode_tree_del(struct btrfs_inode *inode) +static void btrfs_del_inode_from_root(struct btrfs_inode *inode)  {  	struct btrfs_root *root = inode->root; -	int empty = 0; +	struct btrfs_inode *entry; +	bool empty = false; -	spin_lock(&root->inode_lock); -	if (!RB_EMPTY_NODE(&inode->rb_node)) { -		rb_erase(&inode->rb_node, &root->inode_tree); -		RB_CLEAR_NODE(&inode->rb_node); -		empty = RB_EMPTY_ROOT(&root->inode_tree); -	} -	spin_unlock(&root->inode_lock); +	xa_lock(&root->inodes); +	entry = __xa_erase(&root->inodes, btrfs_ino(inode)); +	if (entry == inode) +		empty = xa_empty(&root->inodes); +	xa_unlock(&root->inodes);  	if (empty && btrfs_root_refs(&root->root_item) == 0) { -		spin_lock(&root->inode_lock); -		empty = RB_EMPTY_ROOT(&root->inode_tree); -		spin_unlock(&root->inode_lock); +		xa_lock(&root->inodes); +		empty = xa_empty(&root->inodes); +		xa_unlock(&root->inodes);  		if (empty)  			btrfs_add_dead_root(root);  	} @@ -5556,10 +5524,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)  {  	struct btrfs_iget_args *args = p; -	inode->i_ino = args->ino; -	BTRFS_I(inode)->location.objectid = args->ino; -	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; -	BTRFS_I(inode)->location.offset = 0; +	btrfs_set_inode_number(BTRFS_I(inode), args->ino);  	BTRFS_I(inode)->root = btrfs_grab_root(args->root);  	if (args->root && args->root == args->root->fs_info->tree_root && @@ -5573,12 +5538,11 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)  {  	struct btrfs_iget_args *args = opaque; -	return args->ino == BTRFS_I(inode)->location.objectid && +	return args->ino == btrfs_ino(BTRFS_I(inode)) &&  		args->root == BTRFS_I(inode)->root;  } -static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, -				       
struct btrfs_root *root) +static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)  {  	struct inode *inode;  	struct btrfs_iget_args args; @@ -5587,7 +5551,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,  	args.ino = ino;  	args.root = root; -	inode = iget5_locked(s, hashval, btrfs_find_actor, +	inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,  			     btrfs_init_locked_inode,  			     (void *)&args);  	return inode; @@ -5599,41 +5563,44 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,   * allocator. NULL is also valid but may require an additional allocation   * later.   */ -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, -			      struct btrfs_root *root, struct btrfs_path *path) +struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, +			      struct btrfs_path *path)  {  	struct inode *inode; +	int ret; -	inode = btrfs_iget_locked(s, ino, root); +	inode = btrfs_iget_locked(ino, root);  	if (!inode)  		return ERR_PTR(-ENOMEM); -	if (inode->i_state & I_NEW) { -		int ret; +	if (!(inode->i_state & I_NEW)) +		return inode; -		ret = btrfs_read_locked_inode(inode, path); -		if (!ret) { -			inode_tree_add(BTRFS_I(inode)); -			unlock_new_inode(inode); -		} else { -			iget_failed(inode); -			/* -			 * ret > 0 can come from btrfs_search_slot called by -			 * btrfs_read_locked_inode, this means the inode item -			 * was not found. -			 */ -			if (ret > 0) -				ret = -ENOENT; -			inode = ERR_PTR(ret); -		} -	} +	ret = btrfs_read_locked_inode(inode, path); +	/* +	 * ret > 0 can come from btrfs_search_slot called by +	 * btrfs_read_locked_inode(), this means the inode item was not found. +	 */ +	if (ret > 0) +		ret = -ENOENT; +	if (ret < 0) +		goto error; + +	ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); +	if (ret < 0) +		goto error; + +	unlock_new_inode(inode);  	return inode; +error: +	iget_failed(inode); +	return ERR_PTR(ret);  } -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) +struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)  { -	return btrfs_iget_path(s, ino, root, NULL); +	return btrfs_iget_path(ino, root, NULL);  }  static struct inode *new_simple_dir(struct inode *dir, @@ -5647,10 +5614,11 @@ static struct inode *new_simple_dir(struct inode *dir,  		return ERR_PTR(-ENOMEM);  	BTRFS_I(inode)->root = btrfs_grab_root(root); -	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); +	BTRFS_I(inode)->ref_root_id = key->objectid; +	set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags);  	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); -	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; +	btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);  	/*  	 * We only need lookup, the rest is read-only and there's no inode  	 * associated with the dentry @@ -5704,7 +5672,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)  		return ERR_PTR(ret);  	if (location.type == BTRFS_INODE_ITEM_KEY) { -		inode = btrfs_iget(dir->i_sb, location.objectid, root); +		inode = btrfs_iget(location.objectid, root);  		if (IS_ERR(inode))  			return inode; @@ -5728,7 +5696,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)  		else  			inode = new_simple_dir(dir, &location, root);  	} else { -		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); +		inode = btrfs_iget(location.objectid, sub_root);  		btrfs_put_root(sub_root);  		if (IS_ERR(inode)) @@ -5948,7 +5916,7 
@@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)  	addr = private->filldir_buf;  	path->reada = READA_FORWARD; -	put = btrfs_readdir_get_delayed_items(inode, private->last_index, +	put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,  					      &ins_list, &del_list);  again: @@ -6038,7 +6006,7 @@ nopos:  	ret = 0;  err:  	if (put) -		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); +		btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);  	btrfs_free_path(path);  	return ret;  } @@ -6123,7 +6091,7 @@ static int btrfs_insert_inode_locked(struct inode *inode)  {  	struct btrfs_iget_args args; -	args.ino = BTRFS_I(inode)->location.objectid; +	args.ino = btrfs_ino(BTRFS_I(inode));  	args.root = BTRFS_I(inode)->root;  	return insert_inode_locked4(inode, @@ -6230,7 +6198,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,  	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);  	struct btrfs_root *root;  	struct btrfs_inode_item *inode_item; -	struct btrfs_key *location;  	struct btrfs_path *path;  	u64 objectid;  	struct btrfs_inode_ref *ref; @@ -6239,6 +6206,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,  	struct btrfs_item_batch batch;  	unsigned long ptr;  	int ret; +	bool xa_reserved = false;  	path = btrfs_alloc_path();  	if (!path) @@ -6248,10 +6216,19 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,  		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);  	root = BTRFS_I(inode)->root; +	ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); +	if (ret) +		goto out; +  	ret = btrfs_get_free_objectid(root, &objectid);  	if (ret)  		goto out; -	inode->i_ino = objectid; +	btrfs_set_inode_number(BTRFS_I(inode), objectid); + +	ret = xa_reserve(&root->inodes, objectid, GFP_NOFS); +	if (ret) +		goto out; +	xa_reserved = true;  	if (args->orphan) {  		/* @@ -6266,8 +6243,10 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,  		if (ret)  			goto out;  	} -	/* index_cnt is ignored for everything but a dir. */ -	BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; + +	if (S_ISDIR(inode->i_mode)) +		BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; +  	BTRFS_I(inode)->generation = trans->transid;  	inode->i_generation = BTRFS_I(inode)->generation; @@ -6294,11 +6273,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,  				BTRFS_INODE_NODATASUM;  	} -	location = &BTRFS_I(inode)->location; -	location->objectid = objectid; -	location->offset = 0; -	location->type = BTRFS_INODE_ITEM_KEY; -  	ret = btrfs_insert_inode_locked(inode);  	if (ret < 0) {  		if (!args->orphan) @@ -6397,8 +6371,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,  		 * Subvolumes inherit properties from their parent subvolume,  		 * not the directory they were created in.  		 */ -		parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, -				    BTRFS_I(dir)->root); +		parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);  		if (IS_ERR(parent)) {  			ret = PTR_ERR(parent);  		} else { @@ -6426,7 +6399,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,  		}  	} -	inode_tree_add(BTRFS_I(inode)); +	ret = btrfs_add_inode_to_root(BTRFS_I(inode), false); +	if (WARN_ON(ret)) { +		/* Shouldn't happen, we used xa_reserve() before. 
*/ +		btrfs_abort_transaction(trans, ret); +		goto discard; +	}  	trace_btrfs_inode_new(inode);  	btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); @@ -6454,6 +6432,9 @@ discard:  	ihold(inode);  	discard_new_inode(inode);  out: +	if (xa_reserved) +		xa_release(&root->inodes, objectid); +  	btrfs_free_path(path);  	return ret;  } @@ -6818,7 +6799,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,  	if (em) {  		if (em->start > start || em->start + em->len <= start)  			free_extent_map(em); -		else if (em->block_start == EXTENT_MAP_INLINE && page) +		else if (em->disk_bytenr == EXTENT_MAP_INLINE && page)  			free_extent_map(em);  		else  			goto out; @@ -6829,9 +6810,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,  		goto out;  	}  	em->start = EXTENT_MAP_HOLE; -	em->orig_start = EXTENT_MAP_HOLE; +	em->disk_bytenr = EXTENT_MAP_HOLE;  	em->len = (u64)-1; -	em->block_len = (u64)-1;  	path = btrfs_alloc_path();  	if (!path) { @@ -6921,9 +6901,8 @@ next:  		/* New extent overlaps with existing one */  		em->start = start; -		em->orig_start = start;  		em->len = found_key.offset - start; -		em->block_start = EXTENT_MAP_HOLE; +		em->disk_bytenr = EXTENT_MAP_HOLE;  		goto insert;  	} @@ -6947,7 +6926,7 @@ next:  		 *  		 * Other members are not utilized for inline extents.  		 */ -		ASSERT(em->block_start == EXTENT_MAP_INLINE); +		ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);  		ASSERT(em->len == fs_info->sectorsize);  		ret = read_inline_extent(inode, path, page); @@ -6957,9 +6936,8 @@ next:  	}  not_found:  	em->start = start; -	em->orig_start = start;  	em->len = len; -	em->block_start = EXTENT_MAP_HOLE; +	em->disk_bytenr = EXTENT_MAP_HOLE;  insert:  	ret = 0;  	btrfs_release_path(path); @@ -6986,84 +6964,6 @@ out:  	return em;  } -static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, -						  struct btrfs_dio_data *dio_data, -						  const u64 start, -						  const u64 len, -						  const u64 orig_start, -						  const u64 block_start, -						  const u64 block_len, -						  const u64 orig_block_len, -						  const u64 ram_bytes, -						  const int type) -{ -	struct extent_map *em = NULL; -	struct btrfs_ordered_extent *ordered; - -	if (type != BTRFS_ORDERED_NOCOW) { -		em = create_io_em(inode, start, len, orig_start, block_start, -				  block_len, orig_block_len, ram_bytes, -				  BTRFS_COMPRESS_NONE, /* compress_type */ -				  type); -		if (IS_ERR(em)) -			goto out; -	} -	ordered = btrfs_alloc_ordered_extent(inode, start, len, len, -					     block_start, block_len, 0, -					     (1 << type) | -					     (1 << BTRFS_ORDERED_DIRECT), -					     BTRFS_COMPRESS_NONE); -	if (IS_ERR(ordered)) { -		if (em) { -			free_extent_map(em); -			btrfs_drop_extent_map_range(inode, start, -						    start + len - 1, false); -		} -		em = ERR_CAST(ordered); -	} else { -		ASSERT(!dio_data->ordered); -		dio_data->ordered = ordered; -	} - out: - -	return em; -} - -static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, -						  struct btrfs_dio_data *dio_data, -						  u64 start, u64 len) -{ -	struct btrfs_root *root = inode->root; -	struct btrfs_fs_info *fs_info = root->fs_info; -	struct extent_map *em; -	struct btrfs_key ins; -	u64 alloc_hint; -	int ret; - -	alloc_hint = get_extent_allocation_hint(inode, start, len); -again: -	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, -				   0, alloc_hint, &ins, 1, 1); -	if (ret == -EAGAIN) { -		ASSERT(btrfs_is_zoned(fs_info)); -		wait_on_bit_io(&inode->root->fs_info->flags, 
BTRFS_FS_NEED_ZONE_FINISH, -			       TASK_UNINTERRUPTIBLE); -		goto again; -	} -	if (ret) -		return ERR_PTR(ret); - -	em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, -				     ins.objectid, ins.offset, ins.offset, -				     ins.offset, BTRFS_ORDERED_REGULAR); -	btrfs_dec_block_group_reservations(fs_info, ins.objectid); -	if (IS_ERR(em)) -		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, -					   1); - -	return em; -} -  static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)  {  	struct btrfs_block_group *block_group; @@ -7098,8 +6998,8 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)   *	 any ordered extents.   */  noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, -			      u64 *orig_start, u64 *orig_block_len, -			      u64 *ram_bytes, bool nowait, bool strict) +			      struct btrfs_file_extent *file_extent, +			      bool nowait, bool strict)  {  	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);  	struct can_nocow_file_extent_args nocow_args = { 0 }; @@ -7149,8 +7049,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,  	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);  	found_type = btrfs_file_extent_type(leaf, fi); -	if (ram_bytes) -		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);  	nocow_args.start = offset;  	nocow_args.end = offset + *len - 1; @@ -7168,14 +7066,16 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,  	}  	ret = 0; -	if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) +	if (btrfs_extent_readonly(fs_info, +				  nocow_args.file_extent.disk_bytenr + +				  nocow_args.file_extent.offset))  		goto out;  	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&  	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {  		u64 range_end; -		range_end = round_up(offset + nocow_args.num_bytes, +		range_end = round_up(offset + nocow_args.file_extent.num_bytes,  				     root->fs_info->sectorsize) - 1;  		ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);  		if (ret) { @@ -7184,117 +7084,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,  		}  	} -	if (orig_start) -		*orig_start = key.offset - nocow_args.extent_offset; -	if (orig_block_len) -		*orig_block_len = nocow_args.disk_num_bytes; +	if (file_extent) +		memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); -	*len = nocow_args.num_bytes; +	*len = nocow_args.file_extent.num_bytes;  	ret = 1;  out:  	btrfs_free_path(path);  	return ret;  } -static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, -			      struct extent_state **cached_state, -			      unsigned int iomap_flags) -{ -	const bool writing = (iomap_flags & IOMAP_WRITE); -	const bool nowait = (iomap_flags & IOMAP_NOWAIT); -	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; -	struct btrfs_ordered_extent *ordered; -	int ret = 0; - -	while (1) { -		if (nowait) { -			if (!try_lock_extent(io_tree, lockstart, lockend, -					     cached_state)) -				return -EAGAIN; -		} else { -			lock_extent(io_tree, lockstart, lockend, cached_state); -		} -		/* -		 * We're concerned with the entire range that we're going to be -		 * doing DIO to, so we need to make sure there's no ordered -		 * extents in this range. 
-		 */ -		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, -						     lockend - lockstart + 1); - -		/* -		 * We need to make sure there are no buffered pages in this -		 * range either, we could have raced between the invalidate in -		 * generic_file_direct_write and locking the extent.  The -		 * invalidate needs to happen so that reads after a write do not -		 * get stale data. -		 */ -		if (!ordered && -		    (!writing || !filemap_range_has_page(inode->i_mapping, -							 lockstart, lockend))) -			break; - -		unlock_extent(io_tree, lockstart, lockend, cached_state); - -		if (ordered) { -			if (nowait) { -				btrfs_put_ordered_extent(ordered); -				ret = -EAGAIN; -				break; -			} -			/* -			 * If we are doing a DIO read and the ordered extent we -			 * found is for a buffered write, we can not wait for it -			 * to complete and retry, because if we do so we can -			 * deadlock with concurrent buffered writes on page -			 * locks. This happens only if our DIO read covers more -			 * than one extent map, if at this point has already -			 * created an ordered extent for a previous extent map -			 * and locked its range in the inode's io tree, and a -			 * concurrent write against that previous extent map's -			 * range and this range started (we unlock the ranges -			 * in the io tree only when the bios complete and -			 * buffered writes always lock pages before attempting -			 * to lock range in the io tree). -			 */ -			if (writing || -			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) -				btrfs_start_ordered_extent(ordered); -			else -				ret = nowait ? -EAGAIN : -ENOTBLK; -			btrfs_put_ordered_extent(ordered); -		} else { -			/* -			 * We could trigger writeback for this range (and wait -			 * for it to complete) and then invalidate the pages for -			 * this range (through invalidate_inode_pages2_range()), -			 * but that can lead us to a deadlock with a concurrent -			 * call to readahead (a buffered read or a defrag call -			 * triggered a readahead) on a page lock due to an -			 * ordered dio extent we created before but did not have -			 * yet a corresponding bio submitted (whence it can not -			 * complete), which makes readahead wait for that -			 * ordered extent to complete while holding a lock on -			 * that page. -			 */ -			ret = nowait ? -EAGAIN : -ENOTBLK; -		} - -		if (ret) -			break; - -		cond_resched(); -	} - -	return ret; -} -  /* The callers of this must take lock_extent() */ -static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, -				       u64 len, u64 orig_start, u64 block_start, -				       u64 block_len, u64 orig_block_len, -				       u64 ram_bytes, int compress_type, -				       int type) +struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, +				      const struct btrfs_file_extent *file_extent, +				      int type)  {  	struct extent_map *em;  	int ret; @@ -7313,32 +7116,26 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,  	switch (type) {  	case BTRFS_ORDERED_PREALLOC: -		/* Uncompressed extents. */ -		ASSERT(block_len == len); -  		/* We're only referring part of a larger preallocated extent. */ -		ASSERT(block_len <= ram_bytes); +		ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);  		break;  	case BTRFS_ORDERED_REGULAR: -		/* Uncompressed extents. */ -		ASSERT(block_len == len); -  		/* COW results a new extent matching our file extent size. 
*/ -		ASSERT(orig_block_len == len); -		ASSERT(ram_bytes == len); +		ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes); +		ASSERT(file_extent->ram_bytes == file_extent->num_bytes);  		/* Since it's a new extent, we should not have any offset. */ -		ASSERT(orig_start == start); +		ASSERT(file_extent->offset == 0);  		break;  	case BTRFS_ORDERED_COMPRESSED:  		/* Must be compressed. */ -		ASSERT(compress_type != BTRFS_COMPRESS_NONE); +		ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);  		/*  		 * Encoded write can make us to refer to part of the  		 * uncompressed extent.  		 */ -		ASSERT(len <= ram_bytes); +		ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);  		break;  	} @@ -7347,16 +7144,15 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,  		return ERR_PTR(-ENOMEM);  	em->start = start; -	em->orig_start = orig_start; -	em->len = len; -	em->block_len = block_len; -	em->block_start = block_start; -	em->orig_block_len = orig_block_len; -	em->ram_bytes = ram_bytes; +	em->len = file_extent->num_bytes; +	em->disk_bytenr = file_extent->disk_bytenr; +	em->disk_num_bytes = file_extent->disk_num_bytes; +	em->ram_bytes = file_extent->ram_bytes;  	em->generation = -1; +	em->offset = file_extent->offset;  	em->flags |= EXTENT_FLAG_PINNED;  	if (type == BTRFS_ORDERED_COMPRESSED) -		extent_map_set_compression(em, compress_type); +		extent_map_set_compression(em, file_extent->compression);  	ret = btrfs_replace_extent_map_range(inode, em, true);  	if (ret) { @@ -7368,580 +7164,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,  	return em;  } - -static int btrfs_get_blocks_direct_write(struct extent_map **map, -					 struct inode *inode, -					 struct btrfs_dio_data *dio_data, -					 u64 start, u64 *lenp, -					 unsigned int iomap_flags) -{ -	const bool nowait = (iomap_flags & IOMAP_NOWAIT); -	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); -	struct extent_map *em = *map; -	int type; -	u64 block_start, orig_start, orig_block_len, ram_bytes; -	struct btrfs_block_group *bg; -	bool can_nocow = false; -	bool space_reserved = false; -	u64 len = *lenp; -	u64 prev_len; -	int ret = 0; - -	/* -	 * We don't allocate a new extent in the following cases -	 * -	 * 1) The inode is marked as NODATACOW. In this case we'll just use the -	 * existing extent. -	 * 2) The extent is marked as PREALLOC. We're good to go here and can -	 * just use the extent. -	 * -	 */ -	if ((em->flags & EXTENT_FLAG_PREALLOC) || -	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && -	     em->block_start != EXTENT_MAP_HOLE)) { -		if (em->flags & EXTENT_FLAG_PREALLOC) -			type = BTRFS_ORDERED_PREALLOC; -		else -			type = BTRFS_ORDERED_NOCOW; -		len = min(len, em->len - (start - em->start)); -		block_start = em->block_start + (start - em->start); - -		if (can_nocow_extent(inode, start, &len, &orig_start, -				     &orig_block_len, &ram_bytes, false, false) == 1) { -			bg = btrfs_inc_nocow_writers(fs_info, block_start); -			if (bg) -				can_nocow = true; -		} -	} - -	prev_len = len; -	if (can_nocow) { -		struct extent_map *em2; - -		/* We can NOCOW, so only need to reserve metadata space. */ -		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, -						      nowait); -		if (ret < 0) { -			/* Our caller expects us to free the input extent map. 
*/ -			free_extent_map(em); -			*map = NULL; -			btrfs_dec_nocow_writers(bg); -			if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) -				ret = -EAGAIN; -			goto out; -		} -		space_reserved = true; - -		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, -					      orig_start, block_start, -					      len, orig_block_len, -					      ram_bytes, type); -		btrfs_dec_nocow_writers(bg); -		if (type == BTRFS_ORDERED_PREALLOC) { -			free_extent_map(em); -			*map = em2; -			em = em2; -		} - -		if (IS_ERR(em2)) { -			ret = PTR_ERR(em2); -			goto out; -		} - -		dio_data->nocow_done = true; -	} else { -		/* Our caller expects us to free the input extent map. */ -		free_extent_map(em); -		*map = NULL; - -		if (nowait) { -			ret = -EAGAIN; -			goto out; -		} - -		/* -		 * If we could not allocate data space before locking the file -		 * range and we can't do a NOCOW write, then we have to fail. -		 */ -		if (!dio_data->data_space_reserved) { -			ret = -ENOSPC; -			goto out; -		} - -		/* -		 * We have to COW and we have already reserved data space before, -		 * so now we reserve only metadata. -		 */ -		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, -						      false); -		if (ret < 0) -			goto out; -		space_reserved = true; - -		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); -		if (IS_ERR(em)) { -			ret = PTR_ERR(em); -			goto out; -		} -		*map = em; -		len = min(len, em->len - (start - em->start)); -		if (len < prev_len) -			btrfs_delalloc_release_metadata(BTRFS_I(inode), -							prev_len - len, true); -	} - -	/* -	 * We have created our ordered extent, so we can now release our reservation -	 * for an outstanding extent. -	 */ -	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); - -	/* -	 * Need to update the i_size under the extent lock so buffered -	 * readers will get the updated i_size when we unlock. -	 */ -	if (start + len > i_size_read(inode)) -		i_size_write(inode, start + len); -out: -	if (ret && space_reserved) { -		btrfs_delalloc_release_extents(BTRFS_I(inode), len); -		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); -	} -	*lenp = len; -	return ret; -} - -static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, -		loff_t length, unsigned int flags, struct iomap *iomap, -		struct iomap *srcmap) -{ -	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); -	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); -	struct extent_map *em; -	struct extent_state *cached_state = NULL; -	struct btrfs_dio_data *dio_data = iter->private; -	u64 lockstart, lockend; -	const bool write = !!(flags & IOMAP_WRITE); -	int ret = 0; -	u64 len = length; -	const u64 data_alloc_len = length; -	bool unlock_extents = false; - -	/* -	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if -	 * we're NOWAIT we may submit a bio for a partial range and return -	 * EIOCBQUEUED, which would result in an errant short read. -	 * -	 * The best way to handle this would be to allow for partial completions -	 * of iocb's, so we could submit the partial bio, return and fault in -	 * the rest of the pages, and then submit the io for the rest of the -	 * range.  However we don't have that currently, so simply return -	 * -EAGAIN at this point so that the normal path is used. -	 */ -	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) -		return -EAGAIN; - -	/* -	 * Cap the size of reads to that usually seen in buffered I/O as we need -	 * to allocate a contiguous array for the checksums. 
-	 */ -	if (!write) -		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); - -	lockstart = start; -	lockend = start + len - 1; - -	/* -	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't -	 * enough if we've written compressed pages to this area, so we need to -	 * flush the dirty pages again to make absolutely sure that any -	 * outstanding dirty pages are on disk - the first flush only starts -	 * compression on the data, while keeping the pages locked, so by the -	 * time the second flush returns we know bios for the compressed pages -	 * were submitted and finished, and the pages no longer under writeback. -	 * -	 * If we have a NOWAIT request and we have any pages in the range that -	 * are locked, likely due to compression still in progress, we don't want -	 * to block on page locks. We also don't want to block on pages marked as -	 * dirty or under writeback (same as for the non-compression case). -	 * iomap_dio_rw() did the same check, but after that and before we got -	 * here, mmap'ed writes may have happened or buffered reads started -	 * (readpage() and readahead(), which lock pages), as we haven't locked -	 * the file range yet. -	 */ -	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, -		     &BTRFS_I(inode)->runtime_flags)) { -		if (flags & IOMAP_NOWAIT) { -			if (filemap_range_needs_writeback(inode->i_mapping, -							  lockstart, lockend)) -				return -EAGAIN; -		} else { -			ret = filemap_fdatawrite_range(inode->i_mapping, start, -						       start + length - 1); -			if (ret) -				return ret; -		} -	} - -	memset(dio_data, 0, sizeof(*dio_data)); - -	/* -	 * We always try to allocate data space and must do it before locking -	 * the file range, to avoid deadlocks with concurrent writes to the same -	 * range if the range has several extents and the writes don't expand the -	 * current i_size (the inode lock is taken in shared mode). If we fail to -	 * allocate data space here we continue and later, after locking the -	 * file range, we fail with ENOSPC only if we figure out we can not do a -	 * NOCOW write. -	 */ -	if (write && !(flags & IOMAP_NOWAIT)) { -		ret = btrfs_check_data_free_space(BTRFS_I(inode), -						  &dio_data->data_reserved, -						  start, data_alloc_len, false); -		if (!ret) -			dio_data->data_space_reserved = true; -		else if (ret && !(BTRFS_I(inode)->flags & -				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) -			goto err; -	} - -	/* -	 * If this errors out it's because we couldn't invalidate pagecache for -	 * this range and we need to fallback to buffered IO, or we are doing a -	 * NOWAIT read/write and we need to block. -	 */ -	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); -	if (ret < 0) -		goto err; - -	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); -	if (IS_ERR(em)) { -		ret = PTR_ERR(em); -		goto unlock_err; -	} - -	/* -	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered -	 * io.  INLINE is special, and we could probably kludge it in here, but -	 * it's still buffered so for safety lets just fall back to the generic -	 * buffered path. -	 * -	 * For COMPRESSED we _have_ to read the entire extent in so we can -	 * decompress it, so there will be buffering required no matter what we -	 * do, so go ahead and fallback to buffered. -	 * -	 * We return -ENOTBLK because that's what makes DIO go ahead and go back -	 * to buffered IO.  Don't blame me, this is the price we pay for using -	 * the generic code. 
-	 */ -	if (extent_map_is_compressed(em) || -	    em->block_start == EXTENT_MAP_INLINE) { -		free_extent_map(em); -		/* -		 * If we are in a NOWAIT context, return -EAGAIN in order to -		 * fallback to buffered IO. This is not only because we can -		 * block with buffered IO (no support for NOWAIT semantics at -		 * the moment) but also to avoid returning short reads to user -		 * space - this happens if we were able to read some data from -		 * previous non-compressed extents and then when we fallback to -		 * buffered IO, at btrfs_file_read_iter() by calling -		 * filemap_read(), we fail to fault in pages for the read buffer, -		 * in which case filemap_read() returns a short read (the number -		 * of bytes previously read is > 0, so it does not return -EFAULT). -		 */ -		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; -		goto unlock_err; -	} - -	len = min(len, em->len - (start - em->start)); - -	/* -	 * If we have a NOWAIT request and the range contains multiple extents -	 * (or a mix of extents and holes), then we return -EAGAIN to make the -	 * caller fallback to a context where it can do a blocking (without -	 * NOWAIT) request. This way we avoid doing partial IO and returning -	 * success to the caller, which is not optimal for writes and for reads -	 * it can result in unexpected behaviour for an application. -	 * -	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling -	 * iomap_dio_rw(), we can end up returning less data then what the caller -	 * asked for, resulting in an unexpected, and incorrect, short read. -	 * That is, the caller asked to read N bytes and we return less than that, -	 * which is wrong unless we are crossing EOF. This happens if we get a -	 * page fault error when trying to fault in pages for the buffer that is -	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we -	 * have previously submitted bios for other extents in the range, in -	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of -	 * those bios have completed by the time we get the page fault error, -	 * which we return back to our caller - we should only return EIOCBQUEUED -	 * after we have submitted bios for all the extents in the range. -	 */ -	if ((flags & IOMAP_NOWAIT) && len < length) { -		free_extent_map(em); -		ret = -EAGAIN; -		goto unlock_err; -	} - -	if (write) { -		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, -						    start, &len, flags); -		if (ret < 0) -			goto unlock_err; -		unlock_extents = true; -		/* Recalc len in case the new em is smaller than requested */ -		len = min(len, em->len - (start - em->start)); -		if (dio_data->data_space_reserved) { -			u64 release_offset; -			u64 release_len = 0; - -			if (dio_data->nocow_done) { -				release_offset = start; -				release_len = data_alloc_len; -			} else if (len < data_alloc_len) { -				release_offset = start + len; -				release_len = data_alloc_len - len; -			} - -			if (release_len > 0) -				btrfs_free_reserved_data_space(BTRFS_I(inode), -							       dio_data->data_reserved, -							       release_offset, -							       release_len); -		} -	} else { -		/* -		 * We need to unlock only the end area that we aren't using. -		 * The rest is going to be unlocked by the endio routine. 
-static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
-		ssize_t written, unsigned int flags, struct iomap *iomap)
-{
-	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
-	struct btrfs_dio_data *dio_data = iter->private;
-	size_t submitted = dio_data->submitted;
-	const bool write = !!(flags & IOMAP_WRITE);
-	int ret = 0;
-
-	if (!write && (iomap->type == IOMAP_HOLE)) {
-		/* If reading from a hole, unlock and return */
-		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
-			      NULL);
-		return 0;
-	}
-
-	if (submitted < length) {
-		pos += submitted;
-		length -= submitted;
-		if (write)
-			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
-						    pos, length, false);
-		else
-			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
-				      pos + length - 1, NULL);
-		ret = -ENOTBLK;
-	}
-	if (write) {
-		btrfs_put_ordered_extent(dio_data->ordered);
-		dio_data->ordered = NULL;
-	}
-
-	if (write)
-		extent_changeset_free(dio_data->data_reserved);
-	return ret;
-}
-
-static void btrfs_dio_end_io(struct btrfs_bio *bbio)
-{
-	struct btrfs_dio_private *dip =
-		container_of(bbio, struct btrfs_dio_private, bbio);
-	struct btrfs_inode *inode = bbio->inode;
-	struct bio *bio = &bbio->bio;
-
-	if (bio->bi_status) {
-		btrfs_warn(inode->root->fs_info,
-		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
-			   btrfs_ino(inode), bio->bi_opf,
-			   dip->file_offset, dip->bytes, bio->bi_status);
-	}
-
-	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
-		btrfs_finish_ordered_extent(bbio->ordered, NULL,
-					    dip->file_offset, dip->bytes,
-					    !bio->bi_status);
-	} else {
-		unlock_extent(&inode->io_tree, dip->file_offset,
-			      dip->file_offset + dip->bytes - 1, NULL);
-	}
-
-	bbio->bio.bi_private = bbio->private;
-	iomap_dio_bio_end_io(bio);
-}
-
-static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
-				loff_t file_offset)
-{
-	struct btrfs_bio *bbio = btrfs_bio(bio);
-	struct btrfs_dio_private *dip =
-		container_of(bbio, struct btrfs_dio_private, bbio);
-	struct btrfs_dio_data *dio_data = iter->private;
-
-	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
-		       btrfs_dio_end_io, bio->bi_private);
-	bbio->inode = BTRFS_I(iter->inode);
-	bbio->file_offset = file_offset;
-
-	dip->file_offset = file_offset;
-	dip->bytes = bio->bi_iter.bi_size;
-
-	dio_data->submitted += bio->bi_iter.bi_size;
-
-	/*
-	 * Check if we are doing a partial write.  If we are, we need to split
-	 * the ordered extent to match the submitted bio.  Hang on to the
-	 * remaining unfinishable ordered_extent in dio_data so that it can be
-	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
-	 * remaining pages is blocked on the outstanding ordered extent.
-	 */
-	if (iter->flags & IOMAP_WRITE) {
-		int ret;
-
-		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
-		if (ret) {
-			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
-						    file_offset, dip->bytes,
-						    !ret);
-			bio->bi_status = errno_to_blk_status(ret);
-			iomap_dio_bio_end_io(bio);
-			return;
-		}
-	}
-
-	btrfs_submit_bio(bbio, 0);
-}
-
-static const struct iomap_ops btrfs_dio_iomap_ops = {
-	.iomap_begin            = btrfs_dio_iomap_begin,
-	.iomap_end              = btrfs_dio_iomap_end,
-};
-
-static const struct iomap_dio_ops btrfs_dio_ops = {
-	.submit_io		= btrfs_dio_submit_io,
-	.bio_set		= &btrfs_dio_bioset,
-};
-
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
-{
-	struct btrfs_dio_data data = { 0 };
-
-	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-			    IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
-				  size_t done_before)
-{
-	struct btrfs_dio_data data = { 0 };
-
-	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-			    IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
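The btrfs_fiemap() implementation removed just below services the FS_IOC_FIEMAP ioctl (the new "fiemap.h" include at the top of the patch suggests the code moves to a dedicated file); its comment explains why FIEMAP_FLAG_SYNC needs extra flushing and an ordered-extent wait. For reference, a small self-contained caller that exercises that flag (hypothetical demo, error handling trimmed):

/*
 * List a file's first few extents via FIEMAP, asking the filesystem
 * to flush first (FIEMAP_FLAG_SYNC). Build: cc -o fiemap fiemap.c
 */
#include <fcntl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	/* Room for 16 extent records after the fixed header. */
	fm = calloc(1, sizeof(*fm) + 16 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_length = ~0ULL;            /* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;  /* flush delalloc/compression first */
	fm->fm_extent_count = 16;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu len %llu flags 0x%x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	free(fm);
	close(fd);
	return 0;
}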
-static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-			u64 start, u64 len)
-{
-	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
-	int	ret;
-
-	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
-	if (ret)
-		return ret;
-
-	/*
-	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
-	 * file range (0 to LLONG_MAX), but that is not enough if we have
-	 * compression enabled. The first filemap_fdatawrite_range() only kicks
-	 * in the compression of data (in an async thread) and will return
-	 * before the compression is done and writeback is started. A second
-	 * filemap_fdatawrite_range() is needed to wait for the compression to
-	 * complete and writeback to start. We also need to wait for ordered
-	 * extents to complete, because our fiemap implementation uses mainly
-	 * file extent items to list the extents, searching for extent maps
-	 * only for file ranges with holes or prealloc extents to figure out
-	 * if we have delalloc in those ranges.
-	 */
-	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
-		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
-		if (ret)
-			return ret;
-	}
-
-	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
-	/*
-	 * We did an initial flush to avoid holding the inode's lock while
-	 * triggering writeback and waiting for the completion of IO and ordered
-	 * extents. Now after we locked the inode we do it again, because it's
-	 * possible a new write may have happened in between those two steps.
-	 */
-	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
-		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
-		if (ret) {
-			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
-			return ret;
-		}
-	}
-
-	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
-	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
-	return ret;
-}
-
 /*
  * For release_folio() and invalidate_folio() we have a race window where
  * folio_end_writeback() is called but the subpage spinlock is not yet released.
@@ -8198,7 +7420,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
 	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
 
 	if (!skip_writeback) {
-		ret = btrfs_wait_ordered_range(&inode->vfs_inode,
+		ret = btrfs_wait_ordered_range(inode,
 					       inode->vfs_inode.i_size & (~mask),
 					       (u64)-1);
 		if (ret)
@@ -8399,20 +7621,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_inode *ei;
 	struct inode *inode;
-	struct extent_io_tree *file_extent_tree = NULL;
-
-	/* Self tests may pass a NULL fs_info. */
-	if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
-		file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
-		if (!file_extent_tree)
-			return NULL;
-	}
 
 	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
-	if (!ei) {
-		kfree(file_extent_tree);
+	if (!ei)
 		return NULL;
-	}
 
 	ei->root = NULL;
 	ei->generation = 0;
@@ -8425,8 +7637,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->disk_i_size = 0;
 	ei->flags = 0;
 	ei->ro_flags = 0;
+	/*
+	 * ->index_cnt will be properly initialized later when creating a new
+	 * inode (btrfs_create_new_inode()) or when reading an existing inode
+	 * from disk (btrfs_read_locked_inode()).
+	 */
 	ei->csum_bytes = 0;
-	ei->index_cnt = (u64)-1;
 	ei->dir_index = 0;
 	ei->last_unlink_trans = 0;
 	ei->last_reflink_trans = 0;
@@ -8453,20 +7669,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 
 	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
 	ei->io_tree.inode = ei;
-	ei->file_extent_tree = file_extent_tree;
-	if (file_extent_tree) {
-		extent_io_tree_init(fs_info, ei->file_extent_tree,
-				    IO_TREE_INODE_FILE_EXTENT);
-		/* Lockdep class is set only for the file extent tree. */
-		lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
-	}
+	ei->file_extent_tree = NULL;
+
 	mutex_init(&ei->log_mutex);
 	spin_lock_init(&ei->ordered_tree_lock);
 	ei->ordered_tree = RB_ROOT;
 	ei->ordered_tree_last = NULL;
 	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->delayed_iput);
-	RB_CLEAR_NODE(&ei->rb_node);
 	init_rwsem(&ei->i_mmap_lock);
 
 	return inode;
@@ -8502,9 +7712,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
 	if (!S_ISDIR(vfs_inode->i_mode)) {
 		WARN_ON(inode->delalloc_bytes);
 		WARN_ON(inode->new_delalloc_bytes);
+		WARN_ON(inode->csum_bytes);
 	}
-	WARN_ON(inode->csum_bytes);
-	WARN_ON(inode->defrag_bytes);
+	if (!root || !btrfs_is_data_reloc_root(root))
+		WARN_ON(inode->defrag_bytes);
 
 	/*
 	 * This can happen where we create an inode, but somebody else also
@@ -8538,7 +7749,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
 		}
 	}
 	btrfs_qgroup_check_reserved_leak(inode);
-	inode_tree_del(inode);
+	btrfs_del_inode_from_root(inode);
 	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
 	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
 	btrfs_put_root(inode->root);
@@ -8572,7 +7783,6 @@ void __cold btrfs_destroy_cachep(void)
 	 * destroy cache.
 	 */
 	rcu_barrier();
-	bioset_exit(&btrfs_dio_bioset);
 	kmem_cache_destroy(btrfs_inode_cachep);
 }
 
@@ -8583,17 +7793,9 @@ int __init btrfs_init_cachep(void)
 			SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 			init_once);
 	if (!btrfs_inode_cachep)
-		goto fail;
-
-	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
-			offsetof(struct btrfs_dio_private, bbio.bio),
-			BIOSET_NEED_BVECS))
-		goto fail;
+		return -ENOMEM;
 
 	return 0;
-fail:
-	btrfs_destroy_cachep();
-	return -ENOMEM;
 }
 
 static int btrfs_getattr(struct mnt_idmap *idmap,
@@ -9586,11 +8788,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		}
 
 		em->start = cur_offset;
-		em->orig_start = cur_offset;
 		em->len = ins.offset;
-		em->block_start = ins.objectid;
-		em->block_len = ins.offset;
-		em->orig_block_len = ins.offset;
+		em->disk_bytenr = ins.objectid;
+		em->offset = 0;
+		em->disk_num_bytes = ins.offset;
 		em->ram_bytes = ins.offset;
 		em->flags |= EXTENT_FLAG_PREALLOC;
 		em->generation = trans->transid;
@@ -9956,7 +9157,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 	if (!pages)
 		return -ENOMEM;
-	ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+	ret = btrfs_alloc_page_array(nr_pages, pages, false);
 	if (ret) {
 		ret = -ENOMEM;
 		goto out;
@@ -10033,7 +9234,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
 	for (;;) {
 		struct btrfs_ordered_extent *ordered;
 
-		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+		ret = btrfs_wait_ordered_range(inode, start,
 					       lockend - start + 1);
 		if (ret)
 			goto out_unlock_inode;
@@ -10053,7 +9254,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
 		goto out_unlock_extent;
 	}
 
-	if (em->block_start == EXTENT_MAP_INLINE) {
+	if (em->disk_bytenr == EXTENT_MAP_INLINE) {
 		u64 extent_start = em->start;
 
 		/*
@@ -10074,33 +9275,33 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
 	 */
 	encoded->len = min_t(u64, extent_map_end(em),
 			     inode->vfs_inode.i_size) - iocb->ki_pos;
-	if (em->block_start == EXTENT_MAP_HOLE ||
+	if (em->disk_bytenr == EXTENT_MAP_HOLE ||
 	    (em->flags & EXTENT_FLAG_PREALLOC)) {
 		disk_bytenr = EXTENT_MAP_HOLE;
 		count = min_t(u64, count, encoded->len);
 		encoded->len = count;
 		encoded->unencoded_len = count;
 	} else if (extent_map_is_compressed(em)) {
-		disk_bytenr = em->block_start;
+		disk_bytenr = em->disk_bytenr;
 		/*
 		 * Bail if the buffer isn't large enough to return the whole
 		 * compressed extent.
 		 */
-		if (em->block_len > count) {
+		if (em->disk_num_bytes > count) {
 			ret = -ENOBUFS;
 			goto out_em;
 		}
-		disk_io_size = em->block_len;
-		count = em->block_len;
+		disk_io_size = em->disk_num_bytes;
+		count = em->disk_num_bytes;
 		encoded->unencoded_len = em->ram_bytes;
-		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+		encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
 		ret = btrfs_encoded_io_compression_from_extent(fs_info,
							       extent_map_compression(em));
 		if (ret < 0)
			goto out_em;
 		encoded->compression = ret;
 	} else {
-		disk_bytenr = em->block_start + (start - em->start);
+		disk_bytenr = extent_map_block_start(em) + (start - em->start);
 		if (encoded->len > count)
 			encoded->len = count;
 		/*
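The prealloc hunk above now fills the renamed extent map fields (disk_bytenr, offset, disk_num_bytes), and the encoded-read path treats EXTENT_FLAG_PREALLOC extents like holes, since reads of preallocated space return zeroes. Such extents are what fallocate(2) creates; a tiny userspace sketch for creating one (hypothetical demo):

/*
 * Preallocate 1 MiB without changing i_size; on btrfs this creates an
 * EXTENT_FLAG_PREALLOC extent, reported as unwritten by FIEMAP and
 * read back as zeroes. Build: cc -o prealloc prealloc.c
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_WRONLY | O_CREAT, 0644);
	if (fd < 0)
		return 1;
	/* KEEP_SIZE: allocate blocks but leave the file length alone. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0) {
		perror("fallocate");
		return 1;
	}
	close(fd);
	return 0;
}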
@@ -10155,6 +9356,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 	struct extent_changeset *data_reserved = NULL;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_ordered_extent *ordered;
+	struct btrfs_file_extent file_extent;
 	int compression;
 	size_t orig_count;
 	u64 start, end;
@@ -10276,7 +9478,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 	for (;;) {
 		struct btrfs_ordered_extent *ordered;
 
-		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+		ret = btrfs_wait_ordered_range(inode, start, num_bytes);
 		if (ret)
 			goto out_folios;
 		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
@@ -10330,22 +9532,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 		goto out_delalloc_release;
 	extent_reserved = true;
 
-	em = create_io_em(inode, start, num_bytes,
-			  start - encoded->unencoded_offset, ins.objectid,
-			  ins.offset, ins.offset, ram_bytes, compression,
-			  BTRFS_ORDERED_COMPRESSED);
+	file_extent.disk_bytenr = ins.objectid;
+	file_extent.disk_num_bytes = ins.offset;
+	file_extent.num_bytes = num_bytes;
+	file_extent.ram_bytes = ram_bytes;
+	file_extent.offset = encoded->unencoded_offset;
+	file_extent.compression = compression;
+	em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
 	if (IS_ERR(em)) {
 		ret = PTR_ERR(em);
 		goto out_free_reserved;
 	}
 	free_extent_map(em);
 
-	ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
-				       ins.objectid, ins.offset,
-				       encoded->unencoded_offset,
+	ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
 				       (1 << BTRFS_ORDERED_ENCODED) |
-				       (1 << BTRFS_ORDERED_COMPRESSED),
-				       compression);
+				       (1 << BTRFS_ORDERED_COMPRESSED));
 	if (IS_ERR(ordered)) {
 		btrfs_drop_extent_map_range(inode, start, end, false);
 		ret = PTR_ERR(ordered);
@@ -10385,7 +9587,7 @@ out_unlock:
 out_folios:
 	for (i = 0; i < nr_folios; i++) {
 		if (folios[i])
-			__folio_put(folios[i]);
+			folio_put(folios[i]);
 	}
 	kvfree(folios);
 out:
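The btrfs_swap_activate() hunks below reject holes and inline extents and (further down) require ranges that pass the NOCOW check; in practice a swapfile on btrfs must be created empty, marked NOCOW, then fully allocated before mkswap(8)/swapon(8). A sketch of that userspace sequence (hypothetical demo; the 1 GiB size is illustrative):

/*
 * Create a file suitable for swap on btrfs: set FS_NOCOW_FL while the
 * file is still empty (the flag has no effect on existing data), then
 * allocate it fully, since swapfiles must not have holes.
 * Build: cc -o mkswapfile mkswapfile.c
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int attr = 0;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDWR | O_CREAT | O_EXCL, 0600);
	if (fd < 0)
		return 1;
	ioctl(fd, FS_IOC_GETFLAGS, &attr);
	attr |= FS_NOCOW_FL;		/* disable CoW for this file */
	if (ioctl(fd, FS_IOC_SETFLAGS, &attr) < 0)
		perror("FS_IOC_SETFLAGS");
	/* Fully allocate the file: no holes allowed for swap. */
	if (fallocate(fd, 0, 0, (off_t)1 << 30) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}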
@@ -10549,7 +9751,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 	 * file changes again after this, the user is doing something stupid and
 	 * we don't really care.
 	 */
-	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
 	if (ret)
 		return ret;
 
@@ -10635,12 +9837,12 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 			goto out;
 		}
 
-		if (em->block_start == EXTENT_MAP_HOLE) {
+		if (em->disk_bytenr == EXTENT_MAP_HOLE) {
 			btrfs_warn(fs_info, "swapfile must not have holes");
 			ret = -EINVAL;
 			goto out;
 		}
-		if (em->block_start == EXTENT_MAP_INLINE) {
+		if (em->disk_bytenr == EXTENT_MAP_INLINE) {
 			/*
 			 * It's unlikely we'll ever actually find ourselves
 			 * here, as a file small enough to fit inline won't be
@@ -10658,12 +9860,12 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 			goto out;
 		}
 
-		logical_block_start = em->block_start + (start - em->start);
+		logical_block_start = extent_map_block_start(em) + (start - em->start);
 		len = min(len, em->len - (start - em->start));
 		free_extent_map(em);
 		em = NULL;
 
-		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
+		ret = can_nocow_extent(inode, start, &len, NULL, false, true);
 		if (ret < 0) {
 			goto out;
 		} else if (ret) {
@@ -10860,52 +10062,23 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
  */
 struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
 {
-	struct rb_node *node;
-	struct rb_node *prev;
 	struct btrfs_inode *inode;
+	unsigned long from = min_ino;
 
-	spin_lock(&root->inode_lock);
-again:
-	node = root->inode_tree.rb_node;
-	prev = NULL;
-	while (node) {
-		prev = node;
-		inode = rb_entry(node, struct btrfs_inode, rb_node);
-		if (min_ino < btrfs_ino(inode))
-			node = node->rb_left;
-		else if (min_ino > btrfs_ino(inode))
-			node = node->rb_right;
-		else
+	xa_lock(&root->inodes);
+	while (true) {
+		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+		if (!inode)
+			break;
+		if (igrab(&inode->vfs_inode))
 			break;
-	}
-
-	if (!node) {
-		while (prev) {
-			inode = rb_entry(prev, struct btrfs_inode, rb_node);
-			if (min_ino <= btrfs_ino(inode)) {
-				node = prev;
-				break;
-			}
-			prev = rb_next(prev);
-		}
-	}
-
-	while (node) {
-		inode = rb_entry(prev, struct btrfs_inode, rb_node);
-		if (igrab(&inode->vfs_inode)) {
-			spin_unlock(&root->inode_lock);
-			return inode;
-		}
-
-		min_ino = btrfs_ino(inode) + 1;
-		if (cond_resched_lock(&root->inode_lock))
-			goto again;
-		node = rb_next(node);
+		from = btrfs_ino(inode) + 1;
+		cond_resched_lock(&root->inodes.xa_lock);
 	}
-	spin_unlock(&root->inode_lock);
+	xa_unlock(&root->inodes);
 
-	return NULL;
+	return inode;
 }
 
 static const struct inode_operations btrfs_dir_inode_operations = {
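For readers unfamiliar with the xarray API used by the new btrfs_find_first_inode(): xa_find() returns the first present entry at an index greater than or equal to *indexp and updates *indexp, which is what makes the igrab()-and-retry loop above so much shorter than the old rb-tree walk. Condensed, the idiom looks like this (kernel-context sketch of the code above, not a standalone program):

	/* Find and pin the first inode with number >= min_ino. */
	unsigned long from = min_ino;
	struct btrfs_inode *inode;

	xa_lock(&root->inodes);
	while ((inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT))) {
		if (igrab(&inode->vfs_inode))
			break;			/* got a reference, done */
		from = btrfs_ino(inode) + 1;	/* entry being evicted, skip it */
		cond_resched_lock(&root->inodes.xa_lock);
	}
	xa_unlock(&root->inodes);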