diff options
Diffstat (limited to 'fs/btrfs/inode.c')
| -rw-r--r-- | fs/btrfs/inode.c | 700 | 
1 files changed, 121 insertions, 579 deletions
| diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98a800b8bd43..957e4d76a7b6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -84,27 +84,12 @@ struct btrfs_dio_data {  };  struct btrfs_dio_private { -	struct btrfs_inode *inode; - -	/* -	 * Since DIO can use anonymous page, we cannot use page_offset() to -	 * grab the file offset, thus need a dedicated member for file offset. -	 */ +	/* Range of I/O */  	u64 file_offset; -	/* Used for bio::bi_size */  	u32 bytes; -	/* -	 * References to this structure. There is one reference per in-flight -	 * bio plus one while we're still setting up. -	 */ -	refcount_t refs; - -	/* Array of checksums */ -	u8 *csums; -  	/* This must be last */ -	struct bio bio; +	struct btrfs_bio bbio;  };  static struct bio_set btrfs_dio_bioset; @@ -228,7 +213,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,  {  	unsigned long index = offset >> PAGE_SHIFT;  	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; -	u64 page_start, page_end; +	u64 page_start = 0, page_end = 0;  	struct page *page;  	if (locked_page) { @@ -2536,19 +2521,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,  }  /* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time.   All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) -{ -	return btrfs_csum_one_bio(inode, bio, (u64)-1, false); -} - -/*   * Split an extent_map at [start, start + len]   *   * This function is intended to be used only for extract_ordered_extent(). @@ -2663,19 +2635,19 @@ out:  	return ret;  } -static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, -					   struct bio *bio, loff_t file_offset) +blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio)  { +	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; +	u64 len = bbio->bio.bi_iter.bi_size; +	struct btrfs_inode *inode = bbio->inode;  	struct btrfs_ordered_extent *ordered; -	u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;  	u64 file_len; -	u64 len = bio->bi_iter.bi_size;  	u64 end = start + len;  	u64 ordered_end;  	u64 pre, post;  	int ret = 0; -	ordered = btrfs_lookup_ordered_extent(inode, file_offset); +	ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset);  	if (WARN_ON_ONCE(!ordered))  		return BLK_STS_IOERR; @@ -2715,7 +2687,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,  	ret = btrfs_split_ordered_extent(ordered, pre, post);  	if (ret)  		goto out; -	ret = split_zoned_em(inode, file_offset, file_len, pre, post); +	ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post);  out:  	btrfs_put_ordered_extent(ordered); @@ -2723,75 +2695,6 @@ out:  	return errno_to_blk_status(ret);  } -void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ -	struct btrfs_fs_info *fs_info = inode->root->fs_info; -	blk_status_t ret; - -	if (bio_op(bio) == REQ_OP_ZONE_APPEND) { -		ret = extract_ordered_extent(inode, bio, -				page_offset(bio_first_bvec_all(bio)->bv_page)); -		if (ret) { -			btrfs_bio_end_io(btrfs_bio(bio), ret); -			return; -		} -	} - -	/* -	 * If we need to checksum, and the I/O is not issued by fsync and -	 * friends, that is ->sync_writers != 0, defer the submission to a -	 * workqueue to parallelize it. -	 * -	 * Csum items for reloc roots have already been cloned at this point, -	 * so they are handled as part of the no-checksum case. -	 */ -	if (!(inode->flags & BTRFS_INODE_NODATASUM) && -	    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && -	    !btrfs_is_data_reloc_root(inode->root)) { -		if (!atomic_read(&inode->sync_writers) && -		    btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) -			return; - -		ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); -		if (ret) { -			btrfs_bio_end_io(btrfs_bio(bio), ret); -			return; -		} -	} -	btrfs_submit_bio(fs_info, bio, mirror_num); -} - -void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, -			int mirror_num, enum btrfs_compression_type compress_type) -{ -	struct btrfs_fs_info *fs_info = inode->root->fs_info; -	blk_status_t ret; - -	if (compress_type != BTRFS_COMPRESS_NONE) { -		/* -		 * btrfs_submit_compressed_read will handle completing the bio -		 * if there were any errors, so just return here. -		 */ -		btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); -		return; -	} - -	/* Save the original iter for read repair */ -	btrfs_bio(bio)->iter = bio->bi_iter; - -	/* -	 * Lookup bio sums does extra checks around whether we need to csum or -	 * not, which is why we ignore skip_sum here. -	 */ -	ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); -	if (ret) { -		btrfs_bio_end_io(btrfs_bio(bio), ret); -		return; -	} - -	btrfs_submit_bio(fs_info, bio, mirror_num); -} -  /*   * given a list of ordered sums record them in the inode.  This happens   * at IO completion time based on sums calculated at bio submission time. @@ -2969,7 +2872,7 @@ again:  		unlock_extent(&inode->io_tree, page_start, page_end,  			      &cached_state);  		unlock_page(page); -		btrfs_start_ordered_extent(ordered, 1); +		btrfs_start_ordered_extent(ordered);  		btrfs_put_ordered_extent(ordered);  		goto again;  	} @@ -3259,15 +3162,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  		goto out;  	} -	/* A valid bdev implies a write on a sequential zone */ -	if (ordered_extent->bdev) { +	/* A valid ->physical implies a write on a sequential zone. */ +	if (ordered_extent->physical != (u64)-1) {  		btrfs_rewrite_logical_zoned(ordered_extent);  		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,  					ordered_extent->disk_num_bytes);  	} -	btrfs_free_io_failure_record(inode, start, end); -  	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {  		truncated = true;  		logical_len = ordered_extent->truncated_len; @@ -3474,109 +3375,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of  }  /* - * check_data_csum - verify checksum of one sector of uncompressed data - * @inode:	inode - * @bbio:	btrfs_bio which contains the csum + * Verify the checksum of a single data sector. + * + * @bbio:	btrfs_io_bio which contains the csum + * @dev:	device the sector is on   * @bio_offset:	offset to the beginning of the bio (in bytes) - * @page:	page where is the data to be verified - * @pgoff:	offset inside the page + * @bv:		bio_vec to check   * - * The length of such check is always one sector size. + * Check if the checksum on a data block is valid.  When a checksum mismatch is + * detected, report the error and fill the corrupted range with zero.   * - * When csum mismatch is detected, we will also report the error and fill the - * corrupted range with zero. (Thus it needs the extra parameters) + * Return %true if the sector is ok or had no checksum to start with, else %false.   */ -int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, -			  u32 bio_offset, struct page *page, u32 pgoff) +bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, +			u32 bio_offset, struct bio_vec *bv)  { +	struct btrfs_inode *inode = bbio->inode;  	struct btrfs_fs_info *fs_info = inode->root->fs_info; -	u32 len = fs_info->sectorsize; +	u64 file_offset = bbio->file_offset + bio_offset; +	u64 end = file_offset + bv->bv_len - 1;  	u8 *csum_expected;  	u8 csum[BTRFS_CSUM_SIZE]; -	ASSERT(pgoff + len <= PAGE_SIZE); +	ASSERT(bv->bv_len == fs_info->sectorsize); -	csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); +	if (!bbio->csum) +		return true; + +	if (btrfs_is_data_reloc_root(inode->root) && +	    test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, +			   1, NULL)) { +		/* Skip the range without csum for data reloc inode */ +		clear_extent_bits(&inode->io_tree, file_offset, end, +				  EXTENT_NODATASUM); +		return true; +	} -	if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) +	csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); +	if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, +				    csum_expected))  		goto zeroit; -	return 0; +	return true;  zeroit: -	btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, -				    csum, csum_expected, bbio->mirror_num); -	if (bbio->device) -		btrfs_dev_stat_inc_and_print(bbio->device, -					     BTRFS_DEV_STAT_CORRUPTION_ERRS); -	memzero_page(page, pgoff, len); -	return -EIO; -} - -/* - * When reads are done, we need to check csums to verify the data is correct. - * if there's a match, we allow the bio to finish.  If not, the code in - * extent_io.c will try to find good copies for us. - * - * @bio_offset:	offset to the beginning of the bio (in bytes) - * @start:	file offset of the range start - * @end:	file offset of the range end (inclusive) - * - * Return a bitmap where bit set means a csum mismatch, and bit not set means - * csum match. - */ -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, -				    u32 bio_offset, struct page *page, -				    u64 start, u64 end) -{ -	struct btrfs_inode *inode = BTRFS_I(page->mapping->host); -	struct btrfs_root *root = inode->root; -	struct btrfs_fs_info *fs_info = root->fs_info; -	struct extent_io_tree *io_tree = &inode->io_tree; -	const u32 sectorsize = root->fs_info->sectorsize; -	u32 pg_off; -	unsigned int result = 0; - -	/* -	 * This only happens for NODATASUM or compressed read. -	 * Normally this should be covered by above check for compressed read -	 * or the next check for NODATASUM.  Just do a quicker exit here. -	 */ -	if (bbio->csum == NULL) -		return 0; - -	if (inode->flags & BTRFS_INODE_NODATASUM) -		return 0; - -	if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) -		return 0; - -	ASSERT(page_offset(page) <= start && -	       end <= page_offset(page) + PAGE_SIZE - 1); -	for (pg_off = offset_in_page(start); -	     pg_off < offset_in_page(end); -	     pg_off += sectorsize, bio_offset += sectorsize) { -		u64 file_offset = pg_off + page_offset(page); -		int ret; - -		if (btrfs_is_data_reloc_root(root) && -		    test_range_bit(io_tree, file_offset, -				   file_offset + sectorsize - 1, -				   EXTENT_NODATASUM, 1, NULL)) { -			/* Skip the range without csum for data reloc inode */ -			clear_extent_bits(io_tree, file_offset, -					  file_offset + sectorsize - 1, -					  EXTENT_NODATASUM); -			continue; -		} -		ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); -		if (ret < 0) { -			const int nr_bit = (pg_off - offset_in_page(start)) >> -				     root->fs_info->sectorsize_bits; - -			result |= (1U << nr_bit); -		} -	} -	return result; +	btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, +				    bbio->mirror_num); +	if (dev) +		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); +	memzero_bvec(bv); +	return false;  }  /* @@ -4987,7 +4834,7 @@ again:  		unlock_extent(io_tree, block_start, block_end, &cached_state);  		unlock_page(page);  		put_page(page); -		btrfs_start_ordered_extent(ordered, 1); +		btrfs_start_ordered_extent(ordered);  		btrfs_put_ordered_extent(ordered);  		goto again;  	} @@ -5281,7 +5128,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)  	return ret;  } -static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, +static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,  			 struct iattr *attr)  {  	struct inode *inode = d_inode(dentry); @@ -5291,7 +5138,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr  	if (btrfs_root_readonly(root))  		return -EROFS; -	err = setattr_prepare(mnt_userns, dentry, attr); +	err = setattr_prepare(idmap, dentry, attr);  	if (err)  		return err; @@ -5302,12 +5149,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr  	}  	if (attr->ia_valid) { -		setattr_copy(mnt_userns, inode, attr); +		setattr_copy(idmap, inode, attr);  		inode_inc_iversion(inode);  		err = btrfs_dirty_inode(BTRFS_I(inode));  		if (!err && attr->ia_valid & ATTR_MODE) -			err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode); +			err = posix_acl_chmod(idmap, dentry, inode->i_mode);  	}  	return err; @@ -5466,8 +5313,6 @@ void btrfs_evict_inode(struct inode *inode)  	if (is_bad_inode(inode))  		goto no_delete; -	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); -  	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))  		goto no_delete; @@ -5576,8 +5421,13 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,  		return -ENOMEM;  	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); -	if (ret) +	if (ret < 0)  		goto out; +	/* +	 * fscrypt_setup_filename() should never return a positive value, but +	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen. +	 */ +	ASSERT(ret == 0);  	/* This needs to handle no-key deletions later on */ @@ -6724,7 +6574,7 @@ out_inode:  	return err;  } -static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,  		       struct dentry *dentry, umode_t mode, dev_t rdev)  {  	struct inode *inode; @@ -6732,13 +6582,13 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,  	inode = new_inode(dir->i_sb);  	if (!inode)  		return -ENOMEM; -	inode_init_owner(mnt_userns, inode, dir, mode); +	inode_init_owner(idmap, inode, dir, mode);  	inode->i_op = &btrfs_special_inode_operations;  	init_special_inode(inode, inode->i_mode, rdev);  	return btrfs_create_common(dir, dentry, inode);  } -static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,  			struct dentry *dentry, umode_t mode, bool excl)  {  	struct inode *inode; @@ -6746,7 +6596,7 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,  	inode = new_inode(dir->i_sb);  	if (!inode)  		return -ENOMEM; -	inode_init_owner(mnt_userns, inode, dir, mode); +	inode_init_owner(idmap, inode, dir, mode);  	inode->i_fop = &btrfs_file_operations;  	inode->i_op = &btrfs_file_inode_operations;  	inode->i_mapping->a_ops = &btrfs_aops; @@ -6837,7 +6687,7 @@ fail:  	return err;  } -static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,  		       struct dentry *dentry, umode_t mode)  {  	struct inode *inode; @@ -6845,7 +6695,7 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,  	inode = new_inode(dir->i_sb);  	if (!inode)  		return -ENOMEM; -	inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode); +	inode_init_owner(idmap, inode, dir, S_IFDIR | mode);  	inode->i_op = &btrfs_dir_inode_operations;  	inode->i_fop = &btrfs_dir_file_operations;  	return btrfs_create_common(dir, dentry, inode); @@ -7392,7 +7242,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,  			 */  			if (writing ||  			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) -				btrfs_start_ordered_extent(ordered, 1); +				btrfs_start_ordered_extent(ordered);  			else  				ret = nowait ? -EAGAIN : -ENOTBLK;  			btrfs_put_ordered_extent(ordered); @@ -7833,10 +7683,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,  	iomap->offset = start;  	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;  	iomap->length = len; - -	if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) -		iomap->flags |= IOMAP_F_ZONE_APPEND; -  	free_extent_map(em);  	return 0; @@ -7888,267 +7734,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,  	return ret;  } -static void btrfs_dio_private_put(struct btrfs_dio_private *dip) -{ -	/* -	 * This implies a barrier so that stores to dio_bio->bi_status before -	 * this and loads of dio_bio->bi_status after this are fully ordered. -	 */ -	if (!refcount_dec_and_test(&dip->refs)) -		return; - -	if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { -		btrfs_mark_ordered_io_finished(dip->inode, NULL, -					       dip->file_offset, dip->bytes, -					       !dip->bio.bi_status); -	} else { -		unlock_extent(&dip->inode->io_tree, -			      dip->file_offset, -			      dip->file_offset + dip->bytes - 1, NULL); -	} - -	kfree(dip->csums); -	bio_endio(&dip->bio); -} - -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) -{ -	struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - -	BUG_ON(bio_op(bio) == REQ_OP_WRITE); - -	refcount_inc(&dip->refs); -	btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); -} - -static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, -					     struct btrfs_bio *bbio, -					     const bool uptodate) +static void btrfs_dio_end_io(struct btrfs_bio *bbio)  { -	struct inode *inode = &dip->inode->vfs_inode; -	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; -	const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); -	blk_status_t err = BLK_STS_OK; -	struct bvec_iter iter; -	struct bio_vec bv; -	u32 offset; - -	btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { -		u64 start = bbio->file_offset + offset; - -		if (uptodate && -		    (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, -						     bv.bv_page, bv.bv_offset))) { -			btrfs_clean_io_failure(BTRFS_I(inode), start, -					       bv.bv_page, bv.bv_offset); -		} else { -			int ret; - -			ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, -					bv.bv_page, bv.bv_offset, false); -			if (ret) -				err = errno_to_blk_status(ret); -		} -	} - -	return err; -} - -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, -					      struct bio *bio, -					      u64 dio_file_offset) -{ -	return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); -} - -static void btrfs_end_dio_bio(struct btrfs_bio *bbio) -{ -	struct btrfs_dio_private *dip = bbio->private; +	struct btrfs_dio_private *dip = +		container_of(bbio, struct btrfs_dio_private, bbio); +	struct btrfs_inode *inode = bbio->inode;  	struct bio *bio = &bbio->bio; -	blk_status_t err = bio->bi_status; - -	if (err) -		btrfs_warn(dip->inode->root->fs_info, -			   "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", -			   btrfs_ino(dip->inode), bio_op(bio), -			   bio->bi_opf, bio->bi_iter.bi_sector, -			   bio->bi_iter.bi_size, err); - -	if (bio_op(bio) == REQ_OP_READ) -		err = btrfs_check_read_dio_bio(dip, bbio, !err); - -	if (err) -		dip->bio.bi_status = err; - -	btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); -	bio_put(bio); -	btrfs_dio_private_put(dip); -} - -static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, -				 u64 file_offset, int async_submit) -{ -	struct btrfs_fs_info *fs_info = inode->root->fs_info; -	struct btrfs_dio_private *dip = btrfs_bio(bio)->private; -	blk_status_t ret; - -	/* Save the original iter for read repair */ -	if (btrfs_op(bio) == BTRFS_MAP_READ) -		btrfs_bio(bio)->iter = bio->bi_iter; - -	if (inode->flags & BTRFS_INODE_NODATASUM) -		goto map; +	if (bio->bi_status) { +		btrfs_warn(inode->root->fs_info, +		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", +			   btrfs_ino(inode), bio->bi_opf, +			   dip->file_offset, dip->bytes, bio->bi_status); +	} -	if (btrfs_op(bio) == BTRFS_MAP_WRITE) { -		/* Check btrfs_submit_data_write_bio() for async submit rules */ -		if (async_submit && !atomic_read(&inode->sync_writers) && -		    btrfs_wq_submit_bio(inode, bio, 0, file_offset, -					WQ_SUBMIT_DATA_DIO)) -			return; +	if (btrfs_op(bio) == BTRFS_MAP_WRITE) +		btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, +					       dip->bytes, !bio->bi_status); +	else +		unlock_extent(&inode->io_tree, dip->file_offset, +			      dip->file_offset + dip->bytes - 1, NULL); -		/* -		 * If we aren't doing async submit, calculate the csum of the -		 * bio now. -		 */ -		ret = btrfs_csum_one_bio(inode, bio, file_offset, false); -		if (ret) { -			btrfs_bio_end_io(btrfs_bio(bio), ret); -			return; -		} -	} else { -		btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, -						      file_offset - dip->file_offset); -	} -map: -	btrfs_submit_bio(fs_info, bio, 0); +	bbio->bio.bi_private = bbio->private; +	iomap_dio_bio_end_io(bio);  } -static void btrfs_submit_direct(const struct iomap_iter *iter, -		struct bio *dio_bio, loff_t file_offset) +static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, +				loff_t file_offset)  { +	struct btrfs_bio *bbio = btrfs_bio(bio);  	struct btrfs_dio_private *dip = -		container_of(dio_bio, struct btrfs_dio_private, bio); -	struct inode *inode = iter->inode; -	const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); -	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -	const bool raid56 = (btrfs_data_alloc_profile(fs_info) & -			     BTRFS_BLOCK_GROUP_RAID56_MASK); -	struct bio *bio; -	u64 start_sector; -	int async_submit = 0; -	u64 submit_len; -	u64 clone_offset = 0; -	u64 clone_len; -	u64 logical; -	int ret; -	blk_status_t status; -	struct btrfs_io_geometry geom; +		container_of(bbio, struct btrfs_dio_private, bbio);  	struct btrfs_dio_data *dio_data = iter->private; -	struct extent_map *em = NULL; - -	dip->inode = BTRFS_I(inode); -	dip->file_offset = file_offset; -	dip->bytes = dio_bio->bi_iter.bi_size; -	refcount_set(&dip->refs, 1); -	dip->csums = NULL; - -	if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { -		unsigned int nr_sectors = -			(dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - -		/* -		 * Load the csums up front to reduce csum tree searches and -		 * contention when submitting bios. -		 */ -		status = BLK_STS_RESOURCE; -		dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); -		if (!dip->csums) -			goto out_err; - -		status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); -		if (status != BLK_STS_OK) -			goto out_err; -	} - -	start_sector = dio_bio->bi_iter.bi_sector; -	submit_len = dio_bio->bi_iter.bi_size; - -	do { -		logical = start_sector << 9; -		em = btrfs_get_chunk_map(fs_info, logical, submit_len); -		if (IS_ERR(em)) { -			status = errno_to_blk_status(PTR_ERR(em)); -			em = NULL; -			goto out_err_em; -		} -		ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), -					    logical, &geom); -		if (ret) { -			status = errno_to_blk_status(ret); -			goto out_err_em; -		} - -		clone_len = min(submit_len, geom.len); -		ASSERT(clone_len <= UINT_MAX); - -		/* -		 * This will never fail as it's passing GPF_NOFS and -		 * the allocation is backed by btrfs_bioset. -		 */ -		bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len, -					      btrfs_end_dio_bio, dip); -		btrfs_bio(bio)->file_offset = file_offset; - -		if (bio_op(bio) == REQ_OP_ZONE_APPEND) { -			status = extract_ordered_extent(BTRFS_I(inode), bio, -							file_offset); -			if (status) { -				bio_put(bio); -				goto out_err; -			} -		} - -		ASSERT(submit_len >= clone_len); -		submit_len -= clone_len; - -		/* -		 * Increase the count before we submit the bio so we know -		 * the end IO handler won't happen before we increase the -		 * count. Otherwise, the dip might get freed before we're -		 * done setting it up. -		 * -		 * We transfer the initial reference to the last bio, so we -		 * don't need to increment the reference count for the last one. -		 */ -		if (submit_len > 0) { -			refcount_inc(&dip->refs); -			/* -			 * If we are submitting more than one bio, submit them -			 * all asynchronously. The exception is RAID 5 or 6, as -			 * asynchronous checksums make it difficult to collect -			 * full stripe writes. -			 */ -			if (!raid56) -				async_submit = 1; -		} - -		btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); -		dio_data->submitted += clone_len; -		clone_offset += clone_len; -		start_sector += clone_len >> 9; -		file_offset += clone_len; +	btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); +	bbio->file_offset = file_offset; -		free_extent_map(em); -	} while (submit_len > 0); -	return; +	dip->file_offset = file_offset; +	dip->bytes = bio->bi_iter.bi_size; -out_err_em: -	free_extent_map(em); -out_err: -	dio_bio->bi_status = status; -	btrfs_dio_private_put(dip); +	dio_data->submitted += bio->bi_iter.bi_size; +	btrfs_submit_bio(bio, 0);  }  static const struct iomap_ops btrfs_dio_iomap_ops = { @@ -8157,7 +7783,7 @@ static const struct iomap_ops btrfs_dio_iomap_ops = {  };  static const struct iomap_dio_ops btrfs_dio_ops = { -	.submit_io		= btrfs_submit_direct, +	.submit_io		= btrfs_dio_submit_io,  	.bio_set		= &btrfs_dio_bioset,  }; @@ -8552,7 +8178,7 @@ again:  		unlock_extent(io_tree, page_start, page_end, &cached_state);  		unlock_page(page);  		up_read(&BTRFS_I(inode)->i_mmap_lock); -		btrfs_start_ordered_extent(ordered, 1); +		btrfs_start_ordered_extent(ordered);  		btrfs_put_ordered_extent(ordered);  		goto again;  	} @@ -8802,7 +8428,7 @@ out:  	return ret;  } -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, +struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,  				     struct inode *dir)  {  	struct inode *inode; @@ -8813,7 +8439,7 @@ struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,  		 * Subvolumes don't inherit the sgid bit or the parent's gid if  		 * the parent's sgid bit is set. This is probably a bug.  		 */ -		inode_init_owner(mnt_userns, inode, NULL, +		inode_init_owner(idmap, inode, NULL,  				 S_IFDIR | (~current_umask() & S_IRWXUGO));  		inode->i_op = &btrfs_dir_inode_operations;  		inode->i_fop = &btrfs_dir_file_operations; @@ -8850,7 +8476,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	ei->last_log_commit = 0;  	spin_lock_init(&ei->lock); -	spin_lock_init(&ei->io_failure_lock);  	ei->outstanding_extents = 0;  	if (sb->s_magic != BTRFS_TEST_MAGIC)  		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, @@ -8870,7 +8495,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	ei->io_tree.inode = ei;  	extent_io_tree_init(fs_info, &ei->file_extent_tree,  			    IO_TREE_INODE_FILE_EXTENT); -	ei->io_failure_tree = RB_ROOT;  	atomic_set(&ei->sync_writers, 0);  	mutex_init(&ei->log_mutex);  	btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -8994,7 +8618,7 @@ int __init btrfs_init_cachep(void)  		goto fail;  	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, -			offsetof(struct btrfs_dio_private, bio), +			offsetof(struct btrfs_dio_private, bbio.bio),  			BIOSET_NEED_BVECS))  		goto fail; @@ -9004,7 +8628,7 @@ fail:  	return -ENOMEM;  } -static int btrfs_getattr(struct user_namespace *mnt_userns, +static int btrfs_getattr(struct mnt_idmap *idmap,  			 const struct path *path, struct kstat *stat,  			 u32 request_mask, unsigned int flags)  { @@ -9034,7 +8658,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,  				  STATX_ATTR_IMMUTABLE |  				  STATX_ATTR_NODUMP); -	generic_fillattr(mnt_userns, inode, stat); +	generic_fillattr(idmap, inode, stat);  	stat->dev = BTRFS_I(inode)->root->anon_dev;  	spin_lock(&BTRFS_I(inode)->lock); @@ -9289,14 +8913,14 @@ out_notrans:  	return ret;  } -static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, +static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,  					struct inode *dir)  {  	struct inode *inode;  	inode = new_inode(dir->i_sb);  	if (inode) { -		inode_init_owner(mnt_userns, inode, dir, +		inode_init_owner(idmap, inode, dir,  				 S_IFCHR | WHITEOUT_MODE);  		inode->i_op = &btrfs_special_inode_operations;  		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); @@ -9304,7 +8928,7 @@ static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns,  	return inode;  } -static int btrfs_rename(struct user_namespace *mnt_userns, +static int btrfs_rename(struct mnt_idmap *idmap,  			struct inode *old_dir, struct dentry *old_dentry,  			struct inode *new_dir, struct dentry *new_dentry,  			unsigned int flags) @@ -9376,7 +9000,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,  		filemap_flush(old_inode->i_mapping);  	if (flags & RENAME_WHITEOUT) { -		whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir); +		whiteout_args.inode = new_whiteout_inode(idmap, old_dir);  		if (!whiteout_args.inode) {  			ret = -ENOMEM;  			goto out_fscrypt_names; @@ -9545,7 +9169,7 @@ out_fscrypt_names:  	return ret;  } -static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, +static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,  			 struct dentry *old_dentry, struct inode *new_dir,  			 struct dentry *new_dentry, unsigned int flags)  { @@ -9558,7 +9182,7 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di  		ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,  					    new_dentry);  	else -		ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, +		ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,  				   new_dentry, flags);  	btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); @@ -9758,7 +9382,7 @@ out:  	return ret;  } -static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,  			 struct dentry *dentry, const char *symname)  {  	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -9786,7 +9410,7 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,  	inode = new_inode(dir->i_sb);  	if (!inode)  		return -ENOMEM; -	inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO); +	inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);  	inode->i_op = &btrfs_symlink_inode_operations;  	inode_nohighmem(inode);  	inode->i_mapping->a_ops = &btrfs_aops; @@ -10075,7 +9699,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,  					   min_size, actual_len, alloc_hint, trans);  } -static int btrfs_permission(struct user_namespace *mnt_userns, +static int btrfs_permission(struct mnt_idmap *idmap,  			    struct inode *inode, int mask)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -10088,10 +9712,10 @@ static int btrfs_permission(struct user_namespace *mnt_userns,  		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)  			return -EACCES;  	} -	return generic_permission(mnt_userns, inode, mask); +	return generic_permission(idmap, inode, mask);  } -static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,  			 struct file *file, umode_t mode)  {  	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -10109,7 +9733,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,  	inode = new_inode(dir->i_sb);  	if (!inode)  		return -ENOMEM; -	inode_init_owner(mnt_userns, inode, dir, mode); +	inode_init_owner(idmap, inode, dir, mode);  	inode->i_fop = &btrfs_file_operations;  	inode->i_op = &btrfs_file_inode_operations;  	inode->i_mapping->a_ops = &btrfs_aops; @@ -10289,65 +9913,13 @@ struct btrfs_encoded_read_private {  	wait_queue_head_t wait;  	atomic_t pending;  	blk_status_t status; -	bool skip_csum;  }; -static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, -					    struct bio *bio, int mirror_num) -{ -	struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private; -	struct btrfs_fs_info *fs_info = inode->root->fs_info; -	blk_status_t ret; - -	if (!priv->skip_csum) { -		ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); -		if (ret) -			return ret; -	} - -	atomic_inc(&priv->pending); -	btrfs_submit_bio(fs_info, bio, mirror_num); -	return BLK_STS_OK; -} - -static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) -{ -	const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); -	struct btrfs_encoded_read_private *priv = bbio->private; -	struct btrfs_inode *inode = priv->inode; -	struct btrfs_fs_info *fs_info = inode->root->fs_info; -	u32 sectorsize = fs_info->sectorsize; -	struct bio_vec *bvec; -	struct bvec_iter_all iter_all; -	u32 bio_offset = 0; - -	if (priv->skip_csum || !uptodate) -		return bbio->bio.bi_status; - -	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { -		unsigned int i, nr_sectors, pgoff; - -		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); -		pgoff = bvec->bv_offset; -		for (i = 0; i < nr_sectors; i++) { -			ASSERT(pgoff < PAGE_SIZE); -			if (btrfs_check_data_csum(inode, bbio, bio_offset, -					    bvec->bv_page, pgoff)) -				return BLK_STS_IOERR; -			bio_offset += sectorsize; -			pgoff += sectorsize; -		} -	} -	return BLK_STS_OK; -} -  static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)  {  	struct btrfs_encoded_read_private *priv = bbio->private; -	blk_status_t status; -	status = btrfs_encoded_read_verify_csum(bbio); -	if (status) { +	if (bbio->bio.bi_status) {  		/*  		 * The memory barrier implied by the atomic_dec_return() here  		 * pairs with the memory barrier implied by the @@ -10356,11 +9928,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)  		 * write is observed before the load of status in  		 * btrfs_encoded_read_regular_fill_pages().  		 */ -		WRITE_ONCE(priv->status, status); +		WRITE_ONCE(priv->status, bbio->bio.bi_status);  	}  	if (!atomic_dec_return(&priv->pending))  		wake_up(&priv->wait); -	btrfs_bio_free_csum(bbio);  	bio_put(&bbio->bio);  } @@ -10368,47 +9939,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,  					  u64 file_offset, u64 disk_bytenr,  					  u64 disk_io_size, struct page **pages)  { -	struct btrfs_fs_info *fs_info = inode->root->fs_info;  	struct btrfs_encoded_read_private priv = {  		.inode = inode,  		.file_offset = file_offset,  		.pending = ATOMIC_INIT(1), -		.skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),  	};  	unsigned long i = 0;  	u64 cur = 0; -	int ret;  	init_waitqueue_head(&priv.wait); -	/* -	 * Submit bios for the extent, splitting due to bio or stripe limits as -	 * necessary. -	 */ +	/* Submit bios for the extent, splitting due to bio limits as necessary. */  	while (cur < disk_io_size) { -		struct extent_map *em; -		struct btrfs_io_geometry geom;  		struct bio *bio = NULL; -		u64 remaining; +		u64 remaining = disk_io_size - cur; -		em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, -					 disk_io_size - cur); -		if (IS_ERR(em)) { -			ret = PTR_ERR(em); -		} else { -			ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, -						    disk_bytenr + cur, &geom); -			free_extent_map(em); -		} -		if (ret) { -			WRITE_ONCE(priv.status, errno_to_blk_status(ret)); -			break; -		} -		remaining = min(geom.len, disk_io_size - cur);  		while (bio || remaining) {  			size_t bytes = min_t(u64, remaining, PAGE_SIZE);  			if (!bio) {  				bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, +						      inode,  						      btrfs_encoded_read_endio,  						      &priv);  				bio->bi_iter.bi_sector = @@ -10417,14 +9967,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,  			if (!bytes ||  			    bio_add_page(bio, pages[i], bytes, 0) < bytes) { -				blk_status_t status; - -				status = submit_encoded_read_bio(inode, bio, 0); -				if (status) { -					WRITE_ONCE(priv.status, status); -					bio_put(bio); -					goto out; -				} +				atomic_inc(&priv.pending); +				btrfs_submit_bio(bio, 0);  				bio = NULL;  				continue;  			} @@ -10435,7 +9979,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,  		}  	} -out:  	if (atomic_dec_return(&priv.pending))  		io_wait_event(priv.wait, !atomic_read(&priv.pending));  	/* See btrfs_encoded_read_endio() for ordering. */ @@ -10995,9 +10538,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,  		return 0;  	max_pages = sis->max - bsi->nr_pages; -	first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; -	next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, -				PAGE_SIZE) >> PAGE_SHIFT; +	first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; +	next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;  	if (first_ppage >= next_ppage)  		return 0; |