diff options
Diffstat (limited to 'fs/btrfs/file.c')
| -rw-r--r-- | fs/btrfs/file.c | 331 | 
1 files changed, 254 insertions, 77 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f9d76072398d..e764ac3f22e2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -128,7 +128,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,  		      struct extent_state **cached, bool noreserve)  {  	struct btrfs_fs_info *fs_info = inode->root->fs_info; -	int err = 0; +	int ret = 0;  	int i;  	u64 num_bytes;  	u64 start_pos; @@ -158,10 +158,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,  			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,  			 cached); -	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, +	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,  					extra_bits, cached); -	if (err) -		return err; +	if (ret) +		return ret;  	for (i = 0; i < num_pages; i++) {  		struct page *p = pages[i]; @@ -206,7 +206,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct extent_buffer *leaf;  	struct btrfs_file_extent_item *fi; -	struct btrfs_ref ref = { 0 };  	struct btrfs_key key;  	struct btrfs_key new_key;  	u64 ino = btrfs_ino(inode); @@ -246,7 +245,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,  	if (args->start >= inode->disk_i_size && !args->replace_extent)  		modify_tree = 0; -	update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); +	update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);  	while (1) {  		recow = 0;  		ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -373,15 +372,17 @@ next_slot:  			btrfs_mark_buffer_dirty(trans, leaf);  			if (update_refs && disk_bytenr > 0) { -				btrfs_init_generic_ref(&ref, -						BTRFS_ADD_DELAYED_REF, -						disk_bytenr, num_bytes, 0, -						root->root_key.objectid); -				btrfs_init_data_ref(&ref, -						root->root_key.objectid, -						new_key.objectid, -						args->start - extent_offset, -						0, false); +				struct btrfs_ref ref = { +					.action = BTRFS_ADD_DELAYED_REF, +					.bytenr = disk_bytenr, +					.num_bytes = num_bytes, +					.parent = 0, +					.owning_root = btrfs_root_id(root), +					.ref_root = btrfs_root_id(root), +				}; +				btrfs_init_data_ref(&ref, new_key.objectid, +						    args->start - extent_offset, +						    0, false);  				ret = btrfs_inc_extent_ref(trans, &ref);  				if (ret) {  					btrfs_abort_transaction(trans, ret); @@ -464,15 +465,17 @@ delete_extent_item:  				extent_end = ALIGN(extent_end,  						   fs_info->sectorsize);  			} else if (update_refs && disk_bytenr > 0) { -				btrfs_init_generic_ref(&ref, -						BTRFS_DROP_DELAYED_REF, -						disk_bytenr, num_bytes, 0, -						root->root_key.objectid); -				btrfs_init_data_ref(&ref, -						root->root_key.objectid, -						key.objectid, -						key.offset - extent_offset, 0, -						false); +				struct btrfs_ref ref = { +					.action = BTRFS_DROP_DELAYED_REF, +					.bytenr = disk_bytenr, +					.num_bytes = num_bytes, +					.parent = 0, +					.owning_root = btrfs_root_id(root), +					.ref_root = btrfs_root_id(root), +				}; +				btrfs_init_data_ref(&ref, key.objectid, +						    key.offset - extent_offset, +						    0, false);  				ret = btrfs_free_extent(trans, &ref);  				if (ret) {  					btrfs_abort_transaction(trans, ret); @@ -748,10 +751,13 @@ again:  						extent_end - split);  		btrfs_mark_buffer_dirty(trans, leaf); -		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, -				       num_bytes, 0, root->root_key.objectid); -		btrfs_init_data_ref(&ref, root->root_key.objectid, ino, -				    orig_offset, 0, false); +		ref.action = BTRFS_ADD_DELAYED_REF; +		ref.bytenr = bytenr; +		ref.num_bytes = num_bytes; +		ref.parent = 0; +		ref.owning_root = btrfs_root_id(root); +		ref.ref_root = btrfs_root_id(root); +		btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);  		ret = btrfs_inc_extent_ref(trans, &ref);  		if (ret) {  			btrfs_abort_transaction(trans, ret); @@ -774,10 +780,14 @@ again:  	other_start = end;  	other_end = 0; -	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, -			       num_bytes, 0, root->root_key.objectid); -	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, -			    0, false); + +	ref.action = BTRFS_DROP_DELAYED_REF; +	ref.bytenr = bytenr; +	ref.num_bytes = num_bytes; +	ref.parent = 0; +	ref.owning_root = btrfs_root_id(root); +	ref.ref_root = btrfs_root_id(root); +	btrfs_init_data_ref(&ref, ino, orig_offset, 0, false);  	if (extent_mergeable(leaf, path->slots[0] + 1,  			     ino, bytenr, orig_offset,  			     &other_start, &other_end)) { @@ -915,7 +925,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,  	unsigned long index = pos >> PAGE_SHIFT;  	gfp_t mask = get_prepare_gfp_flags(inode, nowait);  	fgf_t fgp_flags = get_prepare_fgp_flags(nowait); -	int err = 0; +	int ret = 0;  	int faili;  	for (i = 0; i < num_pages; i++) { @@ -925,28 +935,28 @@ again:  		if (!pages[i]) {  			faili = i - 1;  			if (nowait) -				err = -EAGAIN; +				ret = -EAGAIN;  			else -				err = -ENOMEM; +				ret = -ENOMEM;  			goto fail;  		} -		err = set_page_extent_mapped(pages[i]); -		if (err < 0) { +		ret = set_page_extent_mapped(pages[i]); +		if (ret < 0) {  			faili = i;  			goto fail;  		}  		if (i == 0) -			err = prepare_uptodate_page(inode, pages[i], pos, +			ret = prepare_uptodate_page(inode, pages[i], pos,  						    force_uptodate); -		if (!err && i == num_pages - 1) -			err = prepare_uptodate_page(inode, pages[i], +		if (!ret && i == num_pages - 1) +			ret = prepare_uptodate_page(inode, pages[i],  						    pos + write_bytes, false); -		if (err) { +		if (ret) {  			put_page(pages[i]); -			if (!nowait && err == -EAGAIN) { -				err = 0; +			if (!nowait && ret == -EAGAIN) { +				ret = 0;  				goto again;  			}  			faili = i - 1; @@ -962,7 +972,7 @@ fail:  		put_page(pages[faili]);  		faili--;  	} -	return err; +	return ret;  } @@ -1465,7 +1475,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)  	ssize_t written_buffered;  	size_t prev_left = 0;  	loff_t endbyte; -	ssize_t err; +	ssize_t ret;  	unsigned int ilock_flags = 0;  	struct iomap_dio *dio; @@ -1482,9 +1492,9 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)  		ilock_flags |= BTRFS_ILOCK_SHARED;  relock: -	err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); -	if (err < 0) -		return err; +	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); +	if (ret < 0) +		return ret;  	/* Shared lock cannot be used with security bits set. */  	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { @@ -1493,14 +1503,14 @@ relock:  		goto relock;  	} -	err = generic_write_checks(iocb, from); -	if (err <= 0) { +	ret = generic_write_checks(iocb, from); +	if (ret <= 0) {  		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); -		return err; +		return ret;  	} -	err = btrfs_write_check(iocb, from, err); -	if (err < 0) { +	ret = btrfs_write_check(iocb, from, ret); +	if (ret < 0) {  		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);  		goto out;  	} @@ -1552,15 +1562,15 @@ relock:  	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);  	if (IS_ERR_OR_NULL(dio)) -		err = PTR_ERR_OR_ZERO(dio); +		ret = PTR_ERR_OR_ZERO(dio);  	else -		err = iomap_dio_complete(dio); +		ret = iomap_dio_complete(dio);  	/* No increment (+=) because iomap returns a cumulative value. */ -	if (err > 0) -		written = err; +	if (ret > 0) +		written = ret; -	if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { +	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {  		const size_t left = iov_iter_count(from);  		/*  		 * We have more data left to write. Try to fault in as many as @@ -1577,7 +1587,7 @@ relock:  		 * to buffered IO in case we haven't made any progress.  		 */  		if (left == prev_left) { -			err = -ENOTBLK; +			ret = -ENOTBLK;  		} else {  			fault_in_iov_iter_readable(from, left);  			prev_left = left; @@ -1586,10 +1596,10 @@ relock:  	}  	/* -	 * If 'err' is -ENOTBLK or we have not written all data, then it means +	 * If 'ret' is -ENOTBLK or we have not written all data, then it means  	 * we must fallback to buffered IO.  	 */ -	if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) +	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))  		goto out;  buffered: @@ -1600,14 +1610,14 @@ buffered:  	 * below, we will block when flushing and waiting for the IO.  	 */  	if (iocb->ki_flags & IOCB_NOWAIT) { -		err = -EAGAIN; +		ret = -EAGAIN;  		goto out;  	}  	pos = iocb->ki_pos;  	written_buffered = btrfs_buffered_write(iocb, from);  	if (written_buffered < 0) { -		err = written_buffered; +		ret = written_buffered;  		goto out;  	}  	/* @@ -1615,18 +1625,18 @@ buffered:  	 * able to read what was just written.  	 */  	endbyte = pos + written_buffered - 1; -	err = btrfs_fdatawrite_range(inode, pos, endbyte); -	if (err) +	ret = btrfs_fdatawrite_range(inode, pos, endbyte); +	if (ret)  		goto out; -	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); -	if (err) +	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); +	if (ret)  		goto out;  	written += written_buffered;  	iocb->ki_pos = pos + written_buffered;  	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,  				 endbyte >> PAGE_SHIFT);  out: -	return err < 0 ? err : written; +	return ret < 0 ? ret : written;  }  static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, @@ -2029,6 +2039,172 @@ out_release_extents:  	goto out;  } +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF.  Because + * truncate_setsize() writes the inode size before removing pages, once we have + * the page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) +{ +	struct page *page = vmf->page; +	struct folio *folio = page_folio(page); +	struct inode *inode = file_inode(vmf->vma->vm_file); +	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +	struct btrfs_ordered_extent *ordered; +	struct extent_state *cached_state = NULL; +	struct extent_changeset *data_reserved = NULL; +	unsigned long zero_start; +	loff_t size; +	vm_fault_t ret; +	int ret2; +	int reserved = 0; +	u64 reserved_space; +	u64 page_start; +	u64 page_end; +	u64 end; + +	ASSERT(folio_order(folio) == 0); + +	reserved_space = PAGE_SIZE; + +	sb_start_pagefault(inode->i_sb); +	page_start = page_offset(page); +	page_end = page_start + PAGE_SIZE - 1; +	end = page_end; + +	/* +	 * Reserving delalloc space after obtaining the page lock can lead to +	 * deadlock. For example, if a dirty page is locked by this function +	 * and the call to btrfs_delalloc_reserve_space() ends up triggering +	 * dirty page write out, then the btrfs_writepages() function could +	 * end up waiting indefinitely to get a lock on the page currently +	 * being processed by btrfs_page_mkwrite() function. +	 */ +	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, +					    page_start, reserved_space); +	if (!ret2) { +		ret2 = file_update_time(vmf->vma->vm_file); +		reserved = 1; +	} +	if (ret2) { +		ret = vmf_error(ret2); +		if (reserved) +			goto out; +		goto out_noreserve; +	} + +	/* Make the VM retry the fault. */ +	ret = VM_FAULT_NOPAGE; +again: +	down_read(&BTRFS_I(inode)->i_mmap_lock); +	lock_page(page); +	size = i_size_read(inode); + +	if ((page->mapping != inode->i_mapping) || +	    (page_start >= size)) { +		/* Page got truncated out from underneath us. */ +		goto out_unlock; +	} +	wait_on_page_writeback(page); + +	lock_extent(io_tree, page_start, page_end, &cached_state); +	ret2 = set_page_extent_mapped(page); +	if (ret2 < 0) { +		ret = vmf_error(ret2); +		unlock_extent(io_tree, page_start, page_end, &cached_state); +		goto out_unlock; +	} + +	/* +	 * We can't set the delalloc bits if there are pending ordered +	 * extents.  Drop our locks and wait for them to finish. +	 */ +	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); +	if (ordered) { +		unlock_extent(io_tree, page_start, page_end, &cached_state); +		unlock_page(page); +		up_read(&BTRFS_I(inode)->i_mmap_lock); +		btrfs_start_ordered_extent(ordered); +		btrfs_put_ordered_extent(ordered); +		goto again; +	} + +	if (page->index == ((size - 1) >> PAGE_SHIFT)) { +		reserved_space = round_up(size - page_start, fs_info->sectorsize); +		if (reserved_space < PAGE_SIZE) { +			end = page_start + reserved_space - 1; +			btrfs_delalloc_release_space(BTRFS_I(inode), +					data_reserved, page_start, +					PAGE_SIZE - reserved_space, true); +		} +	} + +	/* +	 * page_mkwrite gets called when the page is firstly dirtied after it's +	 * faulted in, but write(2) could also dirty a page and set delalloc +	 * bits, thus in this case for space account reason, we still need to +	 * clear any delalloc bits within this page range since we have to +	 * reserve data&meta space before lock_page() (see above comments). +	 */ +	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, +			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | +			  EXTENT_DEFRAG, &cached_state); + +	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, +					&cached_state); +	if (ret2) { +		unlock_extent(io_tree, page_start, page_end, &cached_state); +		ret = VM_FAULT_SIGBUS; +		goto out_unlock; +	} + +	/* Page is wholly or partially inside EOF. */ +	if (page_start + PAGE_SIZE > size) +		zero_start = offset_in_page(size); +	else +		zero_start = PAGE_SIZE; + +	if (zero_start != PAGE_SIZE) +		memzero_page(page, zero_start, PAGE_SIZE - zero_start); + +	btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); +	btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); +	btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); + +	btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); + +	unlock_extent(io_tree, page_start, page_end, &cached_state); +	up_read(&BTRFS_I(inode)->i_mmap_lock); + +	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); +	sb_end_pagefault(inode->i_sb); +	extent_changeset_free(data_reserved); +	return VM_FAULT_LOCKED; + +out_unlock: +	unlock_page(page); +	up_read(&BTRFS_I(inode)->i_mmap_lock); +out: +	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); +	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, +				     reserved_space, (ret != 0)); +out_noreserve: +	sb_end_pagefault(inode->i_sb); +	extent_changeset_free(data_reserved); +	return ret; +} +  static const struct vm_operations_struct btrfs_file_vm_ops = {  	.fault		= filemap_fault,  	.map_pages	= filemap_map_pages, @@ -2258,7 +2434,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,  	struct extent_buffer *leaf;  	struct btrfs_key key;  	int slot; -	struct btrfs_ref ref = { 0 };  	int ret;  	if (replace_len == 0) @@ -2314,15 +2489,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,  						       extent_info->qgroup_reserved,  						       &key);  	} else { +		struct btrfs_ref ref = { +			.action = BTRFS_ADD_DELAYED_REF, +			.bytenr = extent_info->disk_offset, +			.num_bytes = extent_info->disk_len, +			.owning_root = btrfs_root_id(root), +			.ref_root = btrfs_root_id(root), +		};  		u64 ref_offset; -		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, -				       extent_info->disk_offset, -				       extent_info->disk_len, 0, -				       root->root_key.objectid);  		ref_offset = extent_info->file_offset - extent_info->data_offset; -		btrfs_init_data_ref(&ref, root->root_key.objectid, -				    btrfs_ino(inode), ref_offset, 0, false); +		btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false);  		ret = btrfs_inc_extent_ref(trans, &ref);  	} @@ -3719,8 +3896,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)  {  	int ret; -	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | -		        FMODE_CAN_ODIRECT; +	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;  	ret = fsverity_file_open(inode, filp);  	if (ret) @@ -3850,6 +4026,7 @@ const struct file_operations btrfs_file_operations = {  	.compat_ioctl	= btrfs_compat_ioctl,  #endif  	.remap_file_range = btrfs_remap_file_range, +	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,  };  int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)  |