diff options
Diffstat (limited to 'fs/xfs/xfs_file.c')
| -rw-r--r-- | fs/xfs/xfs_file.c | 349 | 
1 files changed, 240 insertions, 109 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e612a0233710..a314fc7b56fa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -38,6 +38,7 @@  #include "xfs_icache.h"  #include "xfs_pnfs.h"  #include "xfs_iomap.h" +#include "xfs_reflink.h"  #include <linux/dcache.h>  #include <linux/falloc.h> @@ -269,6 +270,8 @@ xfs_file_dio_aio_read(  		return -EINVAL;  	} +	file_accessed(iocb->ki_filp); +  	/*  	 * Locking is a bit tricky here. If we take an exclusive lock for direct  	 * IO, we effectively serialise all new concurrent read IO to this file @@ -317,13 +320,12 @@ xfs_file_dio_aio_read(  	data = *to;  	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,  			xfs_get_blocks_direct, NULL, NULL, 0); -	if (ret > 0) { +	if (ret >= 0) {  		iocb->ki_pos += ret;  		iov_iter_advance(to, ret);  	}  	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); -	file_accessed(iocb->ki_filp);  	return ret;  } @@ -332,10 +334,7 @@ xfs_file_dax_read(  	struct kiocb		*iocb,  	struct iov_iter		*to)  { -	struct address_space	*mapping = iocb->ki_filp->f_mapping; -	struct inode		*inode = mapping->host; -	struct xfs_inode	*ip = XFS_I(inode); -	struct iov_iter		data = *to; +	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);  	size_t			count = iov_iter_count(to);  	ssize_t			ret = 0; @@ -345,11 +344,7 @@ xfs_file_dax_read(  		return 0; /* skip atime */  	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); -	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); -	if (ret > 0) { -		iocb->ki_pos += ret; -		iov_iter_advance(to, ret); -	} +	ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);  	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);  	file_accessed(iocb->ki_filp); @@ -399,45 +394,6 @@ xfs_file_read_iter(  	return ret;  } -STATIC ssize_t -xfs_file_splice_read( -	struct file		*infilp, -	loff_t			*ppos, -	struct pipe_inode_info	*pipe, -	size_t			count, -	unsigned int		flags) -{ -	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host); -	ssize_t			ret; - -	XFS_STATS_INC(ip->i_mount, xs_read_calls); - -	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) -		return -EIO; - -	trace_xfs_file_splice_read(ip, count, *ppos); - -	/* -	 * DAX inodes cannot ues the page cache for splice, so we have to push -	 * them through the VFS IO path. This means it goes through -	 * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we -	 * cannot lock the splice operation at this level for DAX inodes. -	 */ -	if (IS_DAX(VFS_I(ip))) { -		ret = default_file_splice_read(infilp, ppos, pipe, count, -					       flags); -		goto out; -	} - -	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); -	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); -	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); -out: -	if (ret > 0) -		XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); -	return ret; -} -  /*   * Zero any on disk space between the current EOF and the new, larger EOF.   * @@ -679,6 +635,13 @@ xfs_file_dio_aio_write(  	trace_xfs_file_direct_write(ip, count, iocb->ki_pos); +	/* If this is a block-aligned directio CoW, remap immediately. */ +	if (xfs_is_reflink_inode(ip) && !unaligned_io) { +		ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count); +		if (ret) +			goto out; +	} +  	data = *from;  	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,  			xfs_get_blocks_direct, xfs_end_io_direct_write, @@ -711,70 +674,32 @@ xfs_file_dax_write(  	struct kiocb		*iocb,  	struct iov_iter		*from)  { -	struct address_space	*mapping = iocb->ki_filp->f_mapping; -	struct inode		*inode = mapping->host; +	struct inode		*inode = iocb->ki_filp->f_mapping->host;  	struct xfs_inode	*ip = XFS_I(inode); -	struct xfs_mount	*mp = ip->i_mount; -	ssize_t			ret = 0; -	int			unaligned_io = 0; -	int			iolock; -	struct iov_iter		data; +	int			iolock = XFS_IOLOCK_EXCL; +	ssize_t			ret, error = 0; +	size_t			count; +	loff_t			pos; -	/* "unaligned" here means not aligned to a filesystem block */ -	if ((iocb->ki_pos & mp->m_blockmask) || -	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { -		unaligned_io = 1; -		iolock = XFS_IOLOCK_EXCL; -	} else if (mapping->nrpages) { -		iolock = XFS_IOLOCK_EXCL; -	} else { -		iolock = XFS_IOLOCK_SHARED; -	}  	xfs_rw_ilock(ip, iolock); -  	ret = xfs_file_aio_write_checks(iocb, from, &iolock);  	if (ret)  		goto out; -	/* -	 * Yes, even DAX files can have page cache attached to them:  A zeroed -	 * page is inserted into the pagecache when we have to serve a write -	 * fault on a hole.  It should never be dirtied and can simply be -	 * dropped from the pagecache once we get real data for the page. -	 * -	 * XXX: This is racy against mmap, and there's nothing we can do about -	 * it. dax_do_io() should really do this invalidation internally as -	 * it will know if we've allocated over a holei for this specific IO and -	 * if so it needs to update the mapping tree and invalidate existing -	 * PTEs over the newly allocated range. Remove this invalidation when -	 * dax_do_io() is fixed up. -	 */ -	if (mapping->nrpages) { -		loff_t end = iocb->ki_pos + iov_iter_count(from) - 1; +	pos = iocb->ki_pos; +	count = iov_iter_count(from); -		ret = invalidate_inode_pages2_range(mapping, -						    iocb->ki_pos >> PAGE_SHIFT, -						    end >> PAGE_SHIFT); -		WARN_ON_ONCE(ret); -	} +	trace_xfs_file_dax_write(ip, count, pos); -	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { -		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); -		iolock = XFS_IOLOCK_SHARED; +	ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops); +	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { +		i_size_write(inode, iocb->ki_pos); +		error = xfs_setfilesize(ip, pos, ret);  	} -	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); - -	data = *from; -	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, -			xfs_end_io_direct_write, 0); -	if (ret > 0) { -		iocb->ki_pos += ret; -		iov_iter_advance(from, ret); -	}  out:  	xfs_rw_iunlock(ip, iolock); -	return ret; +	return error ? error : ret;  }  STATIC ssize_t @@ -818,6 +743,9 @@ write_retry:  		enospc = xfs_inode_free_quota_eofblocks(ip);  		if (enospc)  			goto write_retry; +		enospc = xfs_inode_free_quota_cowblocks(ip); +		if (enospc) +			goto write_retry;  	} else if (ret == -ENOSPC && !enospc) {  		struct xfs_eofblocks eofb = {0}; @@ -857,10 +785,20 @@ xfs_file_write_iter(  	if (IS_DAX(inode))  		ret = xfs_file_dax_write(iocb, from); -	else if (iocb->ki_flags & IOCB_DIRECT) +	else if (iocb->ki_flags & IOCB_DIRECT) { +		/* +		 * Allow a directio write to fall back to a buffered +		 * write *only* in the case that we're doing a reflink +		 * CoW.  In all other directio scenarios we do not +		 * allow an operation to fall back to buffered mode. +		 */  		ret = xfs_file_dio_aio_write(iocb, from); -	else +		if (ret == -EREMCHG) +			goto buffered; +	} else { +buffered:  		ret = xfs_file_buffered_aio_write(iocb, from); +	}  	if (ret > 0) {  		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); @@ -874,7 +812,7 @@ xfs_file_write_iter(  #define	XFS_FALLOC_FL_SUPPORTED						\  		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\  		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\ -		 FALLOC_FL_INSERT_RANGE) +		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)  STATIC long  xfs_file_fallocate( @@ -964,9 +902,15 @@ xfs_file_fallocate(  		if (mode & FALLOC_FL_ZERO_RANGE)  			error = xfs_zero_file_space(ip, offset, len); -		else +		else { +			if (mode & FALLOC_FL_UNSHARE_RANGE) { +				error = xfs_reflink_unshare(ip, offset, len); +				if (error) +					goto out_unlock; +			}  			error = xfs_alloc_file_space(ip, offset, len,  						     XFS_BMAPI_PREALLOC); +		}  		if (error)  			goto out_unlock;  	} @@ -984,7 +928,7 @@ xfs_file_fallocate(  		iattr.ia_valid = ATTR_SIZE;  		iattr.ia_size = new_size; -		error = xfs_setattr_size(ip, &iattr); +		error = xfs_vn_setattr_size(file_dentry(file), &iattr);  		if (error)  			goto out_unlock;  	} @@ -1003,6 +947,189 @@ out_unlock:  	return error;  } +/* + * Flush all file writes out to disk. + */ +static int +xfs_file_wait_for_io( +	struct inode	*inode, +	loff_t		offset, +	size_t		len) +{ +	loff_t		rounding; +	loff_t		ioffset; +	loff_t		iendoffset; +	loff_t		bs; +	int		ret; + +	bs = inode->i_sb->s_blocksize; +	inode_dio_wait(inode); + +	rounding = max_t(xfs_off_t, bs, PAGE_SIZE); +	ioffset = round_down(offset, rounding); +	iendoffset = round_up(offset + len, rounding) - 1; +	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, +					   iendoffset); +	return ret; +} + +/* Hook up to the VFS reflink function */ +STATIC int +xfs_file_share_range( +	struct file	*file_in, +	loff_t		pos_in, +	struct file	*file_out, +	loff_t		pos_out, +	u64		len, +	bool		is_dedupe) +{ +	struct inode	*inode_in; +	struct inode	*inode_out; +	ssize_t		ret; +	loff_t		bs; +	loff_t		isize; +	int		same_inode; +	loff_t		blen; +	unsigned int	flags = 0; + +	inode_in = file_inode(file_in); +	inode_out = file_inode(file_out); +	bs = inode_out->i_sb->s_blocksize; + +	/* Don't touch certain kinds of inodes */ +	if (IS_IMMUTABLE(inode_out)) +		return -EPERM; +	if (IS_SWAPFILE(inode_in) || +	    IS_SWAPFILE(inode_out)) +		return -ETXTBSY; + +	/* Reflink only works within this filesystem. */ +	if (inode_in->i_sb != inode_out->i_sb) +		return -EXDEV; +	same_inode = (inode_in->i_ino == inode_out->i_ino); + +	/* Don't reflink dirs, pipes, sockets... */ +	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) +		return -EISDIR; +	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) +		return -EINVAL; +	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) +		return -EINVAL; + +	/* Don't share DAX file data for now. */ +	if (IS_DAX(inode_in) || IS_DAX(inode_out)) +		return -EINVAL; + +	/* Are we going all the way to the end? */ +	isize = i_size_read(inode_in); +	if (isize == 0) +		return 0; +	if (len == 0) +		len = isize - pos_in; + +	/* Ensure offsets don't wrap and the input is inside i_size */ +	if (pos_in + len < pos_in || pos_out + len < pos_out || +	    pos_in + len > isize) +		return -EINVAL; + +	/* Don't allow dedupe past EOF in the dest file */ +	if (is_dedupe) { +		loff_t	disize; + +		disize = i_size_read(inode_out); +		if (pos_out >= disize || pos_out + len > disize) +			return -EINVAL; +	} + +	/* If we're linking to EOF, continue to the block boundary. */ +	if (pos_in + len == isize) +		blen = ALIGN(isize, bs) - pos_in; +	else +		blen = len; + +	/* Only reflink if we're aligned to block boundaries */ +	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || +	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) +		return -EINVAL; + +	/* Don't allow overlapped reflink within the same file */ +	if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen) +		return -EINVAL; + +	/* Wait for the completion of any pending IOs on srcfile */ +	ret = xfs_file_wait_for_io(inode_in, pos_in, len); +	if (ret) +		goto out; +	ret = xfs_file_wait_for_io(inode_out, pos_out, len); +	if (ret) +		goto out; + +	if (is_dedupe) +		flags |= XFS_REFLINK_DEDUPE; +	ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out), +			pos_out, len, flags); +	if (ret < 0) +		goto out; + +out: +	return ret; +} + +STATIC ssize_t +xfs_file_copy_range( +	struct file	*file_in, +	loff_t		pos_in, +	struct file	*file_out, +	loff_t		pos_out, +	size_t		len, +	unsigned int	flags) +{ +	int		error; + +	error = xfs_file_share_range(file_in, pos_in, file_out, pos_out, +				     len, false); +	if (error) +		return error; +	return len; +} + +STATIC int +xfs_file_clone_range( +	struct file	*file_in, +	loff_t		pos_in, +	struct file	*file_out, +	loff_t		pos_out, +	u64		len) +{ +	return xfs_file_share_range(file_in, pos_in, file_out, pos_out, +				     len, false); +} + +#define XFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024) +STATIC ssize_t +xfs_file_dedupe_range( +	struct file	*src_file, +	u64		loff, +	u64		len, +	struct file	*dst_file, +	u64		dst_loff) +{ +	int		error; + +	/* +	 * Limit the total length we will dedupe for each operation. +	 * This is intended to bound the total time spent in this +	 * ioctl to something sane. +	 */ +	if (len > XFS_MAX_DEDUPE_LEN) +		len = XFS_MAX_DEDUPE_LEN; + +	error = xfs_file_share_range(src_file, loff, dst_file, dst_loff, +				     len, true); +	if (error) +		return error; +	return len; +}  STATIC int  xfs_file_open( @@ -1513,7 +1640,7 @@ xfs_filemap_page_mkwrite(  	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);  	if (IS_DAX(inode)) { -		ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); +		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);  	} else {  		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);  		ret = block_page_mkwrite_return(ret); @@ -1547,7 +1674,7 @@ xfs_filemap_fault(  		 * changes to xfs_get_blocks_direct() to map unwritten extent  		 * ioend for conversion on read-only mappings.  		 */ -		ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault); +		ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);  	} else  		ret = filemap_fault(vma, vmf);  	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1652,7 +1779,7 @@ const struct file_operations xfs_file_operations = {  	.llseek		= xfs_file_llseek,  	.read_iter	= xfs_file_read_iter,  	.write_iter	= xfs_file_write_iter, -	.splice_read	= xfs_file_splice_read, +	.splice_read	= generic_file_splice_read,  	.splice_write	= iter_file_splice_write,  	.unlocked_ioctl	= xfs_file_ioctl,  #ifdef CONFIG_COMPAT @@ -1662,7 +1789,11 @@ const struct file_operations xfs_file_operations = {  	.open		= xfs_file_open,  	.release	= xfs_file_release,  	.fsync		= xfs_file_fsync, +	.get_unmapped_area = thp_get_unmapped_area,  	.fallocate	= xfs_file_fallocate, +	.copy_file_range = xfs_file_copy_range, +	.clone_file_range = xfs_file_clone_range, +	.dedupe_file_range = xfs_file_dedupe_range,  };  const struct file_operations xfs_dir_file_operations = {  |