Diffstat (limited to 'fs/xfs/xfs_file.c')
 fs/xfs/xfs_file.c | 178 +++++++++++++++++++++++++++++---------------
 1 file changed, 111 insertions(+), 67 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 3b7591224f4a..f0e8249722d4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -41,6 +41,7 @@
 #include <linux/dcache.h>
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
+#include <linux/backing-dev.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -79,14 +80,15 @@ xfs_rw_ilock_demote(
 }
 
 /*
- *	xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
  *
- *	xfs_iozero clears the specified range of buffer supplied,
- *	and marks all the affected blocks as valid and modified.  If
- *	an affected block is not allocated, it will be allocated.  If
- *	an affected block is not completely overwritten, and is not
- *	valid before the operation, it will be read from disk before
- *	being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
  */
 int
 xfs_iozero(
@@ -96,7 +98,8 @@ xfs_iozero(
 {
 	struct page		*page;
 	struct address_space	*mapping;
-	int			status;
+	int			status = 0;
+
 
 	mapping = VFS_I(ip)->i_mapping;
 	do {
@@ -108,20 +111,27 @@ xfs_iozero(
 		if (bytes > count)
 			bytes = count;
 
-		status = pagecache_write_begin(NULL, mapping, pos, bytes,
-					AOP_FLAG_UNINTERRUPTIBLE,
-					&page, &fsdata);
-		if (status)
-			break;
+		if (IS_DAX(VFS_I(ip))) {
+			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+						     xfs_get_blocks_direct);
+			if (status)
+				break;
+		} else {
+			status = pagecache_write_begin(NULL, mapping, pos, bytes,
+						AOP_FLAG_UNINTERRUPTIBLE,
+						&page, &fsdata);
+			if (status)
+				break;
 
-		zero_user(page, offset, bytes);
+			zero_user(page, offset, bytes);
 
-		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
-					page, fsdata);
-		WARN_ON(status <= 0); /* can't return less than zero! */
+			status = pagecache_write_end(NULL, mapping, pos, bytes,
+						bytes, page, fsdata);
+			WARN_ON(status <= 0); /* can't return less than zero! */
+			status = 0;
+		}
 		pos += bytes;
 		count -= bytes;
-		status = 0;
 	} while (count);
 
 	return status;
@@ -138,7 +148,7 @@ xfs_update_prealloc_flags(
 	tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
 	error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
 	if (error) {
-		xfs_trans_cancel(tp, 0);
+		xfs_trans_cancel(tp);
 		return error;
 	}
 
@@ -160,7 +170,7 @@ xfs_update_prealloc_flags(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	if (flags & XFS_PREALLOC_SYNC)
 		xfs_trans_set_sync(tp);
-	return xfs_trans_commit(tp, 0);
+	return xfs_trans_commit(tp);
 }
 
 /*
@@ -284,7 +294,7 @@ xfs_file_read_iter(
 	if (file->f_mode & FMODE_NOCMTIME)
 		ioflags |= XFS_IO_INVIS;
 
-	if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+	if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
 		xfs_buftarg_t	*target =
 			XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -378,7 +388,11 @@ xfs_file_splice_read(
 
 	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
-	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+	/* for dax, we need to avoid the page cache */
+	if (IS_DAX(VFS_I(ip)))
+		ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+	else
+		ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -563,6 +577,13 @@ restart:
 	if (error)
 		return error;
 
+	/* For changing security info in file_remove_privs() we need i_mutex */
+	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
+		xfs_rw_iunlock(ip, *iolock);
+		*iolock = XFS_IOLOCK_EXCL;
+		xfs_rw_ilock(ip, *iolock);
+		goto restart;
+	}
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
@@ -623,7 +644,9 @@ restart:
 	 * setgid bits if the process is not being run by root.  This keeps
 	 * people from modifying setuid and setgid binaries.
 	 */
-	return file_remove_suid(file);
+	if (!IS_NOSEC(inode))
+		return file_remove_privs(file);
+	return 0;
 }
 
 /*
@@ -672,7 +695,7 @@ xfs_file_dio_aio_write(
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
 	/* DIO must be aligned to device logical sector size */
-	if ((pos | count) & target->bt_logical_sectormask)
+	if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
 		return -EINVAL;
 
 	/* "unaligned" here means not aligned to a filesystem block */
@@ -758,8 +781,11 @@ xfs_file_dio_aio_write(
 out:
 	xfs_rw_iunlock(ip, iolock);
 
-	/* No fallback to buffered IO on errors for XFS. */
-	ASSERT(ret < 0 || ret == count);
+	/*
+	 * No fallback to buffered IO on errors for XFS. DAX can result in
+	 * partial writes, but direct IO will either complete fully or fail.
+	 */
+	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
 	return ret;
 }
 
@@ -842,7 +868,7 @@ xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
 		ret = xfs_file_dio_aio_write(iocb, from);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1063,17 +1089,6 @@ xfs_file_readdir(
 	return xfs_readdir(ip, ctx, bufsize);
 }
 
-STATIC int
-xfs_file_mmap(
-	struct file	*filp,
-	struct vm_area_struct *vma)
-{
-	vma->vm_ops = &xfs_file_vm_ops;
-
-	file_accessed(filp);
-	return 0;
-}
-
 /*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
@@ -1454,48 +1469,83 @@ xfs_file_llseek(
  * ordering of:
  *
  * mmap_sem (MM)
- *   i_mmap_lock (XFS - truncate serialisation)
- *     page_lock (MM)
- *       i_lock (XFS - extent map serialisation)
+ *   sb_start_pagefault(vfs, freeze)
+ *     i_mmap_lock (XFS - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
  */
 STATIC int
-xfs_filemap_fault(
+xfs_filemap_page_mkwrite(
 	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
-	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
-	int			error;
+	struct inode		*inode = file_inode(vma->vm_file);
+	int			ret;
 
-	trace_xfs_filemap_fault(ip);
+	trace_xfs_filemap_page_mkwrite(XFS_I(inode));
 
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	error = filemap_fault(vma, vmf);
-	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
-	return error;
+	if (IS_DAX(inode)) {
+		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+				    xfs_end_io_dax_write);
+	} else {
+		ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+		ret = block_page_mkwrite_return(ret);
+	}
+
+	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	sb_end_pagefault(inode->i_sb);
+
+	return ret;
 }
 
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
 STATIC int
-xfs_filemap_page_mkwrite(
+xfs_filemap_fault(
 	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
-	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
-	int			error;
+	struct xfs_inode	*ip = XFS_I(file_inode(vma->vm_file));
+	int			ret;
 
-	trace_xfs_filemap_page_mkwrite(ip);
+	trace_xfs_filemap_fault(ip);
+
+	/* DAX can shortcut the normal fault path on write faults! */
+	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
+		return xfs_filemap_page_mkwrite(vma, vmf);
 
 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+	ret = filemap_fault(vma, vmf);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 
-	return error;
+	return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+	.fault		= xfs_filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
	struct file	*filp,
+	struct vm_area_struct *vma)
+{
+	file_accessed(filp);
+	vma->vm_ops = &xfs_file_vm_ops;
+	if (IS_DAX(file_inode(filp)))
+		vma->vm_flags |= VM_MIXEDMAP;
+	return 0;
 }
 
 const struct file_operations xfs_file_operations = {
@@ -1526,9 +1576,3 @@ const struct file_operations xfs_dir_file_operations = {
 #endif
 	.fsync		= xfs_dir_fsync,
 };
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
-	.fault		= xfs_filemap_fault,
-	.map_pages	= filemap_map_pages,
-	.page_mkwrite	= xfs_filemap_page_mkwrite,
-};
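A note on the relaxed alignment check in xfs_file_dio_aio_write() above: with this change, the EINVAL return for file offsets or lengths that are not logical-sector aligned is skipped whenever the inode is DAX, so direct writes at arbitrary byte granularity become possible on DAX mounts. The userspace sketch below is not part of the patch; the file path, the 512-byte logical-sector assumption, and the mount setup are illustrative. On a non-DAX XFS mount the unaligned pwrite() should fail with EINVAL; on an XFS filesystem mounted with "-o dax" on a kernel carrying this change it can succeed.

/*
 * probe_dio_align.c - minimal sketch probing the O_DIRECT offset
 * alignment behaviour changed by this patch. Assumes a device with
 * 512-byte logical sectors. Build: cc -o probe_dio_align probe_dio_align.c
 */
#define _GNU_SOURCE		/* for O_DIRECT */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* Hypothetical default path; pass a file on the mount under test. */
	const char *path = argc > 1 ? argv[1] : "/mnt/xfs/testfile";
	void *buf;
	ssize_t ret;
	int fd;

	/*
	 * Keep the memory buffer sector-aligned so that only the *file
	 * offset* alignment is exercised, not the buffer alignment.
	 */
	if (posix_memalign(&buf, 512, 512))
		return 1;
	memset(buf, 0xaa, 512);

	fd = open(path, O_CREAT | O_WRONLY | O_DIRECT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * File offset 1 is not sector-aligned: the check patched above
	 * returns EINVAL unless IS_DAX(inode) is true.
	 */
	ret = pwrite(fd, buf, 512, 1);
	if (ret < 0)
		printf("unaligned O_DIRECT pwrite: %s\n", strerror(errno));
	else
		printf("unaligned O_DIRECT pwrite: %zd bytes written\n", ret);

	close(fd);
	free(buf);
	return 0;
}

Running the probe against otherwise identical mounts with and without "-o dax" shows the two code paths the hunk separates: the non-DAX path still enforces bt_logical_sectormask, while the DAX path bypasses the block layer and has no sector-alignment requirement.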