Diffstat (limited to 'fs/direct-io.c')
-rw-r--r--   fs/direct-io.c   75
1 file changed, 62 insertions(+), 13 deletions(-)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 5fa2211e49ae..b53e66d9abd7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -45,6 +45,12 @@
 #define DIO_PAGES	64
 
 /*
+ * Flags for dio_complete()
+ */
+#define DIO_COMPLETE_ASYNC		0x01	/* This is async IO */
+#define DIO_COMPLETE_INVALIDATE		0x02	/* Can invalidate pages */
+
+/*
  * This code generally works in units of "dio_blocks".  A dio_block is
  * somewhere between the hard sector size and the filesystem block size.  it
  * is determined on a per-invocation basis.   When talking to the filesystem
@@ -225,10 +231,11 @@ static inline struct page *dio_get_page(struct dio *dio,
  * filesystems can use it to hold additional state between get_block calls and
  * dio_complete.
  */
-static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
 {
 	loff_t offset = dio->iocb->ki_pos;
 	ssize_t transferred = 0;
+	int err;
 
 	/*
 	 * AIO submission can race with bio completion to get here while
@@ -259,18 +266,37 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 		ret = transferred;
 
 	if (dio->end_io) {
-		int err;
-
 		// XXX: ki_pos??
 		err = dio->end_io(dio->iocb, offset, ret, dio->private);
 		if (err)
 			ret = err;
 	}
 
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing.  Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
+	 * this invalidation fails, tough, the write still worked...
+	 *
+	 * And this page cache invalidation has to be after dio->end_io(), as
+	 * some filesystems convert unwritten extents to real allocations in
+	 * end_io() when necessary, otherwise a racing buffer read would cache
+	 * zeros from unwritten extents.
+	 */
+	if (flags & DIO_COMPLETE_INVALIDATE &&
+	    ret > 0 && dio->op == REQ_OP_WRITE &&
+	    dio->inode->i_mapping->nrpages) {
+		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
+					offset >> PAGE_SHIFT,
+					(offset + ret - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(err);
+	}
+
 	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
 		inode_dio_end(dio->inode);
 
-	if (is_async) {
+	if (flags & DIO_COMPLETE_ASYNC) {
 		/*
 		 * generic_write_sync expects ki_pos to have been updated
 		 * already, but the submission path only does this for
@@ -291,7 +317,7 @@ static void dio_aio_complete_work(struct work_struct *work)
 {
 	struct dio *dio = container_of(work, struct dio, complete_work);
 
-	dio_complete(dio, 0, true);
+	dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
 }
 
 static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
@@ -304,6 +330,7 @@ static void dio_bio_end_aio(struct bio *bio)
 	struct dio *dio = bio->bi_private;
 	unsigned long remaining;
 	unsigned long flags;
+	bool defer_completion = false;
 
 	/* cleanup the bio */
 	dio_bio_complete(dio, bio);
@@ -315,12 +342,24 @@ static void dio_bio_end_aio(struct bio *bio)
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (remaining == 0) {
-		if (dio->result && dio->defer_completion) {
+		/*
+		 * Defer completion when defer_completion is set or
+		 * when the inode has pages mapped and this is AIO write.
+		 * We need to invalidate those pages because there is a
+		 * chance they contain stale data in the case buffered IO
+		 * went in between AIO submission and completion into the
+		 * same region.
+		 */
+		if (dio->result)
+			defer_completion = dio->defer_completion ||
+					   (dio->op == REQ_OP_WRITE &&
+					    dio->inode->i_mapping->nrpages);
+		if (defer_completion) {
 			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
 			queue_work(dio->inode->i_sb->s_dio_done_wq,
 				   &dio->complete_work);
 		} else {
-			dio_complete(dio, 0, true);
+			dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
 		}
 	}
 }
@@ -838,7 +877,8 @@ out:
 	 */
 	if (sdio->boundary) {
 		ret = dio_send_cur_page(dio, sdio, map_bh);
-		dio_bio_submit(dio, sdio);
+		if (sdio->bio)
+			dio_bio_submit(dio, sdio);
 		put_page(sdio->cur_page);
 		sdio->cur_page = NULL;
 	}
@@ -1210,10 +1250,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
 	 * so that we can call ->fsync.
 	 */
-	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
-	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
-	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
-		retval = dio_set_defer_completion(dio);
+	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
+		retval = 0;
+		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
+		    IS_SYNC(iocb->ki_filp->f_mapping->host))
+			retval = dio_set_defer_completion(dio);
+		else if (!dio->inode->i_sb->s_dio_done_wq) {
+			/*
+			 * In case of AIO write racing with buffered read we
+			 * need to defer completion. We can't decide this now,
+			 * however the workqueue needs to be initialized here.
+			 */
+			retval = sb_init_dio_done_wq(dio->inode->i_sb);
+		}
 		if (retval) {
 			/*
 			 * We grab i_mutex only for reads so we don't have
@@ -1322,7 +1371,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		dio_await_completion(dio);
 
 	if (drop_refcount(dio) == 0) {
-		retval = dio_complete(dio, retval, false);
+		retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
 	} else
 		BUG_ON(retval != -EIOCBQUEUED);
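
For reference, invalidate_inode_pages2_range() works in page-cache indices, not bytes, which is why the patch shifts by PAGE_SHIFT and uses (offset + ret - 1) for the last index. A small standalone sketch of the same arithmetic, assuming 4 KiB pages (PAGE_SHIFT == 12) and made-up offset/length values:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumption: 4 KiB pages */

int main(void)
{
	long long offset = 5000;	/* write starts mid-page 1 */
	long long ret = 9000;		/* bytes written */

	/* Same computation as the patch: first and last page index
	 * covered by the byte range [offset, offset + ret).
	 */
	long long first = offset >> PAGE_SHIFT;
	long long last = (offset + ret - 1) >> PAGE_SHIFT;

	printf("invalidate pages %lld..%lld\n", first, last);	/* 1..3 */
	return 0;
}

The "- 1" matters: a range ending exactly on a page boundary would otherwise needlessly invalidate one page past the write.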
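The scenario the patch defends against can also be sketched from userspace. The reproducer below is illustrative only and not part of the commit: it submits an AIO O_DIRECT write with libaio, then issues a buffered read of the same range while the write may still be in flight. Without this change, that read can repopulate the page cache with stale data that a later buffered read still sees; with it, dio_complete() invalidates the range on completion. File name, sizes, and error handling are simplified, and actually hitting the window depends on timing.

/* Hypothetical sketch of a buffered read racing an AIO O_DIRECT write.
 * Build: gcc -o dio_race dio_race.c -laio
 * Assumes a filesystem honoring O_DIRECT with 4096-byte alignment.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define SZ 4096

int main(void)
{
	char readbuf[SZ];
	void *dbuf;
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;

	int bfd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	int dfd = open("testfile", O_RDWR | O_DIRECT);
	if (bfd < 0 || dfd < 0 || posix_memalign(&dbuf, SZ, SZ))
		exit(1);

	/* Put known data ('A') on disk first. */
	memset(dbuf, 'A', SZ);
	pwrite(bfd, dbuf, SZ, 0);
	fsync(bfd);

	/* Submit an AIO O_DIRECT overwrite of the same range with 'B'. */
	memset(dbuf, 'B', SZ);
	if (io_setup(1, &ctx))
		exit(1);
	io_prep_pwrite(&cb, dfd, dbuf, SZ, 0);
	if (io_submit(ctx, 1, cbs) != 1)
		exit(1);

	/* Buffered read between submission and completion: this can cache
	 * the stale 'A' page while the direct write is still in flight.
	 */
	pread(bfd, readbuf, SZ, 0);

	/* Wait for the AIO write to complete. */
	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		exit(1);

	/* Re-read through the cache. Without the fix the completion path
	 * did not invalidate the cached page, so this could still report
	 * 'A'; with the fix it reports 'B'.
	 */
	pread(bfd, readbuf, SZ, 0);
	printf("buffered read sees: %c\n", readbuf[0]);

	io_destroy(ctx);
	return 0;
}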