aboutsummaryrefslogtreecommitdiff
path: root/fs/iomap.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/iomap.c')
-rw-r--r--fs/iomap.c159
1 files changed, 118 insertions, 41 deletions
diff --git a/fs/iomap.c b/fs/iomap.c
index ec15cf2ec696..97cb9d486a7d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -30,7 +30,6 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include <linux/sched/signal.h>
-#include <linux/swap.h>
#include "internal.h"
@@ -117,6 +116,12 @@ iomap_page_create(struct inode *inode, struct page *page)
atomic_set(&iop->read_count, 0);
atomic_set(&iop->write_count, 0);
bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+
+ /*
+ * migrate_page_move_mapping() assumes that pages with private data have
+ * their count elevated by 1.
+ */
+ get_page(page);
set_page_private(page, (unsigned long)iop);
SetPagePrivate(page);
return iop;
@@ -133,6 +138,7 @@ iomap_page_release(struct page *page)
WARN_ON_ONCE(atomic_read(&iop->write_count));
ClearPagePrivate(page);
set_page_private(page, 0);
+ put_page(page);
kfree(iop);
}
@@ -143,13 +149,14 @@ static void
iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
{
+ loff_t orig_pos = *pos;
+ loff_t isize = i_size_read(inode);
unsigned block_bits = inode->i_blkbits;
unsigned block_size = (1 << block_bits);
unsigned poff = offset_in_page(*pos);
unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
unsigned first = poff >> block_bits;
unsigned last = (poff + plen - 1) >> block_bits;
- unsigned end = offset_in_page(i_size_read(inode)) >> block_bits;
/*
* If the block size is smaller than the page size we need to check the
@@ -184,8 +191,12 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
* handle both halves separately so that we properly zero data in the
* page cache for blocks that are entirely outside of i_size.
*/
- if (first <= end && last > end)
- plen -= (last - end) * block_size;
+ if (orig_pos <= isize && orig_pos + length > isize) {
+ unsigned end = offset_in_page(isize - 1) >> block_bits;
+
+ if (first <= end && last > end)
+ plen -= (last - end) * block_size;
+ }
*offp = poff;
*lenp = plen;
@@ -263,8 +274,9 @@ iomap_read_end_io(struct bio *bio)
int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
iomap_read_page_end_io(bvec, error);
bio_put(bio);
}
@@ -313,7 +325,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
*/
sector = iomap_sector(iomap, pos);
if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
- if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+ if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
goto done;
is_contig = true;
}
@@ -344,7 +356,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
ctx->bio->bi_end_io = iomap_read_end_io;
}
- __bio_add_page(ctx->bio, page, plen, poff);
+ bio_add_page(ctx->bio, page, plen, poff);
done:
/*
* Move the caller beyond our range so that it keeps making progress.
@@ -488,16 +500,29 @@ done:
}
EXPORT_SYMBOL_GPL(iomap_readpages);
+/*
+ * iomap_is_partially_uptodate checks whether blocks within a page are
+ * uptodate or not.
+ *
+ * Returns true if all blocks which correspond to a file portion
+ * we want to read within the page are uptodate.
+ */
int
iomap_is_partially_uptodate(struct page *page, unsigned long from,
unsigned long count)
{
struct iomap_page *iop = to_iomap_page(page);
struct inode *inode = page->mapping->host;
- unsigned first = from >> inode->i_blkbits;
- unsigned last = (from + count - 1) >> inode->i_blkbits;
+ unsigned len, first, last;
unsigned i;
+ /* Limit range to one page */
+ len = min_t(unsigned, PAGE_SIZE - from, count);
+
+ /* First and last blocks in range within page */
+ first = from >> inode->i_blkbits;
+ last = (from + len - 1) >> inode->i_blkbits;
+
if (iop) {
for (i = first; i <= last; i++)
if (!test_bit(i, iop->uptodate))
@@ -546,14 +571,16 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
{
int ret;
- ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
if (page_has_private(page)) {
ClearPagePrivate(page);
+ get_page(newpage);
set_page_private(newpage, page_private(page));
set_page_private(page, 0);
+ put_page(page);
SetPagePrivate(newpage);
}
@@ -1057,7 +1084,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
return length;
}
-int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -1437,6 +1464,28 @@ struct iomap_dio {
};
};
+int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
+{
+ struct request_queue *q = READ_ONCE(kiocb->private);
+
+ if (!q)
+ return 0;
+ return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
+}
+EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
+
+static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
+ struct bio *bio)
+{
+ atomic_inc(&dio->ref);
+
+ if (dio->iocb->ki_flags & IOCB_HIPRI)
+ bio_set_polled(bio, dio->iocb);
+
+ dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+ dio->submit.cookie = submit_bio(bio);
+}
+
static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
struct kiocb *iocb = dio->iocb;
@@ -1526,7 +1575,7 @@ static void iomap_dio_bio_end_io(struct bio *bio)
if (dio->wait_for_completion) {
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->submit.waiter, NULL);
- wake_up_process(waiter);
+ blk_wake_io_task(waiter);
} else if (dio->flags & IOMAP_DIO_WRITE) {
struct inode *inode = file_inode(dio->iocb->ki_filp);
@@ -1542,18 +1591,20 @@ static void iomap_dio_bio_end_io(struct bio *bio)
} else {
struct bio_vec *bvec;
int i;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page);
bio_put(bio);
}
}
-static blk_qc_t
+static void
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
unsigned len)
{
struct page *page = ZERO_PAGE(0);
+ int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
bio = bio_alloc(GFP_KERNEL, 1);
@@ -1564,10 +1615,8 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
get_page(page);
__bio_add_page(bio, page, len, 0);
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
-
- atomic_inc(&dio->ref);
- return submit_bio(bio);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
+ iomap_dio_submit_bio(dio, iomap, bio);
}
static loff_t
@@ -1581,7 +1630,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
struct bio *bio;
bool need_zeroout = false;
bool use_fua = false;
- int nr_pages, ret;
+ int nr_pages, ret = 0;
size_t copied = 0;
if ((pos | length | align) & ((1 << blkbits) - 1))
@@ -1597,12 +1646,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
if (iomap->flags & IOMAP_F_NEW) {
need_zeroout = true;
- } else {
+ } else if (iomap->type == IOMAP_MAPPED) {
/*
- * Use a FUA write if we need datasync semantics, this
- * is a pure data IO that doesn't require any metadata
- * updates and the underlying device supports FUA. This
- * allows us to avoid cache flushes on IO completion.
+ * Use a FUA write if we need datasync semantics, this is a pure
+ * data IO that doesn't require any metadata updates (including
+ * after IO completion such as unwritten extent conversion) and
+ * the underlying device supports FUA. This allows us to avoid
+ * cache flushes on IO completion.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
(dio->flags & IOMAP_DIO_WRITE_FUA) &&
@@ -1645,8 +1695,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
ret = bio_iov_iter_get_pages(bio, &iter);
if (unlikely(ret)) {
+ /*
+ * We have to stop part way through an IO. We must fall
+ * through to the sub-block tail zeroing here, otherwise
+ * this short IO may expose stale data in the tail of
+ * the block we haven't written data to.
+ */
bio_put(bio);
- return copied ? copied : ret;
+ goto zero_tail;
}
n = bio->bi_iter.bi_size;
@@ -1670,20 +1726,24 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
copied += n;
nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-
- atomic_inc(&dio->ref);
-
- dio->submit.last_queue = bdev_get_queue(iomap->bdev);
- dio->submit.cookie = submit_bio(bio);
+ iomap_dio_submit_bio(dio, iomap, bio);
} while (nr_pages);
- if (need_zeroout) {
+ /*
+ * We need to zeroout the tail of a sub-block write if the extent type
+ * requires zeroing or the write extends beyond EOF. If we don't zero
+ * the block tail in the latter case, we can expose stale data via mmap
+ * reads of the EOF block.
+ */
+zero_tail:
+ if (need_zeroout ||
+ ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
/* zero out from the end of the write to the end of the block */
pad = pos & (fs_block_size - 1);
if (pad)
iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
}
- return copied;
+ return copied ? copied : ret;
}
static loff_t
@@ -1765,6 +1825,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos, start = pos;
loff_t end = iocb->ki_pos + count - 1, ret = 0;
unsigned int flags = IOMAP_DIRECT;
+ bool wait_for_completion = is_sync_kiocb(iocb);
struct blk_plug plug;
struct iomap_dio *dio;
@@ -1784,7 +1845,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->end_io = end_io;
dio->error = 0;
dio->flags = 0;
- dio->wait_for_completion = is_sync_kiocb(iocb);
dio->submit.iter = iter;
dio->submit.waiter = current;
@@ -1795,7 +1855,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (pos >= dio->i_size)
goto out_free_dio;
- if (iter->type == ITER_IOVEC)
+ if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
dio->flags |= IOMAP_DIO_DIRTY;
} else {
flags |= IOMAP_WRITE;
@@ -1839,7 +1899,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio_warn_stale_pagecache(iocb->ki_filp);
ret = 0;
- if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion &&
+ if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
!inode->i_sb->s_dio_done_wq) {
ret = sb_init_dio_done_wq(inode->i_sb);
if (ret < 0)
@@ -1855,7 +1915,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret <= 0) {
/* magic error code to fall back to buffered I/O */
if (ret == -ENOTBLK) {
- dio->wait_for_completion = true;
+ wait_for_completion = true;
ret = 0;
}
break;
@@ -1877,8 +1937,27 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio->flags & IOMAP_DIO_WRITE_FUA)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+ WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
+ WRITE_ONCE(iocb->private, dio->submit.last_queue);
+
+ /*
+ * We are about to drop our additional submission reference, which
+ * might be the last reference to the dio. There are three three
+ * different ways we can progress here:
+ *
+ * (a) If this is the last reference we will always complete and free
+ * the dio ourselves.
+ * (b) If this is not the last reference, and we serve an asynchronous
+ * iocb, we must never touch the dio after the decrement, the
+ * I/O completion handler will complete and free it.
+ * (c) If this is not the last reference, but we serve a synchronous
+ * iocb, the I/O completion handler will wake us up on the drop
+ * of the final reference, and we will complete and free it here
+ * after we got woken by the I/O completion handler.
+ */
+ dio->wait_for_completion = wait_for_completion;
if (!atomic_dec_and_test(&dio->ref)) {
- if (!dio->wait_for_completion)
+ if (!wait_for_completion)
return -EIOCBQUEUED;
for (;;) {
@@ -1889,15 +1968,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!(iocb->ki_flags & IOCB_HIPRI) ||
!dio->submit.last_queue ||
!blk_poll(dio->submit.last_queue,
- dio->submit.cookie))
+ dio->submit.cookie, true))
io_schedule();
}
__set_current_state(TASK_RUNNING);
}
- ret = iomap_dio_complete(dio);
-
- return ret;
+ return iomap_dio_complete(dio);
out_free_dio:
kfree(dio);