Diffstat (limited to 'fs/iomap/buffered-io.c')
 fs/iomap/buffered-io.c | 199 ++++++++++++++++++++++++-----------------
 1 file changed, 123 insertions(+), 76 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9b4ca3811a24..11ea747228ae 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -23,7 +23,6 @@
#define IOEND_BATCH_SIZE 4096
-typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
/*
* Structure allocated for each folio to track per-block uptodate, dirty state
* and I/O completions.
@@ -1022,13 +1021,14 @@ retry:
ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops, void *private)
{
struct iomap_iter iter = {
.inode = iocb->ki_filp->f_mapping->host,
.pos = iocb->ki_pos,
.len = iov_iter_count(i),
.flags = IOMAP_WRITE,
+ .private = private,
};
ssize_t ret;
@@ -1046,15 +1046,14 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
-static int iomap_write_delalloc_ifs_punch(struct inode *inode,
+static void iomap_write_delalloc_ifs_punch(struct inode *inode,
struct folio *folio, loff_t start_byte, loff_t end_byte,
- iomap_punch_t punch)
+ struct iomap *iomap, iomap_punch_t punch)
{
unsigned int first_blk, last_blk, i;
loff_t last_byte;
u8 blkbits = inode->i_blkbits;
struct iomap_folio_state *ifs;
- int ret = 0;
/*
* When we have per-block dirty tracking, there can be
@@ -1064,47 +1063,35 @@ static int iomap_write_delalloc_ifs_punch(struct inode *inode,
*/
ifs = folio->private;
if (!ifs)
- return ret;
+ return;
last_byte = min_t(loff_t, end_byte - 1,
folio_pos(folio) + folio_size(folio) - 1);
first_blk = offset_in_folio(folio, start_byte) >> blkbits;
last_blk = offset_in_folio(folio, last_byte) >> blkbits;
for (i = first_blk; i <= last_blk; i++) {
- if (!ifs_block_is_dirty(folio, ifs, i)) {
- ret = punch(inode, folio_pos(folio) + (i << blkbits),
- 1 << blkbits);
- if (ret)
- return ret;
- }
+ if (!ifs_block_is_dirty(folio, ifs, i))
+ punch(inode, folio_pos(folio) + (i << blkbits),
+ 1 << blkbits, iomap);
}
-
- return ret;
}
-
-static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
+static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
- iomap_punch_t punch)
+ struct iomap *iomap, iomap_punch_t punch)
{
- int ret = 0;
-
if (!folio_test_dirty(folio))
- return ret;
+ return;
/* if dirty, punch up to offset */
if (start_byte > *punch_start_byte) {
- ret = punch(inode, *punch_start_byte,
- start_byte - *punch_start_byte);
- if (ret)
- return ret;
+ punch(inode, *punch_start_byte, start_byte - *punch_start_byte,
+ iomap);
}
/* Punch non-dirty blocks within folio */
- ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
- end_byte, punch);
- if (ret)
- return ret;
+ iomap_write_delalloc_ifs_punch(inode, folio, start_byte, end_byte,
+ iomap, punch);
/*
* Make sure the next punch start is correctly bound to
@@ -1112,8 +1099,6 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
*/
*punch_start_byte = min_t(loff_t, end_byte,
folio_pos(folio) + folio_size(folio));
-
- return ret;
}
/*
@@ -1133,13 +1118,12 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
* This function uses [start_byte, end_byte) intervals (i.e. open ended) to
* simplify range iterations.
*/
-static int iomap_write_delalloc_scan(struct inode *inode,
+static void iomap_write_delalloc_scan(struct inode *inode,
loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
- iomap_punch_t punch)
+ struct iomap *iomap, iomap_punch_t punch)
{
while (start_byte < end_byte) {
struct folio *folio;
- int ret;
/* grab locked page */
folio = filemap_lock_folio(inode->i_mapping,
@@ -1150,20 +1134,14 @@ static int iomap_write_delalloc_scan(struct inode *inode,
continue;
}
- ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte,
- start_byte, end_byte, punch);
- if (ret) {
- folio_unlock(folio);
- folio_put(folio);
- return ret;
- }
+ iomap_write_delalloc_punch(inode, folio, punch_start_byte,
+ start_byte, end_byte, iomap, punch);
/* move offset to start of next folio in range */
start_byte = folio_next_index(folio) << PAGE_SHIFT;
folio_unlock(folio);
folio_put(folio);
}
- return 0;
}
/*
@@ -1199,12 +1177,12 @@ static int iomap_write_delalloc_scan(struct inode *inode,
* require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
* the code to subtle off-by-one bugs....
*/
-static int iomap_write_delalloc_release(struct inode *inode,
- loff_t start_byte, loff_t end_byte, iomap_punch_t punch)
+static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
+ loff_t end_byte, unsigned flags, struct iomap *iomap,
+ iomap_punch_t punch)
{
loff_t punch_start_byte = start_byte;
loff_t scan_end_byte = min(i_size_read(inode), end_byte);
- int error = 0;
/*
* Lock the mapping to avoid races with page faults re-instantiating
@@ -1221,13 +1199,15 @@ static int iomap_write_delalloc_release(struct inode *inode,
/*
* If there is no more data to scan, all that is left is to
* punch out the remaining range.
+ *
+ * Note that mapping_seek_hole_data is only supposed to return
+ * either an offset or -ENXIO, so WARN on any other error as
+ * that would be an API change without updating the callers.
*/
if (start_byte == -ENXIO || start_byte == scan_end_byte)
break;
- if (start_byte < 0) {
- error = start_byte;
+ if (WARN_ON_ONCE(start_byte < 0))
goto out_unlock;
- }
WARN_ON_ONCE(start_byte < punch_start_byte);
WARN_ON_ONCE(start_byte > scan_end_byte);
@@ -1237,28 +1217,31 @@ static int iomap_write_delalloc_release(struct inode *inode,
*/
data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
scan_end_byte, SEEK_HOLE);
- if (data_end < 0) {
- error = data_end;
+ if (WARN_ON_ONCE(data_end < 0))
goto out_unlock;
- }
- WARN_ON_ONCE(data_end <= start_byte);
+
+ /*
+ * If we race with post-direct I/O invalidation of the page cache,
+ * there might be no data left at start_byte.
+ */
+ if (data_end == start_byte)
+ continue;
+
+ WARN_ON_ONCE(data_end < start_byte);
WARN_ON_ONCE(data_end > scan_end_byte);
- error = iomap_write_delalloc_scan(inode, &punch_start_byte,
- start_byte, data_end, punch);
- if (error)
- goto out_unlock;
+ iomap_write_delalloc_scan(inode, &punch_start_byte, start_byte,
+ data_end, iomap, punch);
/* The next data search starts at the end of this one. */
start_byte = data_end;
}
if (punch_start_byte < end_byte)
- error = punch(inode, punch_start_byte,
- end_byte - punch_start_byte);
+ punch(inode, punch_start_byte, end_byte - punch_start_byte,
+ iomap);
out_unlock:
filemap_invalidate_unlock(inode->i_mapping);
- return error;
}
/*
@@ -1291,20 +1274,20 @@ out_unlock:
* ->punch
* internal filesystem allocation lock
*/
-int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
- struct iomap *iomap, loff_t pos, loff_t length,
- ssize_t written, iomap_punch_t punch)
+void iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+ loff_t pos, loff_t length, ssize_t written, unsigned flags,
+ struct iomap *iomap, iomap_punch_t punch)
{
loff_t start_byte;
loff_t end_byte;
unsigned int blocksize = i_blocksize(inode);
if (iomap->type != IOMAP_DELALLOC)
- return 0;
+ return;
/* If we didn't reserve the blocks, we're not allowed to punch them. */
if (!(iomap->flags & IOMAP_F_NEW))
- return 0;
+ return;
/*
* start_byte refers to the first unused block after a short write. If
@@ -1319,26 +1302,35 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
/* Nothing to do if we've written the entire delalloc extent */
if (start_byte >= end_byte)
- return 0;
+ return;
- return iomap_write_delalloc_release(inode, start_byte, end_byte,
- punch);
+ iomap_write_delalloc_release(inode, start_byte, end_byte, flags, iomap,
+ punch);
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
- const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
loff_t written = 0;
- /* don't bother with blocks that are not shared to start with */
+ /* Don't bother with blocks that are not shared to start with. */
if (!(iomap->flags & IOMAP_F_SHARED))
return length;
- /* don't bother with holes or unwritten extents */
- if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+
+ /*
+ * Don't bother with holes or unwritten extents.
+ *
+ * Note that we use srcmap directly instead of iomap_iter_srcmap as
+ * unsharing requires providing a separate source map, and the presence
+ * of one is a good indicator that unsharing is needed, unlike
+ * IOMAP_F_SHARED which can be set for any data that goes into the COW
+ * fork for XFS.
+ */
+ if (iter->srcmap.type == IOMAP_HOLE ||
+ iter->srcmap.type == IOMAP_UNWRITTEN)
return length;
do {
@@ -1393,16 +1385,53 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
-static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+/*
+ * Flush the remaining range of the iter and mark the current mapping stale.
+ * This is used when zero range sees an unwritten mapping that may have had
+ * dirty pagecache over it.
+ */
+static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
+{
+ struct address_space *mapping = i->inode->i_mapping;
+ loff_t end = i->pos + i->len - 1;
+
+ i->iomap.flags |= IOMAP_F_STALE;
+ return filemap_write_and_wait_range(mapping, i->pos, end);
+}
+
+static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
+ bool *range_dirty)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
loff_t written = 0;
- /* already zeroed? we're done. */
- if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ /*
+ * We must zero subranges of unwritten mappings that might be dirty in
+ * pagecache from previous writes. We only know whether the entire range
+ * was clean or not, however, and dirty folios may have been written
+ * back or reclaimed at any point after mapping lookup.
+ *
+ * The easiest way to deal with this is to flush pagecache to trigger
+ * any pending unwritten conversions and then grab the updated extents
+ * from the fs. The flush may change the current mapping, so mark it
+ * stale for the iterator to remap it for the next pass to handle
+ * properly.
+ *
+ * Note that holes are treated the same as unwritten because zero range
+ * is (ab)used for partial folio zeroing in some cases. Hole backed
+ * post-eof ranges can be dirtied via mapped write and the flush
+ * triggers writeback time post-eof zeroing.
+ */
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) {
+ if (*range_dirty) {
+ *range_dirty = false;
+ return iomap_zero_iter_flush_and_stale(iter);
+ }
+ /* range is clean and already zeroed, nothing to do */
return length;
+ }
do {
struct folio *folio;
@@ -1450,9 +1479,27 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
.flags = IOMAP_ZERO,
};
int ret;
+ bool range_dirty;
+
+ /*
+ * Zero range wants to skip pre-zeroed (i.e. unwritten) mappings, but
+ * pagecache must be flushed to ensure stale data from previous
+ * buffered writes is not exposed. A flush is only required for certain
+ * types of mappings, but checking pagecache after mapping lookup is
+ * racy with writeback and reclaim.
+ *
+ * Therefore, check the entire range first and pass along whether any
+ * part of it is dirty. If so and an underlying mapping warrants it,
+ * flush the cache at that point. This trades off the occasional false
+ * positive (and spurious flush, if the dirty data and mapping don't
+ * happen to overlap) for simplicity in handling a relatively uncommon
+ * situation.
+ */
+ range_dirty = filemap_range_needs_writeback(inode->i_mapping,
+ pos, pos + len - 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_zero_iter(&iter, did_zero);
+ iter.processed = iomap_zero_iter(&iter, did_zero, &range_dirty);
return ret;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);
@@ -2007,10 +2054,10 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
}
EXPORT_SYMBOL_GPL(iomap_writepages);
-static int __init iomap_init(void)
+static int __init iomap_buffered_init(void)
{
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
offsetof(struct iomap_ioend, io_bio),
BIOSET_NEED_BVECS);
}
-fs_initcall(iomap_init);
+fs_initcall(iomap_buffered_init);
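
The hunks above rework the delalloc punch path: iomap_file_buffered_write_punch_delalloc() now returns void, takes the iomap_end flags, and hands the struct iomap down to the punch callback instead of propagating an error code back up the stack. Below is a minimal sketch of what a filesystem-side ->iomap_end caller might look like under the new calling convention; the myfs_* names and the myfs_remove_delalloc() helper are illustrative stand-ins and not part of this patch, while the punch prototype is inferred from the new call sites in the diff.

#include <linux/iomap.h>

/*
 * Sketch only: assumes the punch callback prototype implied by the new call
 * sites, i.e. void (*)(struct inode *, loff_t, loff_t, struct iomap *).
 */
static void myfs_punch_delalloc(struct inode *inode, loff_t offset,
		loff_t length, struct iomap *iomap)
{
	/*
	 * Filesystem-specific removal of the delalloc reservation backing
	 * [offset, offset + length); myfs_remove_delalloc() is hypothetical.
	 */
	myfs_remove_delalloc(inode, offset, length);
}

static int myfs_buffered_write_iomap_end(struct inode *inode, loff_t pos,
		loff_t length, ssize_t written, unsigned flags,
		struct iomap *iomap)
{
	/*
	 * Cleanup of a short or failed delalloc write is now best effort:
	 * the helper returns void, so there is no error to propagate. The
	 * IOMAP_DELALLOC and IOMAP_F_NEW checks happen inside the helper.
	 */
	iomap_file_buffered_write_punch_delalloc(inode, pos, length, written,
			flags, iomap, myfs_punch_delalloc);
	return 0;
}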