Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--	fs/btrfs/inode.c	940
1 file changed, 609 insertions(+), 331 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 46f392943f4d..487533c35ddb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
+#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
@@ -51,6 +52,7 @@
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
+#include "subpage.h"
struct btrfs_iget_args {
u64 ino;
@@ -166,22 +168,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
struct page *page;
while (index <= end_index) {
+ /*
+ * For locked page, we will call end_extent_writepage() on it
+ * in run_delalloc_range() for the error handling. That
+ * end_extent_writepage() function will call
+ * btrfs_mark_ordered_io_finished() to clear page Ordered and
+ * run the ordered extent accounting.
+ *
+ * Here we can't just clear the Ordered bit, or
+ * btrfs_mark_ordered_io_finished() would skip the accounting
+ * for the page range, and the ordered extent will never finish.
+ */
+ if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
+ index++;
+ continue;
+ }
page = find_get_page(inode->vfs_inode.i_mapping, index);
index++;
if (!page)
continue;
- ClearPagePrivate2(page);
+
+ /*
+ * Here we just clear all Ordered bits for every page in the
+ * range, then __endio_write_update_ordered() will handle
+ * the ordered extent accounting for the range.
+ */
+ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
+ offset, bytes);
put_page(page);
}
+ /* The locked page covers the full range, nothing needs to be done */
+ if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
+ return;
/*
* In case this page belongs to the delalloc range being instantiated
* then skip it, since the first page of a range is going to be
* properly cleaned up by the caller of run_delalloc_range
*/
if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- offset += PAGE_SIZE;
- bytes -= PAGE_SIZE;
+ bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
+ offset = page_offset(locked_page) + PAGE_SIZE;
}
return __endio_write_update_ordered(inode, offset, bytes, false);
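The offset/bytes adjustment above moves the cleanup range past the locked page, whose ordered accounting is handled separately by end_extent_writepage() in the run_delalloc_range() error path. A standalone sketch of just that arithmetic, with made-up offsets and a 4K page size:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
	/* Hypothetical failed delalloc range and locked page offset */
	uint64_t offset = 0;              /* start of the range being cleaned up */
	uint64_t bytes = 16 * PAGE_SIZE;  /* length of that range */
	uint64_t locked_page_offset = 0;  /* page_offset(locked_page) */

	/* Nothing left to do if the locked page already covers the whole range */
	if (bytes + offset <= locked_page_offset + PAGE_SIZE) {
		printf("locked page covers the full range\n");
		return 0;
	}

	/*
	 * Skip the locked page: its ordered accounting is done by
	 * end_extent_writepage() in the run_delalloc_range() error path.
	 */
	bytes = offset + bytes - locked_page_offset - PAGE_SIZE;
	offset = locked_page_offset + PAGE_SIZE;

	printf("finish ordered IO for [%llu, %llu)\n",
	       (unsigned long long)offset,
	       (unsigned long long)(offset + bytes));
	return 0;
}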
@@ -260,9 +287,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
- kaddr = kmap_atomic(cpage);
+ kaddr = page_address(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap_atomic(kaddr);
i++;
ptr += cur_size;
@@ -464,6 +490,9 @@ static noinline int add_async_extent(struct async_chunk *cow,
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
+ /* Subpage doesn't support compression yet */
+ if (inode->root->fs_info->sectorsize < PAGE_SIZE)
+ return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
@@ -656,7 +685,11 @@ again:
}
}
cont:
- if (start == 0) {
+ /*
+ * Check cow_file_range() for why we don't even try to create inline
+ * extent for subpage case.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
if (ret || total_in < actual_end) {
/* we didn't compress the entire range, try
@@ -946,7 +979,8 @@ retry:
const u64 end = start + async_extent->ram_size - 1;
p->mapping = inode->vfs_inode.i_mapping;
- btrfs_writepage_endio_finish_ordered(p, start, end, 0);
+ btrfs_writepage_endio_finish_ordered(inode, p, start,
+ end, false);
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@ -1053,7 +1087,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
- if (start == 0) {
+ /*
+ * Due to the page size limit, for subpage we can only trigger the
+ * writeback for the dirty sectors of page, that means data writeback
+ * is doing more writeback than what we want.
+ *
+ * This is especially unexpected for some call sites like fallocate,
+ * where we only increase i_size after everything is done.
+ * This means we can trigger inline extent even if we didn't want to.
+ * So here we skip inline extent creation completely.
+ */
+ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, start, end, 0,
BTRFS_COMPRESS_NONE, NULL);
@@ -1064,7 +1108,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(inode, start, end, NULL,
+ extent_clear_unlock_delalloc(inode, start, end,
+ locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
@@ -1072,6 +1117,19 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
+ /*
+ * locked_page is locked by the caller of
+ * writepage_delalloc(), not locked by
+ * __process_pages_contig().
+ *
+ * We can't let __process_pages_contig() unlock it,
+ * as it doesn't have any subpage::writers recorded.
+ *
+ * Here we manually unlock the page, since the caller
+ * can't use page_started to determine if it's an
+ * inline extent or a compressed extent.
+ */
+ unlock_page(locked_page);
goto out;
} else if (ret < 0) {
goto out_unlock;
@@ -1150,15 +1208,16 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- /* we're not doing compressed IO, don't unlock the first
- * page (which the caller expects to stay locked), don't
- * clear any dirty bits and don't set any writeback bits
+ /*
+ * We're not doing compressed IO, don't unlock the first page
+ * (which the caller expects to stay locked), don't clear any
+ * dirty bits and don't set any writeback bits
*
- * Do set the Private2 bit so we know this page was properly
- * setup for writepage
+ * Do set the Ordered (Private2) bit so we know this page was
+ * properly setup for writepage.
*/
page_ops = unlock ? PAGE_UNLOCK : 0;
- page_ops |= PAGE_SET_PRIVATE2;
+ page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
locked_page,
@@ -1248,11 +1307,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /* atomic_sub_return implies a barrier */
- if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M)
- cond_wake_up_nomb(&fs_info->async_submit_wait);
-
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
@@ -1261,6 +1315,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
@@ -1822,7 +1881,7 @@ out_check:
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_PRIVATE2);
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
cur_offset = extent_end;
@@ -1904,6 +1963,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
page_started, nr_written);
}
+ ASSERT(ret <= 0);
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
end - start + 1);
@@ -2193,26 +2253,22 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 logical = bio->bi_iter.bi_sector << 9;
+ u32 bio_len = bio->bi_iter.bi_size;
struct extent_map *em;
- u64 length = 0;
- u64 map_length;
int ret = 0;
struct btrfs_io_geometry geom;
if (bio_flags & EXTENT_BIO_COMPRESSED)
return 0;
- length = bio->bi_iter.bi_size;
- map_length = length;
- em = btrfs_get_chunk_map(fs_info, logical, map_length);
+ em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical,
- map_length, &geom);
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
if (ret < 0)
goto out;
- if (geom.len < length + size)
+ if (geom.len < bio_len + size)
ret = 1;
out:
free_extent_map(em);
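With the simplified geometry lookup above, btrfs_bio_fits_in_stripe() only asks whether the bio plus the incoming page would cross a stripe boundary. A toy model of that test, assuming a hypothetical 64K stripe and ignoring RAID profiles (the real remaining length comes from btrfs_get_io_geometry()):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t stripe_len = 65536;	/* example stripe length */
	uint64_t logical = 61440;		/* where the bio starts */
	uint64_t bio_len = 0;			/* bytes already in the bio */
	uint64_t add = 8192;			/* bytes we would like to append */

	/* Bytes left before the next stripe boundary */
	uint64_t left_in_stripe = stripe_len - (logical % stripe_len);

	if (left_in_stripe < bio_len + add)
		printf("adding %llu bytes would cross the stripe, split the bio\n",
		       (unsigned long long)add);
	else
		printf("bio still fits in the stripe\n");
	return 0;
}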
@@ -2233,29 +2289,117 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
-bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
- unsigned int size)
+/*
+ * Split an extent_map at [start, start + len]
+ *
+ * This function is intended to be used only for extract_ordered_extent().
+ */
+static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
+ u64 pre, u64 post)
{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered;
- u64 len = bio->bi_iter.bi_size + size;
- bool ret = true;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
+ struct extent_map *em;
+ struct extent_map *split_pre = NULL;
+ struct extent_map *split_mid = NULL;
+ struct extent_map *split_post = NULL;
+ int ret = 0;
+ unsigned long flags;
- ASSERT(btrfs_is_zoned(fs_info));
- ASSERT(fs_info->max_zone_append_size > 0);
- ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND);
+ /* Sanity check */
+ if (pre == 0 && post == 0)
+ return 0;
- /* Ordered extent not yet created, so we're good */
- ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
- if (!ordered)
- return ret;
+ split_pre = alloc_extent_map();
+ if (pre)
+ split_mid = alloc_extent_map();
+ if (post)
+ split_post = alloc_extent_map();
+ if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
+ ret = -ENOMEM;
+ goto out;
+ }
- if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len >
- ordered->disk_bytenr + ordered->disk_num_bytes)
- ret = false;
+ ASSERT(pre + post < len);
- btrfs_put_ordered_extent(ordered);
+ lock_extent(&inode->io_tree, start, start + len - 1);
+ write_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, start, len);
+ if (!em) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ ASSERT(em->len == len);
+ ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+ ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(!list_empty(&em->list));
+
+ flags = em->flags;
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ /* First, replace the em with a new extent_map starting from em->start */
+ split_pre->start = em->start;
+ split_pre->len = (pre ? pre : em->len - post);
+ split_pre->orig_start = split_pre->start;
+ split_pre->block_start = em->block_start;
+ split_pre->block_len = split_pre->len;
+ split_pre->orig_block_len = split_pre->block_len;
+ split_pre->ram_bytes = split_pre->len;
+ split_pre->flags = flags;
+ split_pre->compress_type = em->compress_type;
+ split_pre->generation = em->generation;
+
+ replace_extent_mapping(em_tree, em, split_pre, 1);
+
+ /*
+ * Now we only have an extent_map at:
+ * [em->start, em->start + pre] if pre != 0
+ * [em->start, em->start + em->len - post] if pre == 0
+ */
+
+ if (pre) {
+ /* Insert the middle extent_map */
+ split_mid->start = em->start + pre;
+ split_mid->len = em->len - pre - post;
+ split_mid->orig_start = split_mid->start;
+ split_mid->block_start = em->block_start + pre;
+ split_mid->block_len = split_mid->len;
+ split_mid->orig_block_len = split_mid->block_len;
+ split_mid->ram_bytes = split_mid->len;
+ split_mid->flags = flags;
+ split_mid->compress_type = em->compress_type;
+ split_mid->generation = em->generation;
+ add_extent_mapping(em_tree, split_mid, 1);
+ }
+
+ if (post) {
+ split_post->start = em->start + em->len - post;
+ split_post->len = post;
+ split_post->orig_start = split_post->start;
+ split_post->block_start = em->block_start + em->len - post;
+ split_post->block_len = split_post->len;
+ split_post->orig_block_len = split_post->block_len;
+ split_post->ram_bytes = split_post->len;
+ split_post->flags = flags;
+ split_post->compress_type = em->compress_type;
+ split_post->generation = em->generation;
+ add_extent_mapping(em_tree, split_post, 1);
+ }
+
+ /* Once for us */
+ free_extent_map(em);
+ /* Once for the tree */
+ free_extent_map(em);
+
+out_unlock:
+ write_unlock(&em_tree->lock);
+ unlock_extent(&inode->io_tree, start, start + len - 1);
+out:
+ free_extent_map(split_pre);
+ free_extent_map(split_mid);
+ free_extent_map(split_post);
return ret;
}
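split_zoned_em() carves one extent map into up to three pieces around the written part. A minimal sketch of only the interval arithmetic, with plain integers standing in for extent_map structures and arbitrary example sizes:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

/*
 * Print the pieces [start, start + len) is split into, given the sizes of
 * the leading (pre) and trailing (post) parts that have to be separated.
 */
static void split_ranges(uint64_t start, uint64_t len, uint64_t pre, uint64_t post)
{
	assert(pre + post < len);

	if (pre)
		printf("pre:  [%llu, %llu)\n",
		       (unsigned long long)start,
		       (unsigned long long)(start + pre));

	printf("mid:  [%llu, %llu)\n",
	       (unsigned long long)(start + pre),
	       (unsigned long long)(start + len - post));

	if (post)
		printf("post: [%llu, %llu)\n",
		       (unsigned long long)(start + len - post),
		       (unsigned long long)(start + len));
}

int main(void)
{
	/* Hypothetical 1 MiB extent map, the bio covers the middle 768 KiB */
	split_ranges(0, 1048576, 131072, 131072);
	return 0;
}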
@@ -2264,9 +2408,8 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
struct bio *bio, loff_t file_offset)
{
struct btrfs_ordered_extent *ordered;
- struct extent_map *em = NULL, *em_new = NULL;
- struct extent_map_tree *em_tree = &inode->extent_tree;
u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 file_len;
u64 len = bio->bi_iter.bi_size;
u64 end = start + len;
u64 ordered_end;
@@ -2306,41 +2449,16 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
goto out;
}
+ file_len = ordered->num_bytes;
pre = start - ordered->disk_bytenr;
post = ordered_end - end;
ret = btrfs_split_ordered_extent(ordered, pre, post);
if (ret)
goto out;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
- if (!em) {
- read_unlock(&em_tree->lock);
- ret = -EIO;
- goto out;
- }
- read_unlock(&em_tree->lock);
-
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
- /*
- * We cannot reuse em_new here but have to create a new one, as
- * unpin_extent_cache() expects the start of the extent map to be the
- * logical offset of the file, which does not hold true anymore after
- * splitting.
- */
- em_new = create_io_em(inode, em->start + pre, len,
- em->start + pre, em->block_start + pre, len,
- len, len, BTRFS_COMPRESS_NONE,
- BTRFS_ORDERED_REGULAR);
- if (IS_ERR(em_new)) {
- ret = PTR_ERR(em_new);
- goto out;
- }
- free_extent_map(em_new);
+ ret = split_zoned_em(inode, file_offset, file_len, pre, post);
out:
- free_extent_map(em);
btrfs_put_ordered_extent(ordered);
return errno_to_blk_status(ret);
@@ -2601,7 +2719,7 @@ again:
lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
- if (PagePrivate2(page))
+ if (PageOrdered(page))
goto out_reserved;
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
@@ -2670,14 +2788,14 @@ out_page:
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
*/
-int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_writepage_fixup *fixup;
- /* this page is properly in the ordered list */
- if (TestClearPagePrivate2(page))
+ /* This page has ordered extent covering it already */
+ if (PageOrdered(page))
return 0;
/*
@@ -2773,7 +2891,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
/*
* If we dropped an inline extent here, we know the range where it is
* was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
- * number of bytes only for that range contaning the inline extent.
+ * number of bytes only for that range containing the inline extent.
* The remainder of the range will be processed when clearing the
* EXTENT_DELALLOC_BIT bit through the ordered extent completion.
*/
@@ -2892,7 +3010,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (ordered_extent->disk)
+ if (ordered_extent->bdev)
btrfs_rewrite_logical_zoned(ordered_extent);
btrfs_free_io_failure_record(inode, start, end);
@@ -3069,28 +3187,14 @@ static void finish_ordered_fn(struct btrfs_work *work)
btrfs_finish_ordered_io(ordered_extent);
}
-void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
- u64 end, int uptodate)
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
+ u64 end, bool uptodate)
{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered_extent = NULL;
- struct btrfs_workqueue *wq;
-
- trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
-
- ClearPagePrivate2(page);
- if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
- end - start + 1, uptodate))
- return;
-
- if (btrfs_is_free_space_inode(inode))
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
+ trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
- btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(wq, &ordered_extent->work);
+ btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
+ finish_ordered_fn, uptodate);
}
/*
@@ -3152,46 +3256,73 @@ zeroit:
* @bio_offset: offset to the beginning of the bio (in bytes)
* @start: file offset of the range start
* @end: file offset of the range end (inclusive)
+ *
+ * Return a bitmap where bit set means a csum mismatch, and bit not set means
+ * csum match.
*/
-int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end)
+unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
+ struct page *page, u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
+ unsigned int result = 0;
if (PageChecked(page)) {
ClearPageChecked(page);
return 0;
}
- if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ /*
+ * For subpage case, above PageChecked is not safe as it's not subpage
+ * compatible.
+ * But for now only cow fixup and compressed read utilize PageChecked
+ * flag, while in this context we can easily use io_bio->csum to
+ * determine if we really need to do csum verification.
+ *
+ * So for now, just exit if io_bio->csum is NULL, as it means it's
+ * compressed read, and its compressed data csum has already been
+ * verified.
+ */
+ if (io_bio->csum == NULL)
return 0;
- if (!root->fs_info->csum_root)
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
- clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
+ if (!root->fs_info->csum_root)
return 0;
- }
ASSERT(page_offset(page) <= start &&
end <= page_offset(page) + PAGE_SIZE - 1);
for (pg_off = offset_in_page(start);
pg_off < offset_in_page(end);
pg_off += sectorsize, bio_offset += sectorsize) {
+ u64 file_offset = pg_off + page_offset(page);
int ret;
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ test_range_bit(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, 1, NULL)) {
+ /* Skip the range without csum for data reloc inode */
+ clear_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM);
+ continue;
+ }
ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
- if (ret < 0)
- return -EIO;
+ if (ret < 0) {
+ const int nr_bit = (pg_off - offset_in_page(start)) >>
+ root->fs_info->sectorsize_bits;
+
+ result |= (1U << nr_bit);
+ }
}
- return 0;
+ return result;
}
/*
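btrfs_verify_data_csum() now reports failures as a per-sector bitmap rather than a single -EIO. A small userspace sketch of how the bit index maps back to the checked range (sector size, offsets and the failing sector are invented example values):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint32_t sectorsize_bits = 12;		/* 4K sectors */
	const uint64_t start = 8192, end = 32767;	/* checked range, end inclusive */
	const uint64_t bad_sector = 16384;		/* pretend this sector failed */
	unsigned int result = 0;
	uint64_t off;

	/*
	 * Build the bitmap the same way the hunk above does: bit N stands
	 * for the Nth sector counted from @start.
	 */
	for (off = start; off < end; off += 1 << sectorsize_bits) {
		int csum_ok = (off != bad_sector);

		if (!csum_ok)
			result |= 1U << ((off - start) >> sectorsize_bits);
	}

	/* A caller can then repair only the sectors whose bit is set */
	for (off = start; off < end; off += 1 << sectorsize_bits) {
		unsigned int bit = (off - start) >> sectorsize_bits;

		printf("sector at %llu: %s\n", (unsigned long long)off,
		       (result & (1U << bit)) ? "csum mismatch" : "ok");
	}
	return 0;
}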
@@ -3426,7 +3557,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/*
* If we have an inode with links, there are a couple of
- * possibilities. Old kernels (before v3.12) used to create an
+ * possibilities:
+ *
+ * 1. We were halfway through creating fsverity metadata for the
+ * file. In that case, the orphan item represents incomplete
+ * fsverity metadata which must be cleaned up with
+ * btrfs_drop_verity_items and deleting the orphan item.
+ *
+ * 2. Old kernels (before v3.12) used to create an
* orphan item for truncate indicating that there were possibly
* extent items past i_size that needed to be deleted. In v3.12,
* truncate was changed to update i_size in sync with the extent
@@ -3444,8 +3582,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* but either way, we can delete the orphan item.
*/
if (ret == -ENOENT || inode->i_nlink) {
- if (!ret)
+ if (!ret) {
+ ret = btrfs_drop_verity_items(BTRFS_I(inode));
iput(inode);
+ if (ret)
+ goto out;
+ }
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -3634,7 +3776,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
rdev = btrfs_inode_rdev(leaf, inode_item);
BTRFS_I(inode)->index_cnt = (u64)-1;
- BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+ btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
+ &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
cache_index:
/*
@@ -3765,6 +3908,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct inode *inode)
{
struct btrfs_map_token token;
+ u64 flags;
btrfs_init_map_token(&token, leaf);
@@ -3800,7 +3944,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
- btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
+ BTRFS_I(inode)->ro_flags);
+ btrfs_set_token_inode_flags(&token, item, flags);
btrfs_set_token_inode_block_group(&token, item, 0);
}
@@ -4109,7 +4255,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
* This is a placeholder inode for a subvolume we didn't have a
* reference to at the time of the snapshot creation. In the meantime
* we could have renamed the real subvol link into our snapshot, so
- * depending on btrfs_del_root_ref to return -ENOENT here is incorret.
+ * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
* Instead simply lookup the dir_index_item for this entry so we can
* remove it. Otherwise we know we have a ref to the root and we can
* call btrfs_del_root_ref, and it _shouldn't_ fail.
@@ -4464,20 +4610,36 @@ out:
#define NEED_TRUNCATE_BLOCK 1
/*
- * this can truncate away extent items, csum items and directory items.
- * It starts at a high offset and removes keys until it can't find
- * any higher than new_size
+ * Remove inode items from a given root.
+ *
+ * @trans: A transaction handle.
+ * @root: The root from which to remove items.
+ * @inode: The inode whose items we want to remove.
+ * @new_size: The new i_size for the inode. This is only applicable when
+ * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
+ * @min_type: The minimum key type to remove. All keys with a type
+ * greater than this value are removed and all keys with
+ * this type are removed only if their offset is >= @new_size.
+ * @extents_found: Output parameter that will contain the number of file
+ * extent items that were removed or adjusted to the new
+ * inode i_size. The caller is responsible for initializing
+ * the counter. Also, it can be NULL if the caller does not
+ * need this counter.
*
- * csum items that cross the new i_size are truncated to the new size
- * as well.
+ * Remove all keys associated with the inode from the given root that have a key
+ * with a type greater than or equals to @min_type. When @min_type has a value of
+ * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
+ * greater than or equals to @new_size. If a file extent item that starts before
+ * @new_size and ends after it is found, its length is adjusted.
*
- * min_type is the minimum key type to truncate down to. If set to 0, this
- * will kill all the items on this inode, including the INODE_ITEM_KEY.
+ * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
+ * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
*/
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
- u64 new_size, u32 min_type)
+ u64 new_size, u32 min_type,
+ u64 *extents_found)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
@@ -4623,6 +4785,9 @@ search_again:
if (found_type != BTRFS_EXTENT_DATA_KEY)
goto delete;
+ if (extents_found != NULL)
+ (*extents_found)++;
+
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
@@ -4941,7 +5106,7 @@ again:
flush_dcache_page(page);
}
ClearPageChecked(page);
- set_page_dirty(page);
+ btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
@@ -4975,15 +5140,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
int ret;
/*
- * Still need to make sure the inode looks like it's been updated so
- * that any holes get logged if we fsync.
+ * If NO_HOLES is enabled, we don't need to do anything.
+ * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
+ * or btrfs_update_inode() will be called, which guarantee that the next
+ * fsync will know this inode was changed and needs to be logged.
*/
- if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
- inode->last_trans = fs_info->generation;
- inode->last_sub_trans = root->log_transid;
- inode->last_log_commit = root->last_log_commit;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
- }
/*
* 1 - for the one we're dropping
@@ -5229,7 +5392,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(&init_user_ns, dentry, attr);
+ err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
@@ -5240,13 +5403,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
}
if (attr->ia_valid) {
- setattr_copy(&init_user_ns, inode, attr);
+ setattr_copy(mnt_userns, inode, attr);
inode_inc_iversion(inode);
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(&init_user_ns, inode,
- inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
}
return err;
@@ -5409,6 +5571,7 @@ void btrfs_evict_inode(struct inode *inode)
trace_btrfs_inode_evict(inode);
if (!root) {
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
return;
}
@@ -5455,7 +5618,7 @@ void btrfs_evict_inode(struct inode *inode)
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- 0, 0);
+ 0, 0, NULL);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -5491,6 +5654,7 @@ no_delete:
* to retry these periodically in the future.
*/
btrfs_remove_delayed_node(BTRFS_I(inode));
+ fsverity_cleanup_inode(inode);
clear_inode(inode);
}
@@ -6257,6 +6421,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
const char *name, int name_len,
u64 ref_objectid, u64 objectid,
@@ -6366,7 +6531,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (ret != 0)
goto fail_unlock;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode_set_bytes(inode, 0);
inode->i_mtime = current_time(inode);
@@ -6551,9 +6716,9 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
@@ -6615,9 +6780,9 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
- mode, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
@@ -6760,8 +6925,9 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_fail;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
S_IFDIR | mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
@@ -7937,19 +8103,17 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
btrfs_ino(BTRFS_I(inode)),
pgoff);
} else {
- blk_status_t status;
+ int ret;
ASSERT((start - io_bio->logical) < UINT_MAX);
- status = btrfs_submit_read_repair(inode,
- &io_bio->bio,
- start - io_bio->logical,
- bvec.bv_page, pgoff,
- start,
- start + sectorsize - 1,
- io_bio->mirror_num,
- submit_dio_repair_bio);
- if (status)
- err = status;
+ ret = btrfs_repair_one_sector(inode,
+ &io_bio->bio,
+ start - io_bio->logical,
+ bvec.bv_page, pgoff,
+ start, io_bio->mirror_num,
+ submit_dio_repair_bio);
+ if (ret)
+ err = errno_to_blk_status(ret);
}
start += sectorsize;
ASSERT(bio_offset + sectorsize > bio_offset);
@@ -7964,41 +8128,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered = NULL;
- struct btrfs_workqueue *wq;
- u64 ordered_offset = offset;
- u64 ordered_bytes = bytes;
- u64 last_offset;
-
- if (btrfs_is_free_space_inode(inode))
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
-
- while (ordered_offset < offset + bytes) {
- last_offset = ordered_offset;
- if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
- &ordered_offset,
- ordered_bytes,
- uptodate)) {
- btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
- NULL);
- btrfs_queue_work(wq, &ordered->work);
- }
-
- /* No ordered extent found in the range, exit */
- if (ordered_offset == last_offset)
- return;
- /*
- * Our bio might span multiple ordered extents. In this case
- * we keep going until we have accounted the whole dio.
- */
- if (ordered_offset < offset + bytes) {
- ordered_bytes = offset + bytes - ordered_offset;
- ordered = NULL;
- }
- }
+ btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
+ finish_ordered_fn, uptodate);
}
static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
@@ -8116,9 +8247,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
return dip;
}
-static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
+ struct inode *inode = iter->inode;
const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
@@ -8128,13 +8260,13 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
u64 start_sector;
int async_submit = 0;
u64 submit_len;
- int clone_offset = 0;
- int clone_len;
+ u64 clone_offset = 0;
+ u64 clone_len;
u64 logical;
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
- struct btrfs_dio_data *dio_data = iomap->private;
+ struct btrfs_dio_data *dio_data = iter->iomap.private;
struct extent_map *em = NULL;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
@@ -8172,14 +8304,14 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
goto out_err_em;
}
ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
- logical, submit_len, &geom);
+ logical, &geom);
if (ret) {
status = errno_to_blk_status(ret);
goto out_err_em;
}
- ASSERT(geom.len <= INT_MAX);
- clone_len = min_t(int, submit_len, geom.len);
+ clone_len = min(submit_len, geom.len);
+ ASSERT(clone_len <= UINT_MAX);
/*
* This will never fail as it's passing GPF_NOFS and
@@ -8276,15 +8408,14 @@ int btrfs_readpage(struct file *file, struct page *page)
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- unsigned long bio_flags = 0;
- struct bio *bio = NULL;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
int ret;
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
- ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
- if (bio)
- ret = submit_one_bio(bio, 0, bio_flags);
+ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+ if (bio_ctrl.bio)
+ ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
return ret;
}
@@ -8324,11 +8455,47 @@ static void btrfs_readahead(struct readahead_control *rac)
extent_readahead(rac);
}
+/*
+ * For releasepage() and invalidatepage() we have a race window where
+ * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * If we continue to release/invalidate the page, we could cause use-after-free
+ * for subpage spinlock. So this function is to spin and wait for subpage
+ * spinlock.
+ */
+static void wait_subpage_spinlock(struct page *page)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * This may look insane as we just acquire the spinlock and release it,
+ * without doing anything. But we just want to make sure no one is
+ * still holding the subpage spinlock.
+ * And since the page is not dirty nor writeback, and we have page
+ * locked, the only possible way to hold a spinlock is from the endio
+ * function to clear page writeback.
+ *
+ * Here we just acquire the spinlock so that all existing callers
+ * should exit and we're safe to release/invalidate the page.
+ */
+ spin_lock_irq(&subpage->lock);
+ spin_unlock_irq(&subpage->lock);
+}
+
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
int ret = try_release_extent_mapping(page, gfp_flags);
- if (ret == 1)
+
+ if (ret == 1) {
+ wait_subpage_spinlock(page);
clear_page_extent_mapped(page);
+ }
return ret;
}
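wait_subpage_spinlock() is the classic lock-then-unlock drain idiom. A userspace illustration of the same pattern using POSIX spinlocks, purely a sketch and not btrfs code:

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t lock;

/* Stand-in for the endio path that updates per-page state under the lock */
static void *worker(void *arg)
{
	(void)arg;
	pthread_spin_lock(&lock);
	/* ... clear subpage writeback bits ... */
	pthread_spin_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	pthread_create(&t, NULL, worker, NULL);

	/*
	 * Drain: taking and immediately dropping the lock guarantees that
	 * every critical section running at the moment we acquired it has
	 * finished.  It only works as a "wait for all users" barrier when
	 * no new users can appear, which is what the page lock plus the
	 * !dirty/!writeback conditions provide in the kernel code above.
	 */
	pthread_spin_lock(&lock);
	pthread_spin_unlock(&lock);

	pthread_join(t, NULL);
	printf("safe to tear down the structure the lock protects\n");
	return 0;
}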
@@ -8353,9 +8520,9 @@ static int btrfs_migratepage(struct address_space *mapping,
if (page_has_private(page))
attach_page_private(newpage, detach_page_private(page));
- if (PagePrivate2(page)) {
- ClearPagePrivate2(page);
- SetPagePrivate2(newpage);
+ if (PageOrdered(page)) {
+ ClearPageOrdered(page);
+ SetPageOrdered(newpage);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -8370,27 +8537,43 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
- struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
- u64 start;
- u64 end;
+ u64 cur;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
- bool found_ordered = false;
- bool completed_ordered = false;
/*
- * we have the page locked, so new writeback can't start,
- * and the dirty bit won't be cleared while we are here.
+ * We have page locked so no new ordered extent can be created on this
+ * page, nor bio can be submitted for this page.
*
- * Wait for IO on this page so that we can safely clear
- * the PagePrivate2 bit and do ordered accounting
+ * But already submitted bio can still be finished on this page.
+ * Furthermore, endio function won't skip page which has Ordered
+ * (Private2) already cleared, so it's possible for endio and
+ * invalidatepage to do the same ordered extent accounting twice
+ * on one page.
+ *
+ * So here we wait for any submitted bios to finish, so that we won't
+ * do double ordered extent accounting on the same page.
*/
wait_on_page_writeback(page);
+ wait_subpage_spinlock(page);
- if (offset) {
+ /*
+ * For subpage case, we have call sites like
+ * btrfs_punch_hole_lock_range() which passes range not aligned to
+ * sectorsize.
+ * If the range doesn't cover the full page, we don't need to and
+ * shouldn't clear page extent mapped, as page->private can still
+ * record subpage dirty bits for other part of the range.
+ *
+ * For cases that invalidate the full page even though the range
+ * doesn't cover the full page, like invalidating the last page, we're
+ * still safe to wait for the ordered extent to finish.
+ */
+ if (!(offset == 0 && length == PAGE_SIZE)) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
@@ -8398,89 +8581,123 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
if (!inode_evicting)
lock_extent_bits(tree, page_start, page_end, &cached_state);
- start = page_start;
-again:
- ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
- if (ordered) {
- found_ordered = true;
- end = min(page_end,
- ordered->file_offset + ordered->num_bytes - 1);
+ cur = page_start;
+ while (cur < page_end) {
+ struct btrfs_ordered_extent *ordered;
+ bool delete_states;
+ u64 range_end;
+ u32 range_len;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, cur,
+ page_end + 1 - cur);
+ if (!ordered) {
+ range_end = page_end;
+ /*
+ * No ordered extent covering this range, we are safe
+ * to delete all extent states in the range.
+ */
+ delete_states = true;
+ goto next;
+ }
+ if (ordered->file_offset > cur) {
+ /*
+ * There is a range between [cur, oe->file_offset) not
+ * covered by any ordered extent.
+ * We are safe to delete all extent states, and handle
+ * the ordered extent in the next iteration.
+ */
+ range_end = ordered->file_offset - 1;
+ delete_states = true;
+ goto next;
+ }
+
+ range_end = min(ordered->file_offset + ordered->num_bytes - 1,
+ page_end);
+ ASSERT(range_end + 1 - cur < U32_MAX);
+ range_len = range_end + 1 - cur;
+ if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
+ /*
+ * If Ordered (Private2) is cleared, it means endio has
+ * already been executed for the range.
+ * We can't delete the extent states as
+ * btrfs_finish_ordered_io() may still use some of them.
+ */
+ delete_states = false;
+ goto next;
+ }
+ btrfs_page_clear_ordered(fs_info, page, cur, range_len);
+
/*
* IO on this page will never be started, so we need to account
* for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
* here, must leave that up for the ordered extent completion.
+ *
+ * This will also unlock the range for incoming
+ * btrfs_finish_ordered_io().
*/
if (!inode_evicting)
- clear_extent_bit(tree, start, end,
+ clear_extent_bit(tree, cur, range_end,
EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state);
+
+ spin_lock_irq(&inode->ordered_tree.lock);
+ set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+ ordered->truncated_len = min(ordered->truncated_len,
+ cur - ordered->file_offset);
+ spin_unlock_irq(&inode->ordered_tree.lock);
+
+ if (btrfs_dec_test_ordered_pending(inode, &ordered,
+ cur, range_end + 1 - cur)) {
+ btrfs_finish_ordered_io(ordered);
+ /*
+ * The ordered extent has finished, now we're again
+ * safe to delete all extent states of the range.
+ */
+ delete_states = true;
+ } else {
+ /*
+ * btrfs_finish_ordered_io() will get executed by endio
+ * of other pages, thus we can't delete extent states
+ * anymore
+ */
+ delete_states = false;
+ }
+next:
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
/*
- * whoever cleared the private bit is responsible
- * for the finish_ordered_io
+ * Qgroup reserved space handler
+ * Sector(s) here will be either:
+ *
+ * 1) Already written to disk or bio already finished
+ * Then its QGROUP_RESERVED bit in io_tree is already cleared.
+ * Qgroup will be handled by its qgroup_record then.
+ * btrfs_qgroup_free_data() call will do nothing here.
+ *
+ * 2) Not written to disk yet
+ * Then btrfs_qgroup_free_data() call will clear the
+ * QGROUP_RESERVED bit of its io_tree, and free the qgroup
+ * reserved data space.
+ * Since the IO will never happen for this page.
*/
- if (TestClearPagePrivate2(page)) {
- spin_lock_irq(&inode->ordered_tree.lock);
- set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
- ordered->truncated_len = min(ordered->truncated_len,
- start - ordered->file_offset);
- spin_unlock_irq(&inode->ordered_tree.lock);
-
- if (btrfs_dec_test_ordered_pending(inode, &ordered,
- start,
- end - start + 1, 1)) {
- btrfs_finish_ordered_io(ordered);
- completed_ordered = true;
- }
- }
- btrfs_put_ordered_extent(ordered);
+ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
if (!inode_evicting) {
- cached_state = NULL;
- lock_extent_bits(tree, start, end,
- &cached_state);
+ clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_UPTODATE |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
+ delete_states, &cached_state);
}
-
- start = end + 1;
- if (start < page_end)
- goto again;
+ cur = range_end + 1;
}
-
/*
- * Qgroup reserved space handler
- * Page here will be either
- * 1) Already written to disk or ordered extent already submitted
- * Then its QGROUP_RESERVED bit in io_tree is already cleaned.
- * Qgroup will be handled by its qgroup_record then.
- * btrfs_qgroup_free_data() call will do nothing here.
- *
- * 2) Not written to disk yet
- * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED
- * bit of its io_tree, and free the qgroup reserved data space.
- * Since the IO will never happen for this page.
+ * We have iterated through all ordered extents of the page, the page
+ * should not have Ordered (Private2) anymore, or the above iteration
+ * did something wrong.
*/
- btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
- if (!inode_evicting) {
- bool delete = true;
-
- /*
- * If there's an ordered extent for this range and we have not
- * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set
- * in the range for the ordered extent completion. We must also
- * not delete the range, otherwise we would lose that bit (and
- * any other bits set in the range). Make sure EXTENT_UPTODATE
- * is cleared if we don't delete, otherwise it can lead to
- * corruptions if the i_size is extented later.
- */
- if (found_ordered && !completed_ordered)
- delete = false;
- clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_UPTODATE |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
- delete, &cached_state);
-
+ ASSERT(!PageOrdered(page));
+ if (!inode_evicting)
__btrfs_releasepage(page, GFP_NOFS);
- }
-
ClearPageChecked(page);
clear_page_extent_mapped(page);
}
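The rewritten btrfs_invalidatepage() walks the page with a cursor, handling one ordered-extent-bounded sub-range per iteration. A minimal userspace model of that walk, with invented ordered extents and offsets:

#include <stdio.h>
#include <stdint.h>

struct ordered { uint64_t file_offset, num_bytes; };

/*
 * Return the first record that ends at or after @pos, or NULL.  The array
 * is sorted by file_offset, like the ordered extent tree.
 */
static const struct ordered *lookup_first(const struct ordered *oe, int nr, uint64_t pos)
{
	for (int i = 0; i < nr; i++)
		if (oe[i].file_offset + oe[i].num_bytes > pos)
			return &oe[i];
	return NULL;
}

int main(void)
{
	/* A 16K "page" with two ordered extents touching it */
	const uint64_t page_start = 0, page_end = 16383;
	const struct ordered oes[] = {
		{ .file_offset = 4096,  .num_bytes = 4096 },
		{ .file_offset = 12288, .num_bytes = 8192 },
	};
	uint64_t cur = page_start;

	while (cur <= page_end) {
		const struct ordered *oe = lookup_first(oes, 2, cur);
		uint64_t range_end;

		if (!oe || oe->file_offset > page_end) {
			range_end = page_end;		 /* nothing ordered here */
		} else if (oe->file_offset > cur) {
			range_end = oe->file_offset - 1; /* gap before the extent */
		} else {
			range_end = oe->file_offset + oe->num_bytes - 1;
			if (range_end > page_end)
				range_end = page_end;	 /* clamp to the page */
		}
		printf("handle [%llu, %llu] %s\n",
		       (unsigned long long)cur, (unsigned long long)range_end,
		       (oe && oe->file_offset <= cur) ? "(ordered)" : "(plain)");
		cur = range_end + 1;
	}
	return 0;
}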
@@ -8626,8 +8843,8 @@ again:
flush_dcache_page(page);
}
ClearPageChecked(page);
- set_page_dirty(page);
- SetPageUptodate(page);
+ btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
+ btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
@@ -8661,6 +8878,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
+ u64 extents_found = 0;
if (!skip_writeback) {
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
@@ -8718,20 +8936,13 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
min_size, false);
BUG_ON(ret);
- /*
- * So if we truncate and then write and fsync we normally would just
- * write the extents that changed, which is a problem if we need to
- * first truncate that entire inode. So set this flag so we write out
- * all of the extents in the inode to the sync log so we're completely
- * safe.
- */
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
trans->block_rsv = rsv;
while (1) {
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
inode->i_size,
- BTRFS_EXTENT_DATA_KEY);
+ BTRFS_EXTENT_DATA_KEY,
+ &extents_found);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
@@ -8793,6 +9004,22 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
}
out:
btrfs_free_block_rsv(fs_info, rsv);
+ /*
+ * So if we truncate and then write and fsync we normally would just
+ * write the extents that changed, which is a problem if we need to
+ * first truncate that entire inode. So set this flag so we write out
+ * all of the extents in the inode to the sync log so we're completely
+ * safe.
+ *
+ * If no extents were dropped or trimmed we don't need to force the next
+ * fsync to truncate all the inode's items from the log and re-log them
+ * all. This means the truncate operation did not change the file size,
+ * or changed it to a smaller size but there was only an implicit hole
+ * between the old i_size and the new i_size, and there were no prealloc
+ * extents beyond i_size to drop.
+ */
+ if (extents_found > 0)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
return ret;
}
@@ -8802,7 +9029,8 @@ out:
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root)
+ struct btrfs_root *parent_root,
+ struct user_namespace *mnt_userns)
{
struct inode *inode;
int err;
@@ -8813,7 +9041,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
if (err < 0)
return err;
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
+ inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
+ ino, ino,
S_IFDIR | (~current_umask() & S_IRWXUGO),
&index);
if (IS_ERR(inode))
@@ -8857,6 +9086,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
+ ei->ro_flags = 0;
ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
@@ -9038,6 +9268,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
u32 bi_flags = BTRFS_I(inode)->flags;
+ u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
@@ -9050,13 +9281,15 @@ static int btrfs_getattr(struct user_namespace *mnt_userns,
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (bi_flags & BTRFS_INODE_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
+ stat->attributes |= STATX_ATTR_VERITY;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
@@ -9090,8 +9323,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
bool dest_log_pinned = false;
bool need_abort = false;
- /* we only allow rename subvolume link between subvolumes */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ /*
+ * For non-subvolumes allow exchange only within one subvolume, in the
+ * same inode namespace. Two subvolumes (represented as directory) can
+ * be exchanged as they're a logical link and have a fixed inode number.
+ */
+ if (root != dest &&
+ (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino != BTRFS_FIRST_FREE_OBJECTID))
return -EXDEV;
/* close the race window with snapshot create/destroy ioctl */
@@ -9138,8 +9377,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9156,8 +9393,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, root,
old_dentry->d_name.name,
old_dentry->d_name.len,
@@ -9188,6 +9423,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode), 1);
}
+ /*
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
+ *
+ * We pin the logs even if at this precise moment none of the inodes was
+ * logged before. This is because right after we checked for that, some
+ * other task fsyncing some other inode not involved with this rename
+ * operation could log that one of our inodes exists.
+ *
+ * We don't need to pin the logs before the above calls to
+ * btrfs_insert_inode_ref(), since those don't ever need to change a log.
+ */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(root);
+ root_log_pinned = true;
+ }
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(dest);
+ dest_log_pinned = true;
+ }
+
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
@@ -9269,8 +9527,7 @@ out_fail:
if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
+ btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
btrfs_set_log_full_commit(trans);
if (root_log_pinned) {
@@ -9294,6 +9551,7 @@ out_notrans:
static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
+ struct user_namespace *mnt_userns,
struct inode *dir,
struct dentry *dentry)
{
@@ -9306,7 +9564,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- inode = btrfs_new_inode(trans, root, dir,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
dentry->d_name.name,
dentry->d_name.len,
btrfs_ino(BTRFS_I(dir)),
@@ -9343,9 +9601,10 @@ out:
return ret;
}
-static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int btrfs_rename(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
@@ -9440,8 +9699,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9465,6 +9722,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
+ /*
+ * Now pin the log. We do it to ensure that no other task can
+ * sync the log while we are in progress with the rename, as
+ * that could result in an inconsistency in case any of the
+ * inodes that are part of this rename operation were logged
+ * before.
+ *
+ * We pin the log even if at this precise moment none of the
+ * inodes was logged before. This is because right after we
+ * checked for that, some other task fsyncing some other inode
+ * not involved with this rename operation could log that one of
+ * our inodes exists.
+ *
+ * We don't need to pin the logs before the above call to
+ * btrfs_insert_inode_ref(), since that does not need to change
+ * a log.
+ */
+ btrfs_pin_log_trans(root);
+ log_pinned = true;
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
@@ -9518,8 +9794,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (flags & RENAME_WHITEOUT) {
- ret = btrfs_whiteout_for_rename(trans, root, old_dir,
- old_dentry);
+ ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
+ old_dir, old_dentry);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9569,7 +9845,8 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di
return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
new_dentry);
- return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+ new_dentry, flags);
}
struct btrfs_delalloc_work {
@@ -9666,11 +9943,7 @@ static int start_delalloc_inodes(struct btrfs_root *root,
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = sync_inode(inode, wbc);
- if (!ret &&
- test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = sync_inode(inode, wbc);
+ ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
@@ -9805,9 +10078,10 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
goto out_unlock;
- inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
- objectid, S_IFLNK|S_IRWXUGO, &index);
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir,
+ dentry->d_name.name, dentry->d_name.len,
+ btrfs_ino(BTRFS_I(dir)), objectid,
+ S_IFLNK | S_IRWXUGO, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
@@ -10131,7 +10405,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns,
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
return -EACCES;
}
- return generic_permission(&init_user_ns, inode, mask);
+ return generic_permission(mnt_userns, inode, mask);
}
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
@@ -10156,7 +10430,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (ret)
goto out;
- inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+ inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -10199,17 +10473,21 @@ out:
return ret;
}
-void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct inode *inode = tree->private_data;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
+ u32 len;
+ ASSERT(end + 1 - start <= U32_MAX);
+ len = end + 1 - start;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
- set_page_writeback(page);
+
+ btrfs_page_set_writeback(fs_info, page, start, len);
put_page(page);
index++;
}
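btrfs_set_range_writeback() now hands the whole range to the subpage helper for every page it visits, and the helper clamps it internally. A tiny sketch of the page-index arithmetic that drives the loop, assuming 4K pages and example offsets:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* Example range spanning three 4K pages */
	uint64_t start = 20480, end = 32767;
	uint64_t index = start >> PAGE_SHIFT;
	uint64_t end_index = end >> PAGE_SHIFT;
	uint32_t len = (uint32_t)(end + 1 - start);

	while (index <= end_index) {
		printf("page %llu: set writeback, range start %llu, len %u\n",
		       (unsigned long long)index,
		       (unsigned long long)start, len);
		index++;
	}
	return 0;
}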