diff options
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 940 |
1 files changed, 609 insertions, 331 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 46f392943f4d..487533c35ddb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,6 +32,7 @@ #include <linux/sched/mm.h> #include <linux/iomap.h> #include <asm/unaligned.h> +#include <linux/fsverity.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" @@ -51,6 +52,7 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "subpage.h" struct btrfs_iget_args { u64 ino; @@ -166,22 +168,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, struct page *page; while (index <= end_index) { + /* + * For locked page, we will call end_extent_writepage() on it + * in run_delalloc_range() for the error handling. That + * end_extent_writepage() function will call + * btrfs_mark_ordered_io_finished() to clear page Ordered and + * run the ordered extent accounting. + * + * Here we can't just clear the Ordered bit, or + * btrfs_mark_ordered_io_finished() would skip the accounting + * for the page range, and the ordered extent will never finish. + */ + if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { + index++; + continue; + } page = find_get_page(inode->vfs_inode.i_mapping, index); index++; if (!page) continue; - ClearPagePrivate2(page); + + /* + * Here we just clear all Ordered bits for every page in the + * range, then __endio_write_update_ordered() will handle + * the ordered extent accounting for the range. + */ + btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, + offset, bytes); put_page(page); } + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) + return; /* * In case this page belongs to the delalloc range being instantiated * then skip it, since the first page of a range is going to be * properly cleaned up by the caller of run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { - offset += PAGE_SIZE; - bytes -= PAGE_SIZE; + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; } return __endio_write_update_ordered(inode, offset, bytes, false); @@ -260,9 +287,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = kmap_atomic(cpage); + kaddr = page_address(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr); i++; ptr += cur_size; @@ -464,6 +490,9 @@ static noinline int add_async_extent(struct async_chunk *cow, */ static inline bool inode_can_compress(struct btrfs_inode *inode) { + /* Subpage doesn't support compression yet */ + if (inode->root->fs_info->sectorsize < PAGE_SIZE) + return false; if (inode->flags & BTRFS_INODE_NODATACOW || inode->flags & BTRFS_INODE_NODATASUM) return false; @@ -656,7 +685,11 @@ again: } } cont: - if (start == 0) { + /* + * Check cow_file_range() for why we don't even try to create inline + * extent for subpage case. + */ + if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { /* lets try to make an inline extent */ if (ret || total_in < actual_end) { /* we didn't compress the entire range, try @@ -946,7 +979,8 @@ retry: const u64 end = start + async_extent->ram_size - 1; p->mapping = inode->vfs_inode.i_mapping; - btrfs_writepage_endio_finish_ordered(p, start, end, 0); + btrfs_writepage_endio_finish_ordered(inode, p, start, + end, false); p->mapping = NULL; extent_clear_unlock_delalloc(inode, start, end, NULL, 0, @@ -1053,7 +1087,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode, inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - if (start == 0) { + /* + * Due to the page size limit, for subpage we can only trigger the + * writeback for the dirty sectors of page, that means data writeback + * is doing more writeback than what we want. + * + * This is especially unexpected for some call sites like fallocate, + * where we only increase i_size after everything is done. + * This means we can trigger inline extent even if we didn't want to. + * So here we skip inline extent creation completely. + */ + if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { /* lets try to make an inline extent */ ret = cow_file_range_inline(inode, start, end, 0, BTRFS_COMPRESS_NONE, NULL); @@ -1064,7 +1108,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, NULL, + extent_clear_unlock_delalloc(inode, start, end, + locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1072,6 +1117,19 @@ static noinline int cow_file_range(struct btrfs_inode *inode, *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; + /* + * locked_page is locked by the caller of + * writepage_delalloc(), not locked by + * __process_pages_contig(). + * + * We can't let __process_pages_contig() to unlock it, + * as it doesn't have any subpage::writers recorded. + * + * Here we manually unlock the page, since the caller + * can't use page_started to determine if it's an + * inline extent or a compressed extent. + */ + unlock_page(locked_page); goto out; } else if (ret < 0) { goto out_unlock; @@ -1150,15 +1208,16 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* we're not doing compressed IO, don't unlock the first - * page (which the caller expects to stay locked), don't - * clear any dirty bits and don't set any writeback bits + /* + * We're not doing compressed IO, don't unlock the first page + * (which the caller expects to stay locked), don't clear any + * dirty bits and don't set any writeback bits * - * Do set the Private2 bit so we know this page was properly - * setup for writepage + * Do set the Ordered (Private2) bit so we know this page was + * properly setup for writepage. */ page_ops = unlock ? PAGE_UNLOCK : 0; - page_ops |= PAGE_SET_PRIVATE2; + page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, locked_page, @@ -1248,11 +1307,6 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; - /* atomic_sub_return implies a barrier */ - if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < - 5 * SZ_1M) - cond_wake_up_nomb(&fs_info->async_submit_wait); - /* * ->inode could be NULL if async_chunk_start has failed to compress, * in which case we don't have anything to submit, yet we need to @@ -1261,6 +1315,11 @@ static noinline void async_cow_submit(struct btrfs_work *work) */ if (async_chunk->inode) submit_compressed_extents(async_chunk); + + /* atomic_sub_return implies a barrier */ + if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < + 5 * SZ_1M) + cond_wake_up_nomb(&fs_info->async_submit_wait); } static noinline void async_cow_free(struct btrfs_work *work) @@ -1822,7 +1881,7 @@ out_check: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_PRIVATE2); + PAGE_UNLOCK | PAGE_SET_ORDERED); cur_offset = extent_end; @@ -1904,6 +1963,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page ret = cow_file_range_async(inode, wbc, locked_page, start, end, page_started, nr_written); } + ASSERT(ret <= 0); if (ret) btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); @@ -2193,26 +2253,22 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 logical = bio->bi_iter.bi_sector << 9; + u32 bio_len = bio->bi_iter.bi_size; struct extent_map *em; - u64 length = 0; - u64 map_length; int ret = 0; struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_iter.bi_size; - map_length = length; - em = btrfs_get_chunk_map(fs_info, logical, map_length); + em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); if (IS_ERR(em)) return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, - map_length, &geom); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); if (ret < 0) goto out; - if (geom.len < length + size) + if (geom.len < bio_len + size) ret = 1; out: free_extent_map(em); @@ -2233,29 +2289,117 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } -bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, - unsigned int size) +/* + * Split an extent_map at [start, start + len] + * + * This function is intended to be used only for extract_ordered_extent(). + */ +static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, + u64 pre, u64 post) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered; - u64 len = bio->bi_iter.bi_size + size; - bool ret = true; + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + struct extent_map *split_pre = NULL; + struct extent_map *split_mid = NULL; + struct extent_map *split_post = NULL; + int ret = 0; + unsigned long flags; - ASSERT(btrfs_is_zoned(fs_info)); - ASSERT(fs_info->max_zone_append_size > 0); - ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND); + /* Sanity check */ + if (pre == 0 && post == 0) + return 0; - /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); - if (!ordered) - return ret; + split_pre = alloc_extent_map(); + if (pre) + split_mid = alloc_extent_map(); + if (post) + split_post = alloc_extent_map(); + if (!split_pre || (pre && !split_mid) || (post && !split_post)) { + ret = -ENOMEM; + goto out; + } - if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len > - ordered->disk_bytenr + ordered->disk_num_bytes) - ret = false; + ASSERT(pre + post < len); - btrfs_put_ordered_extent(ordered); + lock_extent(&inode->io_tree, start, start + len - 1); + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + ret = -EIO; + goto out_unlock; + } + + ASSERT(em->len == len); + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); + ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); + ASSERT(!list_empty(&em->list)); + + flags = em->flags; + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* First, replace the em with a new extent_map starting from * em->start */ + split_pre->start = em->start; + split_pre->len = (pre ? pre : em->len - post); + split_pre->orig_start = split_pre->start; + split_pre->block_start = em->block_start; + split_pre->block_len = split_pre->len; + split_pre->orig_block_len = split_pre->block_len; + split_pre->ram_bytes = split_pre->len; + split_pre->flags = flags; + split_pre->compress_type = em->compress_type; + split_pre->generation = em->generation; + + replace_extent_mapping(em_tree, em, split_pre, 1); + + /* + * Now we only have an extent_map at: + * [em->start, em->start + pre] if pre != 0 + * [em->start, em->start + em->len - post] if pre == 0 + */ + + if (pre) { + /* Insert the middle extent_map */ + split_mid->start = em->start + pre; + split_mid->len = em->len - pre - post; + split_mid->orig_start = split_mid->start; + split_mid->block_start = em->block_start + pre; + split_mid->block_len = split_mid->len; + split_mid->orig_block_len = split_mid->block_len; + split_mid->ram_bytes = split_mid->len; + split_mid->flags = flags; + split_mid->compress_type = em->compress_type; + split_mid->generation = em->generation; + add_extent_mapping(em_tree, split_mid, 1); + } + + if (post) { + split_post->start = em->start + em->len - post; + split_post->len = post; + split_post->orig_start = split_post->start; + split_post->block_start = em->block_start + em->len - post; + split_post->block_len = split_post->len; + split_post->orig_block_len = split_post->block_len; + split_post->ram_bytes = split_post->len; + split_post->flags = flags; + split_post->compress_type = em->compress_type; + split_post->generation = em->generation; + add_extent_mapping(em_tree, split_post, 1); + } + + /* Once for us */ + free_extent_map(em); + /* Once for the tree */ + free_extent_map(em); + +out_unlock: + write_unlock(&em_tree->lock); + unlock_extent(&inode->io_tree, start, start + len - 1); +out: + free_extent_map(split_pre); + free_extent_map(split_mid); + free_extent_map(split_post); return ret; } @@ -2264,9 +2408,8 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, struct bio *bio, loff_t file_offset) { struct btrfs_ordered_extent *ordered; - struct extent_map *em = NULL, *em_new = NULL; - struct extent_map_tree *em_tree = &inode->extent_tree; u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 file_len; u64 len = bio->bi_iter.bi_size; u64 end = start + len; u64 ordered_end; @@ -2306,41 +2449,16 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, goto out; } + file_len = ordered->num_bytes; pre = start - ordered->disk_bytenr; post = ordered_end - end; ret = btrfs_split_ordered_extent(ordered, pre, post); if (ret) goto out; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, ordered->file_offset, len); - if (!em) { - read_unlock(&em_tree->lock); - ret = -EIO; - goto out; - } - read_unlock(&em_tree->lock); - - ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); - /* - * We cannot reuse em_new here but have to create a new one, as - * unpin_extent_cache() expects the start of the extent map to be the - * logical offset of the file, which does not hold true anymore after - * splitting. - */ - em_new = create_io_em(inode, em->start + pre, len, - em->start + pre, em->block_start + pre, len, - len, len, BTRFS_COMPRESS_NONE, - BTRFS_ORDERED_REGULAR); - if (IS_ERR(em_new)) { - ret = PTR_ERR(em_new); - goto out; - } - free_extent_map(em_new); + ret = split_zoned_em(inode, file_offset, file_len, pre, post); out: - free_extent_map(em); btrfs_put_ordered_extent(ordered); return errno_to_blk_status(ret); @@ -2601,7 +2719,7 @@ again: lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PagePrivate2(page)) + if (PageOrdered(page)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); @@ -2670,14 +2788,14 @@ out_page: * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. */ -int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) +int btrfs_writepage_cow_fixup(struct page *page) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_writepage_fixup *fixup; - /* this page is properly in the ordered list */ - if (TestClearPagePrivate2(page)) + /* This page has ordered extent covering it already */ + if (PageOrdered(page)) return 0; /* @@ -2773,7 +2891,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, /* * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the - * number of bytes only for that range contaning the inline extent. + * number of bytes only for that range containing the inline extent. * The remaining of the range will be processed when clearning the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. */ @@ -2892,7 +3010,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (ordered_extent->disk) + if (ordered_extent->bdev) btrfs_rewrite_logical_zoned(ordered_extent); btrfs_free_io_failure_record(inode, start, end); @@ -3069,28 +3187,14 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } -void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, - u64 end, int uptodate) +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, + u64 end, bool uptodate) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workqueue *wq; - - trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); - - ClearPagePrivate2(page); - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1, uptodate)) - return; - - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; + trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(wq, &ordered_extent->work); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, + finish_ordered_fn, uptodate); } /* @@ -3152,46 +3256,73 @@ zeroit: * @bio_offset: offset to the beginning of the bio (in bytes) * @start: file offset of the range start * @end: file offset of the range end (inclusive) + * + * Return a bitmap where bit set means a csum mismatch, and bit not set means + * csum match. */ -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end) +unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, + struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; + unsigned int result = 0; if (PageChecked(page)) { ClearPageChecked(page); return 0; } - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + /* + * For subpage case, above PageChecked is not safe as it's not subpage + * compatible. + * But for now only cow fixup and compressed read utilize PageChecked + * flag, while in this context we can easily use io_bio->csum to + * determine if we really need to do csum verification. + * + * So for now, just exit if io_bio->csum is NULL, as it means it's + * compressed read, and its compressed data csum has already been + * verified. + */ + if (io_bio->csum == NULL) return 0; - if (!root->fs_info->csum_root) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && - test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { - clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); + if (!root->fs_info->csum_root) return 0; - } ASSERT(page_offset(page) <= start && end <= page_offset(page) + PAGE_SIZE - 1); for (pg_off = offset_in_page(start); pg_off < offset_in_page(end); pg_off += sectorsize, bio_offset += sectorsize) { + u64 file_offset = pg_off + page_offset(page); int ret; + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM, 1, NULL)) { + /* Skip the range without csum for data reloc inode */ + clear_extent_bits(io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM); + continue; + } ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, page_offset(page) + pg_off); - if (ret < 0) - return -EIO; + if (ret < 0) { + const int nr_bit = (pg_off - offset_in_page(start)) >> + root->fs_info->sectorsize_bits; + + result |= (1U << nr_bit); + } } - return 0; + return result; } /* @@ -3426,7 +3557,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* * If we have an inode with links, there are a couple of - * possibilities. Old kernels (before v3.12) used to create an + * possibilities: + * + * 1. We were halfway through creating fsverity metadata for the + * file. In that case, the orphan item represents incomplete + * fsverity metadata which must be cleaned up with + * btrfs_drop_verity_items and deleting the orphan item. + + * 2. Old kernels (before v3.12) used to create an * orphan item for truncate indicating that there were possibly * extent items past i_size that needed to be deleted. In v3.12, * truncate was changed to update i_size in sync with the extent @@ -3444,8 +3582,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * but either way, we can delete the orphan item. */ if (ret == -ENOENT || inode->i_nlink) { - if (!ret) + if (!ret) { + ret = btrfs_drop_verity_items(BTRFS_I(inode)); iput(inode); + if (ret) + goto out; + } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -3634,7 +3776,8 @@ static int btrfs_read_locked_inode(struct inode *inode, rdev = btrfs_inode_rdev(leaf, inode_item); BTRFS_I(inode)->index_cnt = (u64)-1; - BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), + &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); cache_index: /* @@ -3765,6 +3908,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_map_token token; + u64 flags; btrfs_init_map_token(&token, leaf); @@ -3800,7 +3944,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); - btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_token_inode_flags(&token, item, flags); btrfs_set_token_inode_block_group(&token, item, 0); } @@ -4109,7 +4255,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * This is a placeholder inode for a subvolume we didn't have a * reference to at the time of the snapshot creation. In the meantime * we could have renamed the real subvol link into our snapshot, so - * depending on btrfs_del_root_ref to return -ENOENT here is incorret. + * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. * Instead simply lookup the dir_index_item for this entry so we can * remove it. Otherwise we know we have a ref to the root and we can * call btrfs_del_root_ref, and it _shouldn't_ fail. @@ -4464,20 +4610,36 @@ out: #define NEED_TRUNCATE_BLOCK 1 /* - * this can truncate away extent items, csum items and directory items. - * It starts at a high offset and removes keys until it can't find - * any higher than new_size + * Remove inode items from a given root. + * + * @trans: A transaction handle. + * @root: The root from which to remove items. + * @inode: The inode whose items we want to remove. + * @new_size: The new i_size for the inode. This is only applicable when + * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise. + * @min_type: The minimum key type to remove. All keys with a type + * greater than this value are removed and all keys with + * this type are removed only if their offset is >= @new_size. + * @extents_found: Output parameter that will contain the number of file + * extent items that were removed or adjusted to the new + * inode i_size. The caller is responsible for initializing + * the counter. Also, it can be NULL if the caller does not + * need this counter. * - * csum items that cross the new i_size are truncated to the new size - * as well. + * Remove all keys associated with the inode from the given root that have a key + * with a type greater than or equals to @min_type. When @min_type has a value of + * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value + * greater than or equals to @new_size. If a file extent item that starts before + * @new_size and ends after it is found, its length is adjusted. * - * min_type is the minimum key type to truncate down to. If set to 0, this - * will kill all the items on this inode, including the INODE_ITEM_KEY. + * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is + * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. */ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, - u64 new_size, u32 min_type) + u64 new_size, u32 min_type, + u64 *extents_found) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; @@ -4623,6 +4785,9 @@ search_again: if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; + if (extents_found != NULL) + (*extents_found)++; + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; @@ -4941,7 +5106,7 @@ again: flush_dcache_page(page); } ClearPageChecked(page); - set_page_dirty(page); + btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) @@ -4975,15 +5140,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, int ret; /* - * Still need to make sure the inode looks like it's been updated so - * that any holes get logged if we fsync. + * If NO_HOLES is enabled, we don't need to do anything. + * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() + * or btrfs_update_inode() will be called, which guarantee that the next + * fsync will know this inode was changed and needs to be logged. */ - if (btrfs_fs_incompat(fs_info, NO_HOLES)) { - inode->last_trans = fs_info->generation; - inode->last_sub_trans = root->log_transid; - inode->last_log_commit = root->last_log_commit; + if (btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; - } /* * 1 - for the one we're dropping @@ -5229,7 +5392,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(mnt_userns, dentry, attr); if (err) return err; @@ -5240,13 +5403,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr } if (attr->ia_valid) { - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(mnt_userns, inode, attr); inode_inc_iversion(inode); err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(&init_user_ns, inode, - inode->i_mode); + err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); } return err; @@ -5409,6 +5571,7 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); if (!root) { + fsverity_cleanup_inode(inode); clear_inode(inode); return; } @@ -5455,7 +5618,7 @@ void btrfs_evict_inode(struct inode *inode) trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, 0); + 0, 0, NULL); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -5491,6 +5654,7 @@ no_delete: * to retry these periodically in the future. */ btrfs_remove_delayed_node(BTRFS_I(inode)); + fsverity_cleanup_inode(inode); clear_inode(inode); } @@ -6257,6 +6421,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct user_namespace *mnt_userns, struct inode *dir, const char *name, int name_len, u64 ref_objectid, u64 objectid, @@ -6366,7 +6531,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (ret != 0) goto fail_unlock; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(mnt_userns, inode, dir, mode); inode_set_bytes(inode, 0); inode->i_mtime = current_time(inode); @@ -6551,9 +6716,9 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_unlock; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, - mode, &index); + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; @@ -6615,9 +6780,9 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_unlock; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, - mode, &index); + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; @@ -6760,8 +6925,9 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_fail; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + btrfs_ino(BTRFS_I(dir)), objectid, S_IFDIR | mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -7937,19 +8103,17 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, btrfs_ino(BTRFS_I(inode)), pgoff); } else { - blk_status_t status; + int ret; ASSERT((start - io_bio->logical) < UINT_MAX); - status = btrfs_submit_read_repair(inode, - &io_bio->bio, - start - io_bio->logical, - bvec.bv_page, pgoff, - start, - start + sectorsize - 1, - io_bio->mirror_num, - submit_dio_repair_bio); - if (status) - err = status; + ret = btrfs_repair_one_sector(inode, + &io_bio->bio, + start - io_bio->logical, + bvec.bv_page, pgoff, + start, io_bio->mirror_num, + submit_dio_repair_bio); + if (ret) + err = errno_to_blk_status(ret); } start += sectorsize; ASSERT(bio_offset + sectorsize > bio_offset); @@ -7964,41 +8128,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered = NULL; - struct btrfs_workqueue *wq; - u64 ordered_offset = offset; - u64 ordered_bytes = bytes; - u64 last_offset; - - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - while (ordered_offset < offset + bytes) { - last_offset = ordered_offset; - if (btrfs_dec_test_first_ordered_pending(inode, &ordered, - &ordered_offset, - ordered_bytes, - uptodate)) { - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, - NULL); - btrfs_queue_work(wq, &ordered->work); - } - - /* No ordered extent found in the range, exit */ - if (ordered_offset == last_offset) - return; - /* - * Our bio might span multiple ordered extents. In this case - * we keep going until we have accounted the whole dio. - */ - if (ordered_offset < offset + bytes) { - ordered_bytes = offset + bytes - ordered_offset; - ordered = NULL; - } - } + btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, + finish_ordered_fn, uptodate); } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, @@ -8116,9 +8247,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, return dip; } -static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, +static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { + struct inode *inode = iter->inode; const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & @@ -8128,13 +8260,13 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, u64 start_sector; int async_submit = 0; u64 submit_len; - int clone_offset = 0; - int clone_len; + u64 clone_offset = 0; + u64 clone_len; u64 logical; int ret; blk_status_t status; struct btrfs_io_geometry geom; - struct btrfs_dio_data *dio_data = iomap->private; + struct btrfs_dio_data *dio_data = iter->iomap.private; struct extent_map *em = NULL; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); @@ -8172,14 +8304,14 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, goto out_err_em; } ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), - logical, submit_len, &geom); + logical, &geom); if (ret) { status = errno_to_blk_status(ret); goto out_err_em; } - ASSERT(geom.len <= INT_MAX); - clone_len = min_t(int, submit_len, geom.len); + clone_len = min(submit_len, geom.len); + ASSERT(clone_len <= UINT_MAX); /* * This will never fail as it's passing GPF_NOFS and @@ -8276,15 +8408,14 @@ int btrfs_readpage(struct file *file, struct page *page) struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - unsigned long bio_flags = 0; - struct bio *bio = NULL; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); - if (bio) - ret = submit_one_bio(bio, 0, bio_flags); + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + if (bio_ctrl.bio) + ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); return ret; } @@ -8324,11 +8455,47 @@ static void btrfs_readahead(struct readahead_control *rac) extent_readahead(rac); } +/* + * For releasepage() and invalidatepage() we have a race window where + * end_page_writeback() is called but the subpage spinlock is not yet released. + * If we continue to release/invalidate the page, we could cause use-after-free + * for subpage spinlock. So this function is to spin and wait for subpage + * spinlock. + */ +static void wait_subpage_spinlock(struct page *page) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_subpage *subpage; + + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->private); + subpage = (struct btrfs_subpage *)page->private; + + /* + * This may look insane as we just acquire the spinlock and release it, + * without doing anything. But we just want to make sure no one is + * still holding the subpage spinlock. + * And since the page is not dirty nor writeback, and we have page + * locked, the only possible way to hold a spinlock is from the endio + * function to clear page writeback. + * + * Here we just acquire the spinlock so that all existing callers + * should exit and we're safe to release/invalidate the page. + */ + spin_lock_irq(&subpage->lock); + spin_unlock_irq(&subpage->lock); +} + static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); - if (ret == 1) + + if (ret == 1) { + wait_subpage_spinlock(page); clear_page_extent_mapped(page); + } return ret; } @@ -8353,9 +8520,9 @@ static int btrfs_migratepage(struct address_space *mapping, if (page_has_private(page)) attach_page_private(newpage, detach_page_private(page)); - if (PagePrivate2(page)) { - ClearPagePrivate2(page); - SetPagePrivate2(newpage); + if (PageOrdered(page)) { + ClearPageOrdered(page); + SetPageOrdered(newpage); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -8370,27 +8537,43 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; - struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; - u64 start; - u64 end; + u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; - bool found_ordered = false; - bool completed_ordered = false; /* - * we have the page locked, so new writeback can't start, - * and the dirty bit won't be cleared while we are here. + * We have page locked so no new ordered extent can be created on this + * page, nor bio can be submitted for this page. * - * Wait for IO on this page so that we can safely clear - * the PagePrivate2 bit and do ordered accounting + * But already submitted bio can still be finished on this page. + * Furthermore, endio function won't skip page which has Ordered + * (Private2) already cleared, so it's possible for endio and + * invalidatepage to do the same ordered extent accounting twice + * on one page. + * + * So here we wait for any submitted bios to finish, so that we won't + * do double ordered extent accounting on the same page. */ wait_on_page_writeback(page); + wait_subpage_spinlock(page); - if (offset) { + /* + * For subpage case, we have call sites like + * btrfs_punch_hole_lock_range() which passes range not aligned to + * sectorsize. + * If the range doesn't cover the full page, we don't need to and + * shouldn't clear page extent mapped, as page->private can still + * record subpage dirty bits for other part of the range. + * + * For cases that can invalidate the full even the range doesn't + * cover the full page, like invalidating the last page, we're + * still safe to wait for ordered extent to finish. + */ + if (!(offset == 0 && length == PAGE_SIZE)) { btrfs_releasepage(page, GFP_NOFS); return; } @@ -8398,89 +8581,123 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); - start = page_start; -again: - ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); - if (ordered) { - found_ordered = true; - end = min(page_end, - ordered->file_offset + ordered->num_bytes - 1); + cur = page_start; + while (cur < page_end) { + struct btrfs_ordered_extent *ordered; + bool delete_states; + u64 range_end; + u32 range_len; + + ordered = btrfs_lookup_first_ordered_range(inode, cur, + page_end + 1 - cur); + if (!ordered) { + range_end = page_end; + /* + * No ordered extent covering this range, we are safe + * to delete all extent states in the range. + */ + delete_states = true; + goto next; + } + if (ordered->file_offset > cur) { + /* + * There is a range between [cur, oe->file_offset) not + * covered by any ordered extent. + * We are safe to delete all extent states, and handle + * the ordered extent in the next iteration. + */ + range_end = ordered->file_offset - 1; + delete_states = true; + goto next; + } + + range_end = min(ordered->file_offset + ordered->num_bytes - 1, + page_end); + ASSERT(range_end + 1 - cur < U32_MAX); + range_len = range_end + 1 - cur; + if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { + /* + * If Ordered (Private2) is cleared, it means endio has + * already been executed for the range. + * We can't delete the extent states as + * btrfs_finish_ordered_io() may still use some of them. + */ + delete_states = false; + goto next; + } + btrfs_page_clear_ordered(fs_info, page, cur, range_len); + /* * IO on this page will never be started, so we need to account * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW * here, must leave that up for the ordered extent completion. + * + * This will also unlock the range for incoming + * btrfs_finish_ordered_io(). */ if (!inode_evicting) - clear_extent_bit(tree, start, end, + clear_extent_bit(tree, cur, range_end, EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); + + spin_lock_irq(&inode->ordered_tree.lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + ordered->truncated_len = min(ordered->truncated_len, + cur - ordered->file_offset); + spin_unlock_irq(&inode->ordered_tree.lock); + + if (btrfs_dec_test_ordered_pending(inode, &ordered, + cur, range_end + 1 - cur)) { + btrfs_finish_ordered_io(ordered); + /* + * The ordered extent has finished, now we're again + * safe to delete all extent states of the range. + */ + delete_states = true; + } else { + /* + * btrfs_finish_ordered_io() will get executed by endio + * of other pages, thus we can't delete extent states + * anymore + */ + delete_states = false; + } +next: + if (ordered) + btrfs_put_ordered_extent(ordered); /* - * whoever cleared the private bit is responsible - * for the finish_ordered_io + * Qgroup reserved space handler + * Sector(s) here will be either: + * + * 1) Already written to disk or bio already finished + * Then its QGROUP_RESERVED bit in io_tree is already cleared. + * Qgroup will be handled by its qgroup_record then. + * btrfs_qgroup_free_data() call will do nothing here. + * + * 2) Not written to disk yet + * Then btrfs_qgroup_free_data() call will clear the + * QGROUP_RESERVED bit of its io_tree, and free the qgroup + * reserved data space. + * Since the IO will never happen for this page. */ - if (TestClearPagePrivate2(page)) { - spin_lock_irq(&inode->ordered_tree.lock); - set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); - ordered->truncated_len = min(ordered->truncated_len, - start - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree.lock); - - if (btrfs_dec_test_ordered_pending(inode, &ordered, - start, - end - start + 1, 1)) { - btrfs_finish_ordered_io(ordered); - completed_ordered = true; - } - } - btrfs_put_ordered_extent(ordered); + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); if (!inode_evicting) { - cached_state = NULL; - lock_extent_bits(tree, start, end, - &cached_state); + clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_UPTODATE | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, + delete_states, &cached_state); } - - start = end + 1; - if (start < page_end) - goto again; + cur = range_end + 1; } - /* - * Qgroup reserved space handler - * Page here will be either - * 1) Already written to disk or ordered extent already submitted - * Then its QGROUP_RESERVED bit in io_tree is already cleaned. - * Qgroup will be handled by its qgroup_record then. - * btrfs_qgroup_free_data() call will do nothing here. - * - * 2) Not written to disk yet - * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED - * bit of its io_tree, and free the qgroup reserved data space. - * Since the IO will never happen for this page. + * We have iterated through all ordered extents of the page, the page + * should not have Ordered (Private2) anymore, or the above iteration + * did something wrong. */ - btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); - if (!inode_evicting) { - bool delete = true; - - /* - * If there's an ordered extent for this range and we have not - * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set - * in the range for the ordered extent completion. We must also - * not delete the range, otherwise we would lose that bit (and - * any other bits set in the range). Make sure EXTENT_UPTODATE - * is cleared if we don't delete, otherwise it can lead to - * corruptions if the i_size is extented later. - */ - if (found_ordered && !completed_ordered) - delete = false; - clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, - delete, &cached_state); - + ASSERT(!PageOrdered(page)); + if (!inode_evicting) __btrfs_releasepage(page, GFP_NOFS); - } - ClearPageChecked(page); clear_page_extent_mapped(page); } @@ -8626,8 +8843,8 @@ again: flush_dcache_page(page); } ClearPageChecked(page); - set_page_dirty(page); - SetPageUptodate(page); + btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); + btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); @@ -8661,6 +8878,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + u64 extents_found = 0; if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -8718,20 +8936,13 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) min_size, false); BUG_ON(ret); - /* - * So if we truncate and then write and fsync we normally would just - * write the extents that changed, which is a problem if we need to - * first truncate that entire inode. So set this flag so we write out - * all of the extents in the inode to the sync log so we're completely - * safe. - */ - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); trans->block_rsv = rsv; while (1) { ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), inode->i_size, - BTRFS_EXTENT_DATA_KEY); + BTRFS_EXTENT_DATA_KEY, + &extents_found); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; @@ -8793,6 +9004,22 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) } out: btrfs_free_block_rsv(fs_info, rsv); + /* + * So if we truncate and then write and fsync we normally would just + * write the extents that changed, which is a problem if we need to + * first truncate that entire inode. So set this flag so we write out + * all of the extents in the inode to the sync log so we're completely + * safe. + * + * If no extents were dropped or trimmed we don't need to force the next + * fsync to truncate all the inode's items from the log and re-log them + * all. This means the truncate operation did not change the file size, + * or changed it to a smaller size but there was only an implicit hole + * between the old i_size and the new i_size, and there were no prealloc + * extents beyond i_size to drop. + */ + if (extents_found > 0) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); return ret; } @@ -8802,7 +9029,8 @@ out: */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root) + struct btrfs_root *parent_root, + struct user_namespace *mnt_userns) { struct inode *inode; int err; @@ -8813,7 +9041,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, if (err < 0) return err; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino, + inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, + ino, ino, S_IFDIR | (~current_umask() & S_IRWXUGO), &index); if (IS_ERR(inode)) @@ -8857,6 +9086,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; + ei->ro_flags = 0; ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->dir_index = 0; @@ -9038,6 +9268,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; u32 bi_flags = BTRFS_I(inode)->flags; + u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; @@ -9050,13 +9281,15 @@ static int btrfs_getattr(struct user_namespace *mnt_userns, stat->attributes |= STATX_ATTR_IMMUTABLE; if (bi_flags & BTRFS_INODE_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + if (bi_ro_flags & BTRFS_INODE_RO_VERITY) + stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(mnt_userns, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -9090,8 +9323,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, bool dest_log_pinned = false; bool need_abort = false; - /* we only allow rename subvolume link between subvolumes */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + /* + * For non-subvolumes allow exchange only within one subvolume, in the + * same inode namespace. Two subvolumes (represented as directory) can + * be exchanged as they're a logical link and have a fixed inode number. + */ + if (root != dest && + (old_ino != BTRFS_FIRST_FREE_OBJECTID || + new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; /* close the race window with snapshot create/destroy ioctl */ @@ -9138,8 +9377,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(root); - root_log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9156,8 +9393,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(dest); - dest_log_pinned = true; ret = btrfs_insert_inode_ref(trans, root, old_dentry->d_name.name, old_dentry->d_name.len, @@ -9188,6 +9423,29 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode), 1); } + /* + * Now pin the logs of the roots. We do it to ensure that no other task + * can sync the logs while we are in progress with the rename, because + * that could result in an inconsistency in case any of the inodes that + * are part of this rename operation were logged before. + * + * We pin the logs even if at this precise moment none of the inodes was + * logged before. This is because right after we checked for that, some + * other task fsyncing some other inode not involved with this rename + * operation could log that one of our inodes exists. + * + * We don't need to pin the logs before the above calls to + * btrfs_insert_inode_ref(), since those don't ever need to change a log. + */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_pin_log_trans(root); + root_log_pinned = true; + } + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_pin_log_trans(dest); + dest_log_pinned = true; + } + /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); @@ -9269,8 +9527,7 @@ out_fail: if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - (new_inode && - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) + btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)) btrfs_set_log_full_commit(trans); if (root_log_pinned) { @@ -9294,6 +9551,7 @@ out_notrans: static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry) { @@ -9306,7 +9564,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, if (ret) return ret; - inode = btrfs_new_inode(trans, root, dir, + inode = btrfs_new_inode(trans, root, mnt_userns, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), @@ -9343,9 +9601,10 @@ out: return ret; } -static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int btrfs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; @@ -9440,8 +9699,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(root); - log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9465,6 +9722,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { + /* + * Now pin the log. We do it to ensure that no other task can + * sync the log while we are in progress with the rename, as + * that could result in an inconsistency in case any of the + * inodes that are part of this rename operation were logged + * before. + * + * We pin the log even if at this precise moment none of the + * inodes was logged before. This is because right after we + * checked for that, some other task fsyncing some other inode + * not involved with this rename operation could log that one of + * our inodes exists. + * + * We don't need to pin the logs before the above call to + * btrfs_insert_inode_ref(), since that does not need to change + * a log. + */ + btrfs_pin_log_trans(root); + log_pinned = true; ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, @@ -9518,8 +9794,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (flags & RENAME_WHITEOUT) { - ret = btrfs_whiteout_for_rename(trans, root, old_dir, - old_dentry); + ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, + old_dir, old_dentry); if (ret) { btrfs_abort_transaction(trans, ret); @@ -9569,7 +9845,8 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di return btrfs_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); + return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + new_dentry, flags); } struct btrfs_delalloc_work { @@ -9666,11 +9943,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = sync_inode(inode, wbc); - if (!ret && - test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = sync_inode(inode, wbc); + ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; @@ -9805,9 +10078,10 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_unlock; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), - objectid, S_IFLNK|S_IRWXUGO, &index); + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + btrfs_ino(BTRFS_I(dir)), objectid, + S_IFLNK | S_IRWXUGO, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; @@ -10131,7 +10405,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns, if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) return -EACCES; } - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(mnt_userns, inode, mask); } static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, @@ -10156,7 +10430,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (ret) goto out; - inode = btrfs_new_inode(trans, root, dir, NULL, 0, + inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0, btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -10199,17 +10473,21 @@ out: return ret; } -void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) { - struct inode *inode = tree->private_data; + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; struct page *page; + u32 len; + ASSERT(end + 1 - start <= U32_MAX); + len = end + 1 - start; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); + page = find_get_page(inode->vfs_inode.i_mapping, index); ASSERT(page); /* Pages should be in the extent_io_tree */ - set_page_writeback(page); + + btrfs_page_set_writeback(fs_info, page, start, len); put_page(page); index++; } |