diff options
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 517 |
1 files changed, 284 insertions, 233 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 46f392943f4d..e6eb20987351 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -51,6 +51,7 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "subpage.h" struct btrfs_iget_args { u64 ino; @@ -166,22 +167,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, struct page *page; while (index <= end_index) { + /* + * For locked page, we will call end_extent_writepage() on it + * in run_delalloc_range() for the error handling. That + * end_extent_writepage() function will call + * btrfs_mark_ordered_io_finished() to clear page Ordered and + * run the ordered extent accounting. + * + * Here we can't just clear the Ordered bit, or + * btrfs_mark_ordered_io_finished() would skip the accounting + * for the page range, and the ordered extent will never finish. + */ + if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { + index++; + continue; + } page = find_get_page(inode->vfs_inode.i_mapping, index); index++; if (!page) continue; - ClearPagePrivate2(page); + + /* + * Here we just clear all Ordered bits for every page in the + * range, then __endio_write_update_ordered() will handle + * the ordered extent accounting for the range. + */ + btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, + offset, bytes); put_page(page); } + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) + return; /* * In case this page belongs to the delalloc range being instantiated * then skip it, since the first page of a range is going to be * properly cleaned up by the caller of run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { - offset += PAGE_SIZE; - bytes -= PAGE_SIZE; + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; } return __endio_write_update_ordered(inode, offset, bytes, false); @@ -603,7 +629,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(BTRFS_I(inode), start, end)) { + if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -946,7 +972,8 @@ retry: const u64 end = start + async_extent->ram_size - 1; p->mapping = inode->vfs_inode.i_mapping; - btrfs_writepage_endio_finish_ordered(p, start, end, 0); + btrfs_writepage_endio_finish_ordered(inode, p, start, + end, 0); p->mapping = NULL; extent_clear_unlock_delalloc(inode, start, end, NULL, 0, @@ -1064,7 +1091,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, NULL, + extent_clear_unlock_delalloc(inode, start, end, + locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1072,6 +1100,19 @@ static noinline int cow_file_range(struct btrfs_inode *inode, *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; + /* + * locked_page is locked by the caller of + * writepage_delalloc(), not locked by + * __process_pages_contig(). + * + * We can't let __process_pages_contig() to unlock it, + * as it doesn't have any subpage::writers recorded. + * + * Here we manually unlock the page, since the caller + * can't use page_started to determine if it's an + * inline extent or a compressed extent. + */ + unlock_page(locked_page); goto out; } else if (ret < 0) { goto out_unlock; @@ -1150,15 +1191,16 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* we're not doing compressed IO, don't unlock the first - * page (which the caller expects to stay locked), don't - * clear any dirty bits and don't set any writeback bits + /* + * We're not doing compressed IO, don't unlock the first page + * (which the caller expects to stay locked), don't clear any + * dirty bits and don't set any writeback bits * - * Do set the Private2 bit so we know this page was properly - * setup for writepage + * Do set the Ordered (Private2) bit so we know this page was + * properly setup for writepage. */ page_ops = unlock ? PAGE_UNLOCK : 0; - page_ops |= PAGE_SET_PRIVATE2; + page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, locked_page, @@ -1822,7 +1864,7 @@ out_check: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_PRIVATE2); + PAGE_UNLOCK | PAGE_SET_ORDERED); cur_offset = extent_end; @@ -2193,26 +2235,22 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 logical = bio->bi_iter.bi_sector << 9; + u32 bio_len = bio->bi_iter.bi_size; struct extent_map *em; - u64 length = 0; - u64 map_length; int ret = 0; struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_iter.bi_size; - map_length = length; - em = btrfs_get_chunk_map(fs_info, logical, map_length); + em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); if (IS_ERR(em)) return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, - map_length, &geom); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); if (ret < 0) goto out; - if (geom.len < length + size) + if (geom.len < bio_len + size) ret = 1; out: free_extent_map(em); @@ -2233,33 +2271,6 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } -bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, - unsigned int size) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered; - u64 len = bio->bi_iter.bi_size + size; - bool ret = true; - - ASSERT(btrfs_is_zoned(fs_info)); - ASSERT(fs_info->max_zone_append_size > 0); - ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND); - - /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); - if (!ordered) - return ret; - - if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len > - ordered->disk_bytenr + ordered->disk_num_bytes) - ret = false; - - btrfs_put_ordered_extent(ordered); - - return ret; -} - static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, struct bio *bio, loff_t file_offset) { @@ -2601,7 +2612,7 @@ again: lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PagePrivate2(page)) + if (PageOrdered(page)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); @@ -2676,8 +2687,8 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_writepage_fixup *fixup; - /* this page is properly in the ordered list */ - if (TestClearPagePrivate2(page)) + /* This page has ordered extent covering it already */ + if (PageOrdered(page)) return 0; /* @@ -2773,7 +2784,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, /* * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the - * number of bytes only for that range contaning the inline extent. + * number of bytes only for that range containing the inline extent. * The remaining of the range will be processed when clearning the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. */ @@ -3069,28 +3080,14 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } -void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, u64 end, int uptodate) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workqueue *wq; - - trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); - - ClearPagePrivate2(page); - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1, uptodate)) - return; + trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(wq, &ordered_extent->work); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, + finish_ordered_fn, uptodate); } /* @@ -3152,15 +3149,19 @@ zeroit: * @bio_offset: offset to the beginning of the bio (in bytes) * @start: file offset of the range start * @end: file offset of the range end (inclusive) + * + * Return a bitmap where bit set means a csum mismatch, and bit not set means + * csum match. */ -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end) +unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, + struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; + unsigned int result = 0; if (PageChecked(page)) { ClearPageChecked(page); @@ -3188,10 +3189,14 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, page_offset(page) + pg_off); - if (ret < 0) - return -EIO; + if (ret < 0) { + const int nr_bit = (pg_off - offset_in_page(start)) >> + root->fs_info->sectorsize_bits; + + result |= (1U << nr_bit); + } } - return 0; + return result; } /* @@ -4109,7 +4114,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * This is a placeholder inode for a subvolume we didn't have a * reference to at the time of the snapshot creation. In the meantime * we could have renamed the real subvol link into our snapshot, so - * depending on btrfs_del_root_ref to return -ENOENT here is incorret. + * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. * Instead simply lookup the dir_index_item for this entry so we can * remove it. Otherwise we know we have a ref to the root and we can * call btrfs_del_root_ref, and it _shouldn't_ fail. @@ -4464,20 +4469,36 @@ out: #define NEED_TRUNCATE_BLOCK 1 /* - * this can truncate away extent items, csum items and directory items. - * It starts at a high offset and removes keys until it can't find - * any higher than new_size + * Remove inode items from a given root. * - * csum items that cross the new i_size are truncated to the new size - * as well. + * @trans: A transaction handle. + * @root: The root from which to remove items. + * @inode: The inode whose items we want to remove. + * @new_size: The new i_size for the inode. This is only applicable when + * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise. + * @min_type: The minimum key type to remove. All keys with a type + * greater than this value are removed and all keys with + * this type are removed only if their offset is >= @new_size. + * @extents_found: Output parameter that will contain the number of file + * extent items that were removed or adjusted to the new + * inode i_size. The caller is responsible for initializing + * the counter. Also, it can be NULL if the caller does not + * need this counter. * - * min_type is the minimum key type to truncate down to. If set to 0, this - * will kill all the items on this inode, including the INODE_ITEM_KEY. + * Remove all keys associated with the inode from the given root that have a key + * with a type greater than or equals to @min_type. When @min_type has a value of + * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value + * greater than or equals to @new_size. If a file extent item that starts before + * @new_size and ends after it is found, its length is adjusted. + * + * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is + * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. */ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, - u64 new_size, u32 min_type) + u64 new_size, u32 min_type, + u64 *extents_found) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; @@ -4623,6 +4644,9 @@ search_again: if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; + if (extents_found != NULL) + (*extents_found)++; + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; @@ -4941,7 +4965,7 @@ again: flush_dcache_page(page); } ClearPageChecked(page); - set_page_dirty(page); + btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) @@ -5455,7 +5479,7 @@ void btrfs_evict_inode(struct inode *inode) trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, 0); + 0, 0, NULL); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -7937,19 +7961,17 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, btrfs_ino(BTRFS_I(inode)), pgoff); } else { - blk_status_t status; + int ret; ASSERT((start - io_bio->logical) < UINT_MAX); - status = btrfs_submit_read_repair(inode, - &io_bio->bio, - start - io_bio->logical, - bvec.bv_page, pgoff, - start, - start + sectorsize - 1, - io_bio->mirror_num, - submit_dio_repair_bio); - if (status) - err = status; + ret = btrfs_repair_one_sector(inode, + &io_bio->bio, + start - io_bio->logical, + bvec.bv_page, pgoff, + start, io_bio->mirror_num, + submit_dio_repair_bio); + if (ret) + err = errno_to_blk_status(ret); } start += sectorsize; ASSERT(bio_offset + sectorsize > bio_offset); @@ -7964,41 +7986,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered = NULL; - struct btrfs_workqueue *wq; - u64 ordered_offset = offset; - u64 ordered_bytes = bytes; - u64 last_offset; - - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - while (ordered_offset < offset + bytes) { - last_offset = ordered_offset; - if (btrfs_dec_test_first_ordered_pending(inode, &ordered, - &ordered_offset, - ordered_bytes, - uptodate)) { - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, - NULL); - btrfs_queue_work(wq, &ordered->work); - } - - /* No ordered extent found in the range, exit */ - if (ordered_offset == last_offset) - return; - /* - * Our bio might span multiple ordered extents. In this case - * we keep going until we have accounted the whole dio. - */ - if (ordered_offset < offset + bytes) { - ordered_bytes = offset + bytes - ordered_offset; - ordered = NULL; - } - } + btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, + finish_ordered_fn, uptodate); } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, @@ -8172,7 +8161,7 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, goto out_err_em; } ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), - logical, submit_len, &geom); + logical, &geom); if (ret) { status = errno_to_blk_status(ret); goto out_err_em; @@ -8276,15 +8265,14 @@ int btrfs_readpage(struct file *file, struct page *page) struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - unsigned long bio_flags = 0; - struct bio *bio = NULL; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); - if (bio) - ret = submit_one_bio(bio, 0, bio_flags); + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + if (bio_ctrl.bio) + ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); return ret; } @@ -8353,9 +8341,9 @@ static int btrfs_migratepage(struct address_space *mapping, if (page_has_private(page)) attach_page_private(newpage, detach_page_private(page)); - if (PagePrivate2(page)) { - ClearPagePrivate2(page); - SetPagePrivate2(newpage); + if (PageOrdered(page)) { + ClearPageOrdered(page); + SetPageOrdered(newpage); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -8370,27 +8358,42 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; - struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; - u64 start; - u64 end; + u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; - bool found_ordered = false; - bool completed_ordered = false; /* - * we have the page locked, so new writeback can't start, - * and the dirty bit won't be cleared while we are here. + * We have page locked so no new ordered extent can be created on this + * page, nor bio can be submitted for this page. + * + * But already submitted bio can still be finished on this page. + * Furthermore, endio function won't skip page which has Ordered + * (Private2) already cleared, so it's possible for endio and + * invalidatepage to do the same ordered extent accounting twice + * on one page. * - * Wait for IO on this page so that we can safely clear - * the PagePrivate2 bit and do ordered accounting + * So here we wait for any submitted bios to finish, so that we won't + * do double ordered extent accounting on the same page. */ wait_on_page_writeback(page); - if (offset) { + /* + * For subpage case, we have call sites like + * btrfs_punch_hole_lock_range() which passes range not aligned to + * sectorsize. + * If the range doesn't cover the full page, we don't need to and + * shouldn't clear page extent mapped, as page->private can still + * record subpage dirty bits for other part of the range. + * + * For cases that can invalidate the full even the range doesn't + * cover the full page, like invalidating the last page, we're + * still safe to wait for ordered extent to finish. + */ + if (!(offset == 0 && length == PAGE_SIZE)) { btrfs_releasepage(page, GFP_NOFS); return; } @@ -8398,89 +8401,123 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); - start = page_start; -again: - ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); - if (ordered) { - found_ordered = true; - end = min(page_end, - ordered->file_offset + ordered->num_bytes - 1); + cur = page_start; + while (cur < page_end) { + struct btrfs_ordered_extent *ordered; + bool delete_states; + u64 range_end; + u32 range_len; + + ordered = btrfs_lookup_first_ordered_range(inode, cur, + page_end + 1 - cur); + if (!ordered) { + range_end = page_end; + /* + * No ordered extent covering this range, we are safe + * to delete all extent states in the range. + */ + delete_states = true; + goto next; + } + if (ordered->file_offset > cur) { + /* + * There is a range between [cur, oe->file_offset) not + * covered by any ordered extent. + * We are safe to delete all extent states, and handle + * the ordered extent in the next iteration. + */ + range_end = ordered->file_offset - 1; + delete_states = true; + goto next; + } + + range_end = min(ordered->file_offset + ordered->num_bytes - 1, + page_end); + ASSERT(range_end + 1 - cur < U32_MAX); + range_len = range_end + 1 - cur; + if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { + /* + * If Ordered (Private2) is cleared, it means endio has + * already been executed for the range. + * We can't delete the extent states as + * btrfs_finish_ordered_io() may still use some of them. + */ + delete_states = false; + goto next; + } + btrfs_page_clear_ordered(fs_info, page, cur, range_len); + /* * IO on this page will never be started, so we need to account * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW * here, must leave that up for the ordered extent completion. + * + * This will also unlock the range for incoming + * btrfs_finish_ordered_io(). */ if (!inode_evicting) - clear_extent_bit(tree, start, end, + clear_extent_bit(tree, cur, range_end, EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); + + spin_lock_irq(&inode->ordered_tree.lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + ordered->truncated_len = min(ordered->truncated_len, + cur - ordered->file_offset); + spin_unlock_irq(&inode->ordered_tree.lock); + + if (btrfs_dec_test_ordered_pending(inode, &ordered, + cur, range_end + 1 - cur, 1)) { + btrfs_finish_ordered_io(ordered); + /* + * The ordered extent has finished, now we're again + * safe to delete all extent states of the range. + */ + delete_states = true; + } else { + /* + * btrfs_finish_ordered_io() will get executed by endio + * of other pages, thus we can't delete extent states + * anymore + */ + delete_states = false; + } +next: + if (ordered) + btrfs_put_ordered_extent(ordered); /* - * whoever cleared the private bit is responsible - * for the finish_ordered_io + * Qgroup reserved space handler + * Sector(s) here will be either: + * + * 1) Already written to disk or bio already finished + * Then its QGROUP_RESERVED bit in io_tree is already cleared. + * Qgroup will be handled by its qgroup_record then. + * btrfs_qgroup_free_data() call will do nothing here. + * + * 2) Not written to disk yet + * Then btrfs_qgroup_free_data() call will clear the + * QGROUP_RESERVED bit of its io_tree, and free the qgroup + * reserved data space. + * Since the IO will never happen for this page. */ - if (TestClearPagePrivate2(page)) { - spin_lock_irq(&inode->ordered_tree.lock); - set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); - ordered->truncated_len = min(ordered->truncated_len, - start - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree.lock); - - if (btrfs_dec_test_ordered_pending(inode, &ordered, - start, - end - start + 1, 1)) { - btrfs_finish_ordered_io(ordered); - completed_ordered = true; - } - } - btrfs_put_ordered_extent(ordered); + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); if (!inode_evicting) { - cached_state = NULL; - lock_extent_bits(tree, start, end, - &cached_state); + clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_UPTODATE | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, + delete_states, &cached_state); } - - start = end + 1; - if (start < page_end) - goto again; + cur = range_end + 1; } - /* - * Qgroup reserved space handler - * Page here will be either - * 1) Already written to disk or ordered extent already submitted - * Then its QGROUP_RESERVED bit in io_tree is already cleaned. - * Qgroup will be handled by its qgroup_record then. - * btrfs_qgroup_free_data() call will do nothing here. - * - * 2) Not written to disk yet - * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED - * bit of its io_tree, and free the qgroup reserved data space. - * Since the IO will never happen for this page. + * We have iterated through all ordered extents of the page, the page + * should not have Ordered (Private2) anymore, or the above iteration + * did something wrong. */ - btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); - if (!inode_evicting) { - bool delete = true; - - /* - * If there's an ordered extent for this range and we have not - * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set - * in the range for the ordered extent completion. We must also - * not delete the range, otherwise we would lose that bit (and - * any other bits set in the range). Make sure EXTENT_UPTODATE - * is cleared if we don't delete, otherwise it can lead to - * corruptions if the i_size is extented later. - */ - if (found_ordered && !completed_ordered) - delete = false; - clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, - delete, &cached_state); - + ASSERT(!PageOrdered(page)); + if (!inode_evicting) __btrfs_releasepage(page, GFP_NOFS); - } - ClearPageChecked(page); clear_page_extent_mapped(page); } @@ -8626,8 +8663,8 @@ again: flush_dcache_page(page); } ClearPageChecked(page); - set_page_dirty(page); - SetPageUptodate(page); + btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); + btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); @@ -8661,6 +8698,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + u64 extents_found = 0; if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -8718,20 +8756,13 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) min_size, false); BUG_ON(ret); - /* - * So if we truncate and then write and fsync we normally would just - * write the extents that changed, which is a problem if we need to - * first truncate that entire inode. So set this flag so we write out - * all of the extents in the inode to the sync log so we're completely - * safe. - */ - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); trans->block_rsv = rsv; while (1) { ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), inode->i_size, - BTRFS_EXTENT_DATA_KEY); + BTRFS_EXTENT_DATA_KEY, + &extents_found); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; @@ -8793,6 +8824,22 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) } out: btrfs_free_block_rsv(fs_info, rsv); + /* + * So if we truncate and then write and fsync we normally would just + * write the extents that changed, which is a problem if we need to + * first truncate that entire inode. So set this flag so we write out + * all of the extents in the inode to the sync log so we're completely + * safe. + * + * If no extents were dropped or trimmed we don't need to force the next + * fsync to truncate all the inode's items from the log and re-log them + * all. This means the truncate operation did not change the file size, + * or changed it to a smaller size but there was only an implicit hole + * between the old i_size and the new i_size, and there were no prealloc + * extents beyond i_size to drop. + */ + if (extents_found > 0) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); return ret; } @@ -10199,17 +10246,21 @@ out: return ret; } -void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) { - struct inode *inode = tree->private_data; + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; struct page *page; + u32 len; + ASSERT(end + 1 - start <= U32_MAX); + len = end + 1 - start; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); + page = find_get_page(inode->vfs_inode.i_mapping, index); ASSERT(page); /* Pages should be in the extent_io_tree */ - set_page_writeback(page); + + btrfs_page_set_writeback(fs_info, page, start, len); put_page(page); index++; } |