aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c517
1 files changed, 284 insertions, 233 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 46f392943f4d..e6eb20987351 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -51,6 +51,7 @@
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
+#include "subpage.h"
struct btrfs_iget_args {
u64 ino;
@@ -166,22 +167,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
struct page *page;
while (index <= end_index) {
+ /*
+ * For locked page, we will call end_extent_writepage() on it
+ * in run_delalloc_range() for the error handling. That
+ * end_extent_writepage() function will call
+ * btrfs_mark_ordered_io_finished() to clear page Ordered and
+ * run the ordered extent accounting.
+ *
+ * Here we can't just clear the Ordered bit, or
+ * btrfs_mark_ordered_io_finished() would skip the accounting
+ * for the page range, and the ordered extent will never finish.
+ */
+ if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
+ index++;
+ continue;
+ }
page = find_get_page(inode->vfs_inode.i_mapping, index);
index++;
if (!page)
continue;
- ClearPagePrivate2(page);
+
+ /*
+ * Here we just clear all Ordered bits for every page in the
+ * range, then __endio_write_update_ordered() will handle
+ * the ordered extent accounting for the range.
+ */
+ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
+ offset, bytes);
put_page(page);
}
+ /* The locked page covers the full range, nothing needs to be done */
+ if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
+ return;
/*
* In case this page belongs to the delalloc range being instantiated
* then skip it, since the first page of a range is going to be
* properly cleaned up by the caller of run_delalloc_range
*/
if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- offset += PAGE_SIZE;
- bytes -= PAGE_SIZE;
+ bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
+ offset = page_offset(locked_page) + PAGE_SIZE;
}
return __endio_write_update_ordered(inode, offset, bytes, false);
@@ -603,7 +629,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (inode_need_compress(BTRFS_I(inode), start, end)) {
+ if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
@@ -946,7 +972,8 @@ retry:
const u64 end = start + async_extent->ram_size - 1;
p->mapping = inode->vfs_inode.i_mapping;
- btrfs_writepage_endio_finish_ordered(p, start, end, 0);
+ btrfs_writepage_endio_finish_ordered(inode, p, start,
+ end, 0);
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@ -1064,7 +1091,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(inode, start, end, NULL,
+ extent_clear_unlock_delalloc(inode, start, end,
+ locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
@@ -1072,6 +1100,19 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
+ /*
+ * locked_page is locked by the caller of
+ * writepage_delalloc(), not locked by
+ * __process_pages_contig().
+ *
+ * We can't let __process_pages_contig() to unlock it,
+ * as it doesn't have any subpage::writers recorded.
+ *
+ * Here we manually unlock the page, since the caller
+ * can't use page_started to determine if it's an
+ * inline extent or a compressed extent.
+ */
+ unlock_page(locked_page);
goto out;
} else if (ret < 0) {
goto out_unlock;
@@ -1150,15 +1191,16 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- /* we're not doing compressed IO, don't unlock the first
- * page (which the caller expects to stay locked), don't
- * clear any dirty bits and don't set any writeback bits
+ /*
+ * We're not doing compressed IO, don't unlock the first page
+ * (which the caller expects to stay locked), don't clear any
+ * dirty bits and don't set any writeback bits
*
- * Do set the Private2 bit so we know this page was properly
- * setup for writepage
+ * Do set the Ordered (Private2) bit so we know this page was
+ * properly setup for writepage.
*/
page_ops = unlock ? PAGE_UNLOCK : 0;
- page_ops |= PAGE_SET_PRIVATE2;
+ page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
locked_page,
@@ -1822,7 +1864,7 @@ out_check:
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_PRIVATE2);
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
cur_offset = extent_end;
@@ -2193,26 +2235,22 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 logical = bio->bi_iter.bi_sector << 9;
+ u32 bio_len = bio->bi_iter.bi_size;
struct extent_map *em;
- u64 length = 0;
- u64 map_length;
int ret = 0;
struct btrfs_io_geometry geom;
if (bio_flags & EXTENT_BIO_COMPRESSED)
return 0;
- length = bio->bi_iter.bi_size;
- map_length = length;
- em = btrfs_get_chunk_map(fs_info, logical, map_length);
+ em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical,
- map_length, &geom);
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
if (ret < 0)
goto out;
- if (geom.len < length + size)
+ if (geom.len < bio_len + size)
ret = 1;
out:
free_extent_map(em);
@@ -2233,33 +2271,6 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
-bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
- unsigned int size)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered;
- u64 len = bio->bi_iter.bi_size + size;
- bool ret = true;
-
- ASSERT(btrfs_is_zoned(fs_info));
- ASSERT(fs_info->max_zone_append_size > 0);
- ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND);
-
- /* Ordered extent not yet created, so we're good */
- ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
- if (!ordered)
- return ret;
-
- if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len >
- ordered->disk_bytenr + ordered->disk_num_bytes)
- ret = false;
-
- btrfs_put_ordered_extent(ordered);
-
- return ret;
-}
-
static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
struct bio *bio, loff_t file_offset)
{
@@ -2601,7 +2612,7 @@ again:
lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
- if (PagePrivate2(page))
+ if (PageOrdered(page))
goto out_reserved;
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
@@ -2676,8 +2687,8 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_writepage_fixup *fixup;
- /* this page is properly in the ordered list */
- if (TestClearPagePrivate2(page))
+ /* This page has ordered extent covering it already */
+ if (PageOrdered(page))
return 0;
/*
@@ -2773,7 +2784,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
/*
* If we dropped an inline extent here, we know the range where it is
* was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
- * number of bytes only for that range contaning the inline extent.
+ * number of bytes only for that range containing the inline extent.
* The remaining of the range will be processed when clearning the
* EXTENT_DELALLOC_BIT bit through the ordered extent completion.
*/
@@ -3069,28 +3080,14 @@ static void finish_ordered_fn(struct btrfs_work *work)
btrfs_finish_ordered_io(ordered_extent);
}
-void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
u64 end, int uptodate)
{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered_extent = NULL;
- struct btrfs_workqueue *wq;
-
- trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
-
- ClearPagePrivate2(page);
- if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
- end - start + 1, uptodate))
- return;
+ trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
- if (btrfs_is_free_space_inode(inode))
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
-
- btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(wq, &ordered_extent->work);
+ btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
+ finish_ordered_fn, uptodate);
}
/*
@@ -3152,15 +3149,19 @@ zeroit:
* @bio_offset: offset to the beginning of the bio (in bytes)
* @start: file offset of the range start
* @end: file offset of the range end (inclusive)
+ *
+ * Return a bitmap where bit set means a csum mismatch, and bit not set means
+ * csum match.
*/
-int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end)
+unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
+ struct page *page, u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
+ unsigned int result = 0;
if (PageChecked(page)) {
ClearPageChecked(page);
@@ -3188,10 +3189,14 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
- if (ret < 0)
- return -EIO;
+ if (ret < 0) {
+ const int nr_bit = (pg_off - offset_in_page(start)) >>
+ root->fs_info->sectorsize_bits;
+
+ result |= (1U << nr_bit);
+ }
}
- return 0;
+ return result;
}
/*
@@ -4109,7 +4114,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
* This is a placeholder inode for a subvolume we didn't have a
* reference to at the time of the snapshot creation. In the meantime
* we could have renamed the real subvol link into our snapshot, so
- * depending on btrfs_del_root_ref to return -ENOENT here is incorret.
+ * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
* Instead simply lookup the dir_index_item for this entry so we can
* remove it. Otherwise we know we have a ref to the root and we can
* call btrfs_del_root_ref, and it _shouldn't_ fail.
@@ -4464,20 +4469,36 @@ out:
#define NEED_TRUNCATE_BLOCK 1
/*
- * this can truncate away extent items, csum items and directory items.
- * It starts at a high offset and removes keys until it can't find
- * any higher than new_size
+ * Remove inode items from a given root.
*
- * csum items that cross the new i_size are truncated to the new size
- * as well.
+ * @trans: A transaction handle.
+ * @root: The root from which to remove items.
+ * @inode: The inode whose items we want to remove.
+ * @new_size: The new i_size for the inode. This is only applicable when
+ * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
+ * @min_type: The minimum key type to remove. All keys with a type
+ * greater than this value are removed and all keys with
+ * this type are removed only if their offset is >= @new_size.
+ * @extents_found: Output parameter that will contain the number of file
+ * extent items that were removed or adjusted to the new
+ * inode i_size. The caller is responsible for initializing
+ * the counter. Also, it can be NULL if the caller does not
+ * need this counter.
*
- * min_type is the minimum key type to truncate down to. If set to 0, this
- * will kill all the items on this inode, including the INODE_ITEM_KEY.
+ * Remove all keys associated with the inode from the given root that have a key
+ * with a type greater than or equals to @min_type. When @min_type has a value of
+ * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
+ * greater than or equals to @new_size. If a file extent item that starts before
+ * @new_size and ends after it is found, its length is adjusted.
+ *
+ * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
+ * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
*/
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
- u64 new_size, u32 min_type)
+ u64 new_size, u32 min_type,
+ u64 *extents_found)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
@@ -4623,6 +4644,9 @@ search_again:
if (found_type != BTRFS_EXTENT_DATA_KEY)
goto delete;
+ if (extents_found != NULL)
+ (*extents_found)++;
+
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
@@ -4941,7 +4965,7 @@ again:
flush_dcache_page(page);
}
ClearPageChecked(page);
- set_page_dirty(page);
+ btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
@@ -5455,7 +5479,7 @@ void btrfs_evict_inode(struct inode *inode)
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- 0, 0);
+ 0, 0, NULL);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -7937,19 +7961,17 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
btrfs_ino(BTRFS_I(inode)),
pgoff);
} else {
- blk_status_t status;
+ int ret;
ASSERT((start - io_bio->logical) < UINT_MAX);
- status = btrfs_submit_read_repair(inode,
- &io_bio->bio,
- start - io_bio->logical,
- bvec.bv_page, pgoff,
- start,
- start + sectorsize - 1,
- io_bio->mirror_num,
- submit_dio_repair_bio);
- if (status)
- err = status;
+ ret = btrfs_repair_one_sector(inode,
+ &io_bio->bio,
+ start - io_bio->logical,
+ bvec.bv_page, pgoff,
+ start, io_bio->mirror_num,
+ submit_dio_repair_bio);
+ if (ret)
+ err = errno_to_blk_status(ret);
}
start += sectorsize;
ASSERT(bio_offset + sectorsize > bio_offset);
@@ -7964,41 +7986,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered = NULL;
- struct btrfs_workqueue *wq;
- u64 ordered_offset = offset;
- u64 ordered_bytes = bytes;
- u64 last_offset;
-
- if (btrfs_is_free_space_inode(inode))
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
-
- while (ordered_offset < offset + bytes) {
- last_offset = ordered_offset;
- if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
- &ordered_offset,
- ordered_bytes,
- uptodate)) {
- btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
- NULL);
- btrfs_queue_work(wq, &ordered->work);
- }
-
- /* No ordered extent found in the range, exit */
- if (ordered_offset == last_offset)
- return;
- /*
- * Our bio might span multiple ordered extents. In this case
- * we keep going until we have accounted the whole dio.
- */
- if (ordered_offset < offset + bytes) {
- ordered_bytes = offset + bytes - ordered_offset;
- ordered = NULL;
- }
- }
+ btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
+ finish_ordered_fn, uptodate);
}
static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
@@ -8172,7 +8161,7 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
goto out_err_em;
}
ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
- logical, submit_len, &geom);
+ logical, &geom);
if (ret) {
status = errno_to_blk_status(ret);
goto out_err_em;
@@ -8276,15 +8265,14 @@ int btrfs_readpage(struct file *file, struct page *page)
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- unsigned long bio_flags = 0;
- struct bio *bio = NULL;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
int ret;
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
- ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
- if (bio)
- ret = submit_one_bio(bio, 0, bio_flags);
+ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+ if (bio_ctrl.bio)
+ ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
return ret;
}
@@ -8353,9 +8341,9 @@ static int btrfs_migratepage(struct address_space *mapping,
if (page_has_private(page))
attach_page_private(newpage, detach_page_private(page));
- if (PagePrivate2(page)) {
- ClearPagePrivate2(page);
- SetPagePrivate2(newpage);
+ if (PageOrdered(page)) {
+ ClearPageOrdered(page);
+ SetPageOrdered(newpage);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -8370,27 +8358,42 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
- struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
- u64 start;
- u64 end;
+ u64 cur;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
- bool found_ordered = false;
- bool completed_ordered = false;
/*
- * we have the page locked, so new writeback can't start,
- * and the dirty bit won't be cleared while we are here.
+ * We have page locked so no new ordered extent can be created on this
+ * page, nor bio can be submitted for this page.
+ *
+ * But already submitted bio can still be finished on this page.
+ * Furthermore, endio function won't skip page which has Ordered
+ * (Private2) already cleared, so it's possible for endio and
+ * invalidatepage to do the same ordered extent accounting twice
+ * on one page.
*
- * Wait for IO on this page so that we can safely clear
- * the PagePrivate2 bit and do ordered accounting
+ * So here we wait for any submitted bios to finish, so that we won't
+ * do double ordered extent accounting on the same page.
*/
wait_on_page_writeback(page);
- if (offset) {
+ /*
+ * For subpage case, we have call sites like
+ * btrfs_punch_hole_lock_range() which passes range not aligned to
+ * sectorsize.
+ * If the range doesn't cover the full page, we don't need to and
+ * shouldn't clear page extent mapped, as page->private can still
+ * record subpage dirty bits for other part of the range.
+ *
+ * For cases that can invalidate the full even the range doesn't
+ * cover the full page, like invalidating the last page, we're
+ * still safe to wait for ordered extent to finish.
+ */
+ if (!(offset == 0 && length == PAGE_SIZE)) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
@@ -8398,89 +8401,123 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
if (!inode_evicting)
lock_extent_bits(tree, page_start, page_end, &cached_state);
- start = page_start;
-again:
- ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
- if (ordered) {
- found_ordered = true;
- end = min(page_end,
- ordered->file_offset + ordered->num_bytes - 1);
+ cur = page_start;
+ while (cur < page_end) {
+ struct btrfs_ordered_extent *ordered;
+ bool delete_states;
+ u64 range_end;
+ u32 range_len;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, cur,
+ page_end + 1 - cur);
+ if (!ordered) {
+ range_end = page_end;
+ /*
+ * No ordered extent covering this range, we are safe
+ * to delete all extent states in the range.
+ */
+ delete_states = true;
+ goto next;
+ }
+ if (ordered->file_offset > cur) {
+ /*
+ * There is a range between [cur, oe->file_offset) not
+ * covered by any ordered extent.
+ * We are safe to delete all extent states, and handle
+ * the ordered extent in the next iteration.
+ */
+ range_end = ordered->file_offset - 1;
+ delete_states = true;
+ goto next;
+ }
+
+ range_end = min(ordered->file_offset + ordered->num_bytes - 1,
+ page_end);
+ ASSERT(range_end + 1 - cur < U32_MAX);
+ range_len = range_end + 1 - cur;
+ if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
+ /*
+ * If Ordered (Private2) is cleared, it means endio has
+ * already been executed for the range.
+ * We can't delete the extent states as
+ * btrfs_finish_ordered_io() may still use some of them.
+ */
+ delete_states = false;
+ goto next;
+ }
+ btrfs_page_clear_ordered(fs_info, page, cur, range_len);
+
/*
* IO on this page will never be started, so we need to account
* for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
* here, must leave that up for the ordered extent completion.
+ *
+ * This will also unlock the range for incoming
+ * btrfs_finish_ordered_io().
*/
if (!inode_evicting)
- clear_extent_bit(tree, start, end,
+ clear_extent_bit(tree, cur, range_end,
EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state);
+
+ spin_lock_irq(&inode->ordered_tree.lock);
+ set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+ ordered->truncated_len = min(ordered->truncated_len,
+ cur - ordered->file_offset);
+ spin_unlock_irq(&inode->ordered_tree.lock);
+
+ if (btrfs_dec_test_ordered_pending(inode, &ordered,
+ cur, range_end + 1 - cur, 1)) {
+ btrfs_finish_ordered_io(ordered);
+ /*
+ * The ordered extent has finished, now we're again
+ * safe to delete all extent states of the range.
+ */
+ delete_states = true;
+ } else {
+ /*
+ * btrfs_finish_ordered_io() will get executed by endio
+ * of other pages, thus we can't delete extent states
+ * anymore
+ */
+ delete_states = false;
+ }
+next:
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
/*
- * whoever cleared the private bit is responsible
- * for the finish_ordered_io
+ * Qgroup reserved space handler
+ * Sector(s) here will be either:
+ *
+ * 1) Already written to disk or bio already finished
+ * Then its QGROUP_RESERVED bit in io_tree is already cleared.
+ * Qgroup will be handled by its qgroup_record then.
+ * btrfs_qgroup_free_data() call will do nothing here.
+ *
+ * 2) Not written to disk yet
+ * Then btrfs_qgroup_free_data() call will clear the
+ * QGROUP_RESERVED bit of its io_tree, and free the qgroup
+ * reserved data space.
+ * Since the IO will never happen for this page.
*/
- if (TestClearPagePrivate2(page)) {
- spin_lock_irq(&inode->ordered_tree.lock);
- set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
- ordered->truncated_len = min(ordered->truncated_len,
- start - ordered->file_offset);
- spin_unlock_irq(&inode->ordered_tree.lock);
-
- if (btrfs_dec_test_ordered_pending(inode, &ordered,
- start,
- end - start + 1, 1)) {
- btrfs_finish_ordered_io(ordered);
- completed_ordered = true;
- }
- }
- btrfs_put_ordered_extent(ordered);
+ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
if (!inode_evicting) {
- cached_state = NULL;
- lock_extent_bits(tree, start, end,
- &cached_state);
+ clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_UPTODATE |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
+ delete_states, &cached_state);
}
-
- start = end + 1;
- if (start < page_end)
- goto again;
+ cur = range_end + 1;
}
-
/*
- * Qgroup reserved space handler
- * Page here will be either
- * 1) Already written to disk or ordered extent already submitted
- * Then its QGROUP_RESERVED bit in io_tree is already cleaned.
- * Qgroup will be handled by its qgroup_record then.
- * btrfs_qgroup_free_data() call will do nothing here.
- *
- * 2) Not written to disk yet
- * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED
- * bit of its io_tree, and free the qgroup reserved data space.
- * Since the IO will never happen for this page.
+ * We have iterated through all ordered extents of the page, the page
+ * should not have Ordered (Private2) anymore, or the above iteration
+ * did something wrong.
*/
- btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
- if (!inode_evicting) {
- bool delete = true;
-
- /*
- * If there's an ordered extent for this range and we have not
- * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set
- * in the range for the ordered extent completion. We must also
- * not delete the range, otherwise we would lose that bit (and
- * any other bits set in the range). Make sure EXTENT_UPTODATE
- * is cleared if we don't delete, otherwise it can lead to
- * corruptions if the i_size is extented later.
- */
- if (found_ordered && !completed_ordered)
- delete = false;
- clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_UPTODATE |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
- delete, &cached_state);
-
+ ASSERT(!PageOrdered(page));
+ if (!inode_evicting)
__btrfs_releasepage(page, GFP_NOFS);
- }
-
ClearPageChecked(page);
clear_page_extent_mapped(page);
}
@@ -8626,8 +8663,8 @@ again:
flush_dcache_page(page);
}
ClearPageChecked(page);
- set_page_dirty(page);
- SetPageUptodate(page);
+ btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
+ btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
@@ -8661,6 +8698,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
+ u64 extents_found = 0;
if (!skip_writeback) {
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
@@ -8718,20 +8756,13 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
min_size, false);
BUG_ON(ret);
- /*
- * So if we truncate and then write and fsync we normally would just
- * write the extents that changed, which is a problem if we need to
- * first truncate that entire inode. So set this flag so we write out
- * all of the extents in the inode to the sync log so we're completely
- * safe.
- */
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
trans->block_rsv = rsv;
while (1) {
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
inode->i_size,
- BTRFS_EXTENT_DATA_KEY);
+ BTRFS_EXTENT_DATA_KEY,
+ &extents_found);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
@@ -8793,6 +8824,22 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
}
out:
btrfs_free_block_rsv(fs_info, rsv);
+ /*
+ * So if we truncate and then write and fsync we normally would just
+ * write the extents that changed, which is a problem if we need to
+ * first truncate that entire inode. So set this flag so we write out
+ * all of the extents in the inode to the sync log so we're completely
+ * safe.
+ *
+ * If no extents were dropped or trimmed we don't need to force the next
+ * fsync to truncate all the inode's items from the log and re-log them
+ * all. This means the truncate operation did not change the file size,
+ * or changed it to a smaller size but there was only an implicit hole
+ * between the old i_size and the new i_size, and there were no prealloc
+ * extents beyond i_size to drop.
+ */
+ if (extents_found > 0)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
return ret;
}
@@ -10199,17 +10246,21 @@ out:
return ret;
}
-void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct inode *inode = tree->private_data;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
+ u32 len;
+ ASSERT(end + 1 - start <= U32_MAX);
+ len = end + 1 - start;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
- set_page_writeback(page);
+
+ btrfs_page_set_writeback(fs_info, page, start, len);
put_page(page);
index++;
}