From 2280d425ba3599bdd85c41bd0ec8ba568f00c032 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 28 Mar 2023 10:45:20 +0100 Subject: btrfs: ignore fiemap path cache when there are multiple paths for a node During fiemap, when walking backreferences to determine if a b+tree node/leaf is shared, we may find a tree block (leaf or node) for which two parents were added to the references ulist. This happens if we get for example one direct ref (shared tree block ref) and one indirect ref (non-shared tree block ref) for the tree block at the current level, which can happen during relocation. In that case the fiemap path cache can not be used since it's meant for a single path, with one tree block at each possible level, so having multiple references for a tree block at any level may result in getting the level counter exceed BTRFS_MAX_LEVEL and eventually trigger the warning: WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL) at lookup_backref_shared_cache() and at store_backref_shared_cache(). This is harmless since the code ignores any level >= BTRFS_MAX_LEVEL, the warning is there just to catch any unexpected case like the one described above. However if a user finds this it may be scary and get reported. So just ignore the path cache once we find a tree block for which there are more than one reference, which is the less common case, and update the cache with the sharedness check result for all levels below the level for which we found multiple references. Reported-by: Jarno Pelkonen Link: https://lore.kernel.org/linux-btrfs/CAKv8qLmDNAGJGCtsevxx_VZ_YOvvs1L83iEJkTgyA4joJertng@mail.gmail.com/ Fixes: 12a824dc67a6 ("btrfs: speedup checking for extent sharedness during fiemap") CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 85 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 22 deletions(-) (limited to 'fs/btrfs/backref.c') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 90e40d5ceccd..e54f0884802a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1921,8 +1921,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, level = -1; ULIST_ITER_INIT(&uiter); while (1) { - bool is_shared; - bool cached; + const unsigned long prev_ref_count = ctx->refs.nnodes; walk_ctx.bytenr = bytenr; ret = find_parent_nodes(&walk_ctx, &shared); @@ -1940,21 +1939,36 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, ret = 0; /* - * If our data extent was not directly shared (without multiple - * reference items), than it might have a single reference item - * with a count > 1 for the same offset, which means there are 2 - * (or more) file extent items that point to the data extent - - * this happens when a file extent item needs to be split and - * then one item gets moved to another leaf due to a b+tree leaf - * split when inserting some item. In this case the file extent - * items may be located in different leaves and therefore some - * of the leaves may be referenced through shared subtrees while - * others are not. Since our extent buffer cache only works for - * a single path (by far the most common case and simpler to - * deal with), we can not use it if we have multiple leaves - * (which implies multiple paths). + * More than one extent buffer (bytenr) may have been added to + * the ctx->refs ulist, in which case we have to check multiple + * tree paths in case the first one is not shared, so we can not + * use the path cache which is made for a single path. Multiple + * extent buffers at the current level happen when: + * + * 1) level -1, the data extent: If our data extent was not + * directly shared (without multiple reference items), then + * it might have a single reference item with a count > 1 for + * the same offset, which means there are 2 (or more) file + * extent items that point to the data extent - this happens + * when a file extent item needs to be split and then one + * item gets moved to another leaf due to a b+tree leaf split + * when inserting some item. In this case the file extent + * items may be located in different leaves and therefore + * some of the leaves may be referenced through shared + * subtrees while others are not. Since our extent buffer + * cache only works for a single path (by far the most common + * case and simpler to deal with), we can not use it if we + * have multiple leaves (which implies multiple paths). + * + * 2) level >= 0, a tree node/leaf: We can have a mix of direct + * and indirect references on a b+tree node/leaf, so we have + * to check multiple paths, and the extent buffer (the + * current bytenr) may be shared or not. One example is + * during relocation as we may get a shared tree block ref + * (direct ref) and a non-shared tree block ref (indirect + * ref) for the same node/leaf. */ - if (level == -1 && ctx->refs.nnodes > 1) + if ((ctx->refs.nnodes - prev_ref_count) > 1) ctx->use_path_cache = false; if (level >= 0) @@ -1964,18 +1978,45 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, if (!node) break; bytenr = node->val; - level++; - cached = lookup_backref_shared_cache(ctx, root, bytenr, level, - &is_shared); - if (cached) { - ret = (is_shared ? 1 : 0); - break; + if (ctx->use_path_cache) { + bool is_shared; + bool cached; + + level++; + cached = lookup_backref_shared_cache(ctx, root, bytenr, + level, &is_shared); + if (cached) { + ret = (is_shared ? 1 : 0); + break; + } } shared.share_count = 0; shared.have_delayed_delete_refs = false; cond_resched(); } + /* + * If the path cache is disabled, then it means at some tree level we + * got multiple parents due to a mix of direct and indirect backrefs or + * multiple leaves with file extent items pointing to the same data + * extent. We have to invalidate the cache and cache only the sharedness + * result for the levels where we got only one node/reference. + */ + if (!ctx->use_path_cache) { + int i = 0; + + level--; + if (ret >= 0 && level >= 0) { + bytenr = ctx->path_cache_entries[level].bytenr; + ctx->use_path_cache = true; + store_backref_shared_cache(ctx, root, bytenr, level, ret); + i = level + 1; + } + + for ( ; i < BTRFS_MAX_LEVEL; i++) + ctx->path_cache_entries[i].bytenr = 0; + } + /* * Cache the sharedness result for the data extent if we know our inode * has more than 1 file extent item that refers to the data extent. -- cgit From 0cad8f14d70cfeb5173dce93cafeba665a95430e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 9 May 2023 12:50:02 +0100 Subject: btrfs: fix backref walking not returning all inode refs When using the logical to ino ioctl v2, if the flag to ignore offsets of file extent items (BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET) is given, the backref walking code ends up not returning references for all file offsets of an inode that point to the given logical bytenr. This happens since kernel 6.2, commit 6ce6ba534418 ("btrfs: use a single argument for extent offset in backref walking functions") because: 1) It mistakenly skipped the search for file extent items in a leaf that point to the target extent if that flag is given. Instead it should only skip the filtering done by check_extent_in_eb() - that is, it should not avoid the calls to that function (or find_extent_in_eb(), which uses it). 2) It was also not building a list of inode extent elements (struct extent_inode_elem) if we have multiple inode references for an extent when the ignore offset flag is given to the logical to ino ioctl - it would leave a single element, only the last one that was found. These stem from the confusing old interface for backref walking functions where we had an extent item offset argument that was a pointer to a u64 and another boolean argument that indicated if the offset should be ignored, but the pointer could be NULL. That NULL case is used by relocation, qgroup extent accounting and fiemap, simply to avoid building the inode extent list for each reference, as it's not necessary for those use cases and therefore avoids memory allocations and some computations. Fix this by adding a boolean argument to the backref walk context structure to indicate that the inode extent list should not be built, make relocation set that argument to true and fix the backref walking logic to skip the calls to check_extent_in_eb() and find_extent_in_eb() only if this new argument is true, instead of 'ignore_extent_item_pos' being true. A test case for fstests will be added soon, to provide cover not only for these cases but to the logical to ino ioctl in general as well, as currently we do not have a test case for it. Reported-by: Vladimir Panteleev Link: https://lore.kernel.org/linux-btrfs/CAHhfkvwo=nmzrJSqZ2qMfF-rZB-ab6ahHnCD_sq9h4o8v+M7QQ@mail.gmail.com/ Fixes: 6ce6ba534418 ("btrfs: use a single argument for extent offset in backref walking functions") CC: stable@vger.kernel.org # 6.2+ Tested-by: Vladimir Panteleev Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 19 ++++++++++--------- fs/btrfs/backref.h | 6 ++++++ fs/btrfs/relocation.c | 2 +- 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/backref.c') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e54f0884802a..79336fa853db 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -45,7 +45,8 @@ static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, int root_count; bool cached; - if (!btrfs_file_extent_compression(eb, fi) && + if (!ctx->ignore_extent_item_pos && + !btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; @@ -552,7 +553,7 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, count++; else goto next; - if (!ctx->ignore_extent_item_pos) { + if (!ctx->skip_inode_ref_list) { ret = check_extent_in_eb(ctx, &key, eb, fi, &eie); if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) @@ -564,7 +565,7 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, eie, (void **)&old, GFP_NOFS); if (ret < 0) break; - if (!ret && !ctx->ignore_extent_item_pos) { + if (!ret && !ctx->skip_inode_ref_list) { while (old->next) old = old->next; old->next = eie; @@ -1606,7 +1607,7 @@ again: goto out; } if (ref->count && ref->parent) { - if (!ctx->ignore_extent_item_pos && !ref->inode_list && + if (!ctx->skip_inode_ref_list && !ref->inode_list && ref->level == 0) { struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; @@ -1647,7 +1648,7 @@ again: (void **)&eie, GFP_NOFS); if (ret < 0) goto out; - if (!ret && !ctx->ignore_extent_item_pos) { + if (!ret && !ctx->skip_inode_ref_list) { /* * We've recorded that parent, so we must extend * its inode list here. @@ -1743,7 +1744,7 @@ int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) { const u64 orig_bytenr = ctx->bytenr; - const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos; + const bool orig_skip_inode_ref_list = ctx->skip_inode_ref_list; bool roots_ulist_allocated = false; struct ulist_iterator uiter; int ret = 0; @@ -1764,7 +1765,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) roots_ulist_allocated = true; } - ctx->ignore_extent_item_pos = true; + ctx->skip_inode_ref_list = true; ULIST_ITER_INIT(&uiter); while (1) { @@ -1789,7 +1790,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) ulist_free(ctx->refs); ctx->refs = NULL; ctx->bytenr = orig_bytenr; - ctx->ignore_extent_item_pos = orig_ignore_extent_item_pos; + ctx->skip_inode_ref_list = orig_skip_inode_ref_list; return ret; } @@ -1912,7 +1913,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, goto out_trans; } - walk_ctx.ignore_extent_item_pos = true; + walk_ctx.skip_inode_ref_list = true; walk_ctx.trans = trans; walk_ctx.fs_info = fs_info; walk_ctx.refs = &ctx->refs; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ef6bbea3f456..1616e3e3f1e4 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -60,6 +60,12 @@ struct btrfs_backref_walk_ctx { * @extent_item_pos is ignored. */ bool ignore_extent_item_pos; + /* + * If true and bytenr corresponds to a data extent, then the inode list + * (each member describing inode number, file offset and root) is not + * added to each reference added to the @refs ulist. + */ + bool skip_inode_ref_list; /* A valid transaction handle or NULL. */ struct btrfs_trans_handle *trans; /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 09b1988d1791..59a06499c647 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3422,7 +3422,7 @@ int add_data_references(struct reloc_control *rc, btrfs_release_path(path); ctx.bytenr = extent_key->objectid; - ctx.ignore_extent_item_pos = true; + ctx.skip_inode_ref_list = true; ctx.fs_info = rc->extent_root->fs_info; ret = btrfs_find_all_leafs(&ctx); -- cgit