aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2024-04-24 16:58:01 +0100
committerDavid Sterba <dsterba@suse.com>2024-07-11 15:33:17 +0200
commit310b2f5d5a9451b708ab1d3385c3b0998084904c (patch)
treedbce9f0a98c81c6ffde07576f91d83d41441d791
parentbbbee460aaaabe5bea6665dd4ce18677f2d154cc (diff)
btrfs: use an xarray to track open inodes in a root
Currently we use a red black tree (rb-tree) to track the currently open inodes of a root (in struct btrfs_root::inode_tree). This however is not very efficient when the number of inodes is large since rb-trees are binary trees. For example for 100K open inodes, the tree has a depth of 17. Besides that, inserting into the tree requires navigating through it and pulling useless cache lines in the process since the red black tree nodes are embedded within the btrfs inode - on the other hand, by being embedded, it requires no extra memory allocations. We can improve this by using an xarray instead, which is efficient when indices are densely clustered (such as inode numbers), is more cache friendly and behaves like a resizable array, with a much better search and insertion complexity than a red black tree. This only has one small disadvantage which is that insertion will sometimes require allocating memory for the xarray - which may fail (not that often since it uses a kmem_cache) - but on the other hand we can reduce the btrfs inode structure size by 24 bytes (from 1080 down to 1056 bytes) after removing the embedded red black tree node, which after the next patches will allow to reduce the size of the structure to 1024 bytes, meaning we will be able to store 4 inodes per 4K page instead of 3 inodes. This change does a straightforward change to use an xarray, and results in a transaction abort if we can't allocate memory for the xarray when creating an inode - but the next patch changes things so that we don't need to abort. Running the following fs_mark test showed some improvements: $ cat test.sh #!/bin/bash DEV=/dev/nullb0 MNT=/mnt/nullb0 MOUNT_OPTIONS="-o ssd" FILES=100000 THREADS=$(nproc --all) echo "performance" | \ tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor mkfs.btrfs -f $DEV mount $MOUNT_OPTIONS $DEV $MNT OPTS="-S 0 -L 5 -n $FILES -s 0 -t $THREADS -k" for ((i = 1; i <= $THREADS; i++)); do OPTS="$OPTS -d $MNT/d$i" done fs_mark $OPTS umount $MNT Before this patch: FSUse% Count Size Files/sec App Overhead 10 1200000 0 92081.6 12505547 16 2400000 0 138222.6 13067072 23 3600000 0 148833.1 13290336 43 4800000 0 97864.7 13931248 53 6000000 0 85597.3 14384313 After this patch: FSUse% Count Size Files/sec App Overhead 10 1200000 0 93225.1 12571078 16 2400000 0 146720.3 12805007 23 3600000 0 160626.4 13073835 46 4800000 0 116286.2 13802927 53 6000000 0 90087.9 14754892 The test was run with a release kernel config (Debian's default config). Also capturing the insertion times into the rb tree and into the xarray, that is measuring the duration of the old function inode_tree_add() and the duration of the new btrfs_add_inode_to_root() function, gave the following results (in nanoseconds): Before this patch, inode_tree_add() execution times: Count: 5000000 Range: 0.000 - 5536887.000; Mean: 775.674; Median: 729.000; Stddev: 4820.961 Percentiles: 90th: 1015.000; 95th: 1139.000; 99th: 1397.000 0.000 - 7.816: 40 | 7.816 - 37.858: 209 | 37.858 - 170.278: 6059 | 170.278 - 753.961: 2754890 ##################################################### 753.961 - 3326.728: 2232312 ########################################### 3326.728 - 14667.018: 4366 | 14667.018 - 64652.943: 852 | 64652.943 - 284981.761: 550 | 284981.761 - 1256150.914: 221 | 1256150.914 - 5536887.000: 7 | After this patch, btrfs_add_inode_to_root() execution times: Count: 5000000 Range: 0.000 - 2900652.000; Mean: 272.148; Median: 241.000; Stddev: 2873.369 Percentiles: 90th: 342.000; 95th: 432.000; 99th: 572.000 0.000 - 7.264: 104 | 7.264 - 33.145: 352 | 33.145 - 140.081: 109606 # 140.081 - 581.930: 4840090 ##################################################### 581.930 - 2407.590: 43532 | 2407.590 - 9950.979: 2245 | 9950.979 - 41119.278: 514 | 41119.278 - 169902.616: 155 | 169902.616 - 702018.539: 47 | 702018.539 - 2900652.000: 9 | Average, percentiles, standard deviation, etc, are all much better. Reviewed-by: Qu Wenruo <wqu@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.h7
-rw-r--r--fs/btrfs/disk-io.c6
-rw-r--r--fs/btrfs/inode.c128
4 files changed, 58 insertions, 86 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ed495ca7a31..1ee3f45d50a0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -165,9 +165,6 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;
- /* node for the red-black tree that links inodes in subvolume root */
- struct rb_node rb_node;
-
unsigned long runtime_flags;
/* full 64 bit generation number, struct vfs_inode doesn't have a big
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c03c58246033..aa2568f86dc9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -222,8 +222,11 @@ struct btrfs_root {
struct list_head root_list;
spinlock_t inode_lock;
- /* red-black tree that keeps track of in-memory inodes */
- struct rb_root inode_tree;
+ /*
+ * Xarray that keeps track of in-memory inodes, protected by the lock
+ * @inode_lock.
+ */
+ struct xarray inodes;
/*
* Xarray that keeps track of delayed nodes of every inode, protected
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 38cdb8875e8e..aa85be9661cc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -662,7 +662,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->free_objectid = 0;
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
- root->inode_tree = RB_ROOT;
+ xa_init(&root->inodes);
xa_init(&root->delayed_nodes);
btrfs_init_root_block_rsv(root);
@@ -1854,7 +1854,8 @@ void btrfs_put_root(struct btrfs_root *root)
return;
if (refcount_dec_and_test(&root->refs)) {
- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ if (WARN_ON(!xa_empty(&root->inodes)))
+ xa_destroy(&root->inodes);
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
@@ -1939,7 +1940,6 @@ static int btrfs_init_btree_inode(struct super_block *sb)
inode->i_mapping->a_ops = &btree_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
IO_TREE_BTREE_INODE_IO);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3a2b902b2d1f..aad4c9daff82 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5493,58 +5493,51 @@ out:
return err;
}
-static void inode_tree_add(struct btrfs_inode *inode)
+static int btrfs_add_inode_to_root(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
- struct btrfs_inode *entry;
- struct rb_node **p;
- struct rb_node *parent;
- struct rb_node *new = &inode->rb_node;
- u64 ino = btrfs_ino(inode);
+ struct btrfs_inode *existing;
+ const u64 ino = btrfs_ino(inode);
+ int ret;
if (inode_unhashed(&inode->vfs_inode))
- return;
- parent = NULL;
+ return 0;
+
+ ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
+ if (ret)
+ return ret;
+
spin_lock(&root->inode_lock);
- p = &root->inode_tree.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct btrfs_inode, rb_node);
+ existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
+ spin_unlock(&root->inode_lock);
- if (ino < btrfs_ino(entry))
- p = &parent->rb_left;
- else if (ino > btrfs_ino(entry))
- p = &parent->rb_right;
- else {
- WARN_ON(!(entry->vfs_inode.i_state &
- (I_WILL_FREE | I_FREEING)));
- rb_replace_node(parent, new, &root->inode_tree);
- RB_CLEAR_NODE(parent);
- spin_unlock(&root->inode_lock);
- return;
- }
+ if (xa_is_err(existing)) {
+ ret = xa_err(existing);
+ ASSERT(ret != -EINVAL);
+ ASSERT(ret != -ENOMEM);
+ return ret;
+ } else if (existing) {
+ WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
}
- rb_link_node(new, parent, p);
- rb_insert_color(new, &root->inode_tree);
- spin_unlock(&root->inode_lock);
+
+ return 0;
}
-static void inode_tree_del(struct btrfs_inode *inode)
+static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
- int empty = 0;
+ struct btrfs_inode *entry;
+ bool empty = false;
spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&inode->rb_node)) {
- rb_erase(&inode->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&inode->rb_node);
- empty = RB_EMPTY_ROOT(&root->inode_tree);
- }
+ entry = xa_erase(&root->inodes, btrfs_ino(inode));
+ if (entry == inode)
+ empty = xa_empty(&root->inodes);
spin_unlock(&root->inode_lock);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
spin_lock(&root->inode_lock);
- empty = RB_EMPTY_ROOT(&root->inode_tree);
+ empty = xa_empty(&root->inodes);
spin_unlock(&root->inode_lock);
if (empty)
btrfs_add_dead_root(root);
@@ -5613,8 +5606,13 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
ret = btrfs_read_locked_inode(inode, path);
if (!ret) {
- inode_tree_add(BTRFS_I(inode));
- unlock_new_inode(inode);
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode));
+ if (ret) {
+ iget_failed(inode);
+ inode = ERR_PTR(ret);
+ } else {
+ unlock_new_inode(inode);
+ }
} else {
iget_failed(inode);
/*
@@ -6426,7 +6424,11 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- inode_tree_add(BTRFS_I(inode));
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
trace_btrfs_inode_new(inode);
btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
@@ -8466,7 +8468,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->ordered_tree_last = NULL;
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
- RB_CLEAR_NODE(&ei->rb_node);
init_rwsem(&ei->i_mmap_lock);
return inode;
@@ -8538,7 +8539,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
}
}
btrfs_qgroup_check_reserved_leak(inode);
- inode_tree_del(inode);
+ btrfs_del_inode_from_root(inode);
btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
btrfs_put_root(inode->root);
@@ -10860,52 +10861,23 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
*/
struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
{
- struct rb_node *node;
- struct rb_node *prev;
struct btrfs_inode *inode;
+ unsigned long from = min_ino;
spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- inode = rb_entry(node, struct btrfs_inode, rb_node);
- if (min_ino < btrfs_ino(inode))
- node = node->rb_left;
- else if (min_ino > btrfs_ino(inode))
- node = node->rb_right;
- else
+ while (true) {
+ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+ if (!inode)
+ break;
+ if (igrab(&inode->vfs_inode))
break;
- }
-
- if (!node) {
- while (prev) {
- inode = rb_entry(prev, struct btrfs_inode, rb_node);
- if (min_ino <= btrfs_ino(inode)) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
-
- while (node) {
- inode = rb_entry(prev, struct btrfs_inode, rb_node);
- if (igrab(&inode->vfs_inode)) {
- spin_unlock(&root->inode_lock);
- return inode;
- }
-
- min_ino = btrfs_ino(inode) + 1;
- if (cond_resched_lock(&root->inode_lock))
- goto again;
- node = rb_next(node);
+ from = btrfs_ino(inode) + 1;
+ cond_resched_lock(&root->inode_lock);
}
spin_unlock(&root->inode_lock);
- return NULL;
+ return inode;
}
static const struct inode_operations btrfs_dir_inode_operations = {