From 3f6d810665dfde0d33785420618ceb03fba0619d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:23:40 -0500 Subject: libfs: Re-arrange locking in offset_iterate_dir() Liam and Matthew say that once the RCU read lock is released, xa_state is not safe to re-use for the next xas_find() call. But the RCU read lock must be released on each loop iteration so that dput(), which might_sleep(), can be called safely. Thus we are forced to walk the offset tree with fresh state for each directory entry. xa_find() can do this for us, though it might be a little less efficient than maintaining xa_state locally. We believe that in the current code base, inode->i_rwsem provides protection for the xa_state maintained in offset_iterate_dir(). However, there is no guarantee that will continue to be the case in the future. Since offset_iterate_dir() doesn't build xa_state locally any more, there's no longer a strong need for offset_find_next(). Clean up by rolling these two helpers together. Suggested-by: Liam R. Howlett Message-ID: <170785993027.11135.8830043889278631735.stgit@91.116.238.104.host.secureserver.net> Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820142021.6328.15047865406275957018.stgit@91.116.238.104.host.secureserver.net Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/libfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/libfs.c') diff --git a/fs/libfs.c b/fs/libfs.c index eec6031b0155..752e24c669d9 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -402,12 +402,13 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, offset, U32_MAX); } -static struct dentry *offset_find_next(struct xa_state *xas) +static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) { struct dentry *child, *found = NULL; + XA_STATE(xas, &octx->xa, offset); rcu_read_lock(); - child = xas_next_entry(xas, U32_MAX); + child = xas_next_entry(&xas, U32_MAX); if (!child) goto out; spin_lock(&child->d_lock); @@ -430,12 +431,11 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) { - struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); - XA_STATE(xas, &so_ctx->xa, ctx->pos); + struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry; while (true) { - dentry = offset_find_next(&xas); + dentry = offset_find_next(octx, ctx->pos); if (!dentry) return ERR_PTR(-ENOENT); @@ -444,8 +444,8 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) break; } + ctx->pos = dentry2offset(dentry) + 1; dput(dentry); - ctx->pos = xas.xa_index + 1; } return NULL; } -- cgit From 7beea725a8ca412c6190090ce7c3a13b169592a1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:23:47 -0500 Subject: libfs: Define a minimum directory offset This value is used in several places, so make it a symbolic constant. Reviewed-by: Jan Kara Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820142741.6328.12428356024575347885.stgit@91.116.238.104.host.secureserver.net Signed-off-by: Christian Brauner --- fs/libfs.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs/libfs.c') diff --git a/fs/libfs.c b/fs/libfs.c index 752e24c669d9..f0045db739df 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -240,6 +240,11 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); +/* 0 is '.', 1 is '..', so always start with offset 2 or more */ +enum { + DIR_OFFSET_MIN = 2, +}; + static void offset_set(struct dentry *dentry, u32 offset) { dentry->d_fsdata = (void *)((uintptr_t)(offset)); @@ -261,9 +266,7 @@ void simple_offset_init(struct offset_ctx *octx) { xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); - - /* 0 is '.', 1 is '..', so always start with offset 2 */ - octx->next_offset = 2; + octx->next_offset = DIR_OFFSET_MIN; } /** @@ -276,7 +279,7 @@ void simple_offset_init(struct offset_ctx *octx) */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { - static const struct xa_limit limit = XA_LIMIT(2, U32_MAX); + static const struct xa_limit limit = XA_LIMIT(DIR_OFFSET_MIN, U32_MAX); u32 offset; int ret; @@ -481,7 +484,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx) return 0; /* In this case, ->private_data is protected by f_pos_lock */ - if (ctx->pos == 2) + if (ctx->pos == DIR_OFFSET_MIN) file->private_data = NULL; else if (file->private_data == ERR_PTR(-ENOENT)) return 0; -- cgit From ecba88a3b32d733d41e27973e25b2bc580f64281 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:23:54 -0500 Subject: libfs: Add simple_offset_empty() For simple filesystems that use directory offset mapping, rely strictly on the directory offset map to tell when a directory has no children. After this patch is applied, the emptiness test holds only the RCU read lock when the directory being tested has no children. In addition, this adds another layer of confirmation that simple_offset_add/remove() are working as expected. Reviewed-by: Jan Kara Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820143463.6328.7872919188371286951.stgit@91.116.238.104.host.secureserver.net Signed-off-by: Christian Brauner --- fs/libfs.c | 32 ++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + mm/shmem.c | 4 ++-- 3 files changed, 35 insertions(+), 2 deletions(-) (limited to 'fs/libfs.c') diff --git a/fs/libfs.c b/fs/libfs.c index f0045db739df..f7f92a49a418 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -313,6 +313,38 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) offset_set(dentry, 0); } +/** + * simple_offset_empty - Check if a dentry can be unlinked + * @dentry: dentry to be tested + * + * Returns 0 if @dentry is a non-empty directory; otherwise returns 1. + */ +int simple_offset_empty(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct offset_ctx *octx; + struct dentry *child; + unsigned long index; + int ret = 1; + + if (!inode || !S_ISDIR(inode->i_mode)) + return ret; + + index = DIR_OFFSET_MIN; + octx = inode->i_op->get_offset_ctx(inode); + xa_for_each(&octx->xa, index, child) { + spin_lock(&child->d_lock); + if (simple_positive(child)) { + spin_unlock(&child->d_lock); + ret = 0; + break; + } + spin_unlock(&child->d_lock); + } + + return ret; +} + /** * simple_offset_rename_exchange - exchange rename with directory offsets * @old_dir: parent of dentry being moved diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..03d141809a2c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3267,6 +3267,7 @@ struct offset_ctx { void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); +int simple_offset_empty(struct dentry *dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, diff --git a/mm/shmem.c b/mm/shmem.c index d7c84ff62186..6fed524343cb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3374,7 +3374,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { - if (!simple_empty(dentry)) + if (!simple_offset_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); @@ -3431,7 +3431,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - if (!simple_empty(new_dentry)) + if (!simple_offset_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { -- cgit From 0e4a862174f2a8d1653a8a9cf0815020e1d3af24 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:24:16 -0500 Subject: libfs: Convert simple directory offsets to use a Maple Tree Test robot reports: > kernel test robot noticed a -19.0% regression of aim9.disk_src.ops_per_sec on: > > commit: a2e459555c5f9da3e619b7e47a63f98574dc75f1 ("shmem: stable directory offsets") > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master Feng Tang further clarifies that: > ... the new simple_offset_add() > called by shmem_mknod() brings extra cost related with slab, > specifically the 'radix_tree_node', which cause the regression. Willy's analysis is that, over time, the test workload causes xa_alloc_cyclic() to fragment the underlying SLAB cache. This patch replaces the offset_ctx's xarray with a Maple Tree in the hope that Maple Tree's dense node mode will handle this scenario more scalably. In addition, we can widen the simple directory offset maximum to signed long (as loff_t is also signed). Suggested-by: Matthew Wilcox Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202309081306.3ecb3734-oliver.sang@intel.com Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820145616.6328.12620992971699079156.stgit@91.116.238.104.host.secureserver.net Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/libfs.c | 47 +++++++++++++++++++++++------------------------ include/linux/fs.h | 5 +++-- 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'fs/libfs.c') diff --git a/fs/libfs.c b/fs/libfs.c index f7f92a49a418..d3d31197c8e4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -245,17 +245,17 @@ enum { DIR_OFFSET_MIN = 2, }; -static void offset_set(struct dentry *dentry, u32 offset) +static void offset_set(struct dentry *dentry, long offset) { - dentry->d_fsdata = (void *)((uintptr_t)(offset)); + dentry->d_fsdata = (void *)offset; } -static u32 dentry2offset(struct dentry *dentry) +static long dentry2offset(struct dentry *dentry) { - return (u32)((uintptr_t)(dentry->d_fsdata)); + return (long)dentry->d_fsdata; } -static struct lock_class_key simple_offset_xa_lock; +static struct lock_class_key simple_offset_lock_class; /** * simple_offset_init - initialize an offset_ctx @@ -264,8 +264,8 @@ static struct lock_class_key simple_offset_xa_lock; */ void simple_offset_init(struct offset_ctx *octx) { - xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); - lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); + mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE); + lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class); octx->next_offset = DIR_OFFSET_MIN; } @@ -274,20 +274,19 @@ void simple_offset_init(struct offset_ctx *octx) * @octx: directory offset ctx to be updated * @dentry: new dentry being added * - * Returns zero on success. @so_ctx and the dentry offset are updated. + * Returns zero on success. @octx and the dentry's offset are updated. * Otherwise, a negative errno value is returned. */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { - static const struct xa_limit limit = XA_LIMIT(DIR_OFFSET_MIN, U32_MAX); - u32 offset; + unsigned long offset; int ret; if (dentry2offset(dentry) != 0) return -EBUSY; - ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit, - &octx->next_offset, GFP_KERNEL); + ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, + LONG_MAX, &octx->next_offset, GFP_KERNEL); if (ret < 0) return ret; @@ -303,13 +302,13 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) */ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) { - u32 offset; + long offset; offset = dentry2offset(dentry); if (offset == 0) return; - xa_erase(&octx->xa, offset); + mtree_erase(&octx->mt, offset); offset_set(dentry, 0); } @@ -332,7 +331,7 @@ int simple_offset_empty(struct dentry *dentry) index = DIR_OFFSET_MIN; octx = inode->i_op->get_offset_ctx(inode); - xa_for_each(&octx->xa, index, child) { + mt_for_each(&octx->mt, child, index, LONG_MAX) { spin_lock(&child->d_lock); if (simple_positive(child)) { spin_unlock(&child->d_lock); @@ -362,8 +361,8 @@ int simple_offset_rename_exchange(struct inode *old_dir, { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); - u32 old_index = dentry2offset(old_dentry); - u32 new_index = dentry2offset(new_dentry); + long old_index = dentry2offset(old_dentry); + long new_index = dentry2offset(new_dentry); int ret; simple_offset_remove(old_ctx, old_dentry); @@ -389,9 +388,9 @@ int simple_offset_rename_exchange(struct inode *old_dir, out_restore: offset_set(old_dentry, old_index); - xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL); + mtree_store(&old_ctx->mt, old_index, old_dentry, GFP_KERNEL); offset_set(new_dentry, new_index); - xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL); + mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL); return ret; } @@ -404,7 +403,7 @@ out_restore: */ void simple_offset_destroy(struct offset_ctx *octx) { - xa_destroy(&octx->xa); + mtree_destroy(&octx->mt); } /** @@ -434,16 +433,16 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) /* In this case, ->private_data is protected by f_pos_lock */ file->private_data = NULL; - return vfs_setpos(file, offset, U32_MAX); + return vfs_setpos(file, offset, LONG_MAX); } static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) { + MA_STATE(mas, &octx->mt, offset, offset); struct dentry *child, *found = NULL; - XA_STATE(xas, &octx->xa, offset); rcu_read_lock(); - child = xas_next_entry(&xas, U32_MAX); + child = mas_find(&mas, LONG_MAX); if (!child) goto out; spin_lock(&child->d_lock); @@ -457,8 +456,8 @@ out: static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { - u32 offset = dentry2offset(dentry); struct inode *inode = d_inode(dentry); + long offset = dentry2offset(dentry); return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, inode->i_ino, fs_umode_to_dtype(inode->i_mode)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 03d141809a2c..55144c12ee0f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -3260,8 +3261,8 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); struct offset_ctx { - struct xarray xa; - u32 next_offset; + struct maple_tree mt; + unsigned long next_offset; }; void simple_offset_init(struct offset_ctx *octx); -- cgit