From 6f692b1672bdd279832a1c5227afb58fbbfbd0be Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Tue, 9 Jul 2024 16:43:01 -0400
Subject: bcachefs: Fix RCU splat

Reported-by: syzbot+e74fea078710bbca6f4b@syzkaller.appspotmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/buckets.c | 2 +-
 fs/bcachefs/buckets.h | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 743d57eba760..314ee3e0187f 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -805,7 +805,7 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
 			"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
 			"while marking %s",
 			ptr->dev, bucket_nr, b_gen,
-			*bucket_gen(ca, bucket_nr),
+			bucket_gen_get(ca, bucket_nr),
 			bch2_data_type_str(bucket_data_type ?: ptr_data_type),
 			ptr->gen,
 			(printbuf_reset(&buf),
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 80ee0be9793e..8ad4be73860c 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -116,6 +116,14 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
 	return gens->b + b;
 }
 
+static inline u8 bucket_gen_get(struct bch_dev *ca, size_t b)
+{
+	rcu_read_lock();
+	u8 gen = *bucket_gen(ca, b);
+	rcu_read_unlock();
+	return gen;
+}
+
 static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
 				   const struct bch_extent_ptr *ptr)
 {
-- 
cgit 


From fd80d14005250652e7500a97eedd68797de0b797 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 10 Jul 2024 12:59:28 -0400
Subject: bcachefs: fix scheduling while atomic in break_cycle()

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_locking.c |  2 +-
 fs/bcachefs/util.c          | 25 ++++++++++++++++++++++---
 fs/bcachefs/util.h          |  1 +
 3 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index d66fff22109a..191d9a50378a 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -231,7 +231,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
 			prt_newline(&buf);
 		}
 
-		bch2_print_string_as_lines(KERN_ERR, buf.buf);
+		bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
 		printbuf_exit(&buf);
 		BUG();
 	}
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index de331dec2a99..4ec7e44d6e36 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -252,8 +252,10 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v)
 	bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
 }
 
-void bch2_print_string_as_lines(const char *prefix, const char *lines)
+static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
+					 bool nonblocking)
 {
+	bool locked = false;
 	const char *p;
 
 	if (!lines) {
@@ -261,7 +263,13 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
 		return;
 	}
 
-	console_lock();
+	if (!nonblocking) {
+		console_lock();
+		locked = true;
+	} else {
+		locked = console_trylock();
+	}
+
 	while (1) {
 		p = strchrnul(lines, '\n');
 		printk("%s%.*s\n", prefix, (int) (p - lines), lines);
@@ -269,7 +277,18 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
 			break;
 		lines = p + 1;
 	}
-	console_unlock();
+	if (locked)
+		console_unlock();
+}
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
+{
+	return __bch2_print_string_as_lines(prefix, lines, false);
+}
+
+void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines)
+{
+	return __bch2_print_string_as_lines(prefix, lines, true);
 }
 
 int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 5d2c470a49ac..5b0533ec4c7e 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -315,6 +315,7 @@ void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
 void bch2_prt_u64_base2(struct printbuf *, u64);
 
 void bch2_print_string_as_lines(const char *prefix, const char *lines);
+void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines);
 
 typedef DARRAY(unsigned long) bch_stacktrace;
 int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
-- 
cgit 


From 68a3ebd18bc8c4e766250d5c43d30ff75b8aec0b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 3 Jul 2024 11:07:26 +0100
Subject: btrfs: use delayed iput during extent map shrinking

When putting an inode during extent map shrinking we're doing a standard
iput() but that may take a long time in case the inode is dirty and we are
doing the final iput that triggers eviction - the VFS will have to wait
for writeback before calling the btrfs evict callback (see
fs/inode.c:evict()).

This slows down the task running the shrinker which may have been
triggered while updating some tree for example, meaning locks are held
as well as an open transaction handle.

Also if the iput() ends up triggering eviction and the inode has no links
anymore, then we trigger item truncation which requires flushing delayed
items, space reservation to start a transaction and that may trigger the
space reclaim task and wait for it, resulting in deadlocks in case the
reclaim task needs for example to commit a transaction and the shrinker
is being triggered from a path holding a transaction handle.

Syzbot reported such a case with the following stack traces:

   ======================================================
   WARNING: possible circular locking dependency detected
   6.10.0-rc2-syzkaller-00010-g2ab795141095 #0 Not tainted
   ------------------------------------------------------
   kswapd0/111 is trying to acquire lock:
   ffff88801eae4610 (sb_internal#3){.+.+}-{0:0}, at: btrfs_commit_inode_delayed_inode+0x110/0x330 fs/btrfs/delayed-inode.c:1275

   but task is already holding lock:
   ffffffff8dd3a9a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0xa88/0x1970 mm/vmscan.c:6924

   which lock already depends on the new lock.

   the existing dependency chain (in reverse order) is:

   -> #3 (fs_reclaim){+.+.}-{0:0}:
          __fs_reclaim_acquire mm/page_alloc.c:3783 [inline]
          fs_reclaim_acquire+0x102/0x160 mm/page_alloc.c:3797
          might_alloc include/linux/sched/mm.h:334 [inline]
          slab_pre_alloc_hook mm/slub.c:3890 [inline]
          slab_alloc_node mm/slub.c:3980 [inline]
          kmem_cache_alloc_lru_noprof+0x58/0x2f0 mm/slub.c:4019
          btrfs_alloc_inode+0x118/0xb20 fs/btrfs/inode.c:8411
          alloc_inode+0x5d/0x230 fs/inode.c:261
          iget5_locked fs/inode.c:1235 [inline]
          iget5_locked+0x1c9/0x2c0 fs/inode.c:1228
          btrfs_iget_locked fs/btrfs/inode.c:5590 [inline]
          btrfs_iget_path fs/btrfs/inode.c:5607 [inline]
          btrfs_iget+0xfb/0x230 fs/btrfs/inode.c:5636
          create_reloc_inode+0x403/0x820 fs/btrfs/relocation.c:3911
          btrfs_relocate_block_group+0x471/0xe60 fs/btrfs/relocation.c:4114
          btrfs_relocate_chunk+0x143/0x450 fs/btrfs/volumes.c:3373
          __btrfs_balance fs/btrfs/volumes.c:4157 [inline]
          btrfs_balance+0x211a/0x3f00 fs/btrfs/volumes.c:4534
          btrfs_ioctl_balance fs/btrfs/ioctl.c:3675 [inline]
          btrfs_ioctl+0x12ed/0x8290 fs/btrfs/ioctl.c:4742
          __do_compat_sys_ioctl+0x2c3/0x330 fs/ioctl.c:1007
          do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline]
          __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386
          do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411
          entry_SYSENTER_compat_after_hwframe+0x84/0x8e

   -> #2 (btrfs_trans_num_extwriters){++++}-{0:0}:
          join_transaction+0x164/0xf40 fs/btrfs/transaction.c:315
          start_transaction+0x427/0x1a70 fs/btrfs/transaction.c:700
          btrfs_rebuild_free_space_tree+0xaa/0x480 fs/btrfs/free-space-tree.c:1323
          btrfs_start_pre_rw_mount+0x218/0xf60 fs/btrfs/disk-io.c:2999
          open_ctree+0x41ab/0x52e0 fs/btrfs/disk-io.c:3554
          btrfs_fill_super fs/btrfs/super.c:946 [inline]
          btrfs_get_tree_super fs/btrfs/super.c:1863 [inline]
          btrfs_get_tree+0x11e9/0x1b90 fs/btrfs/super.c:2089
          vfs_get_tree+0x8f/0x380 fs/super.c:1780
          fc_mount+0x16/0xc0 fs/namespace.c:1125
          btrfs_get_tree_subvol fs/btrfs/super.c:2052 [inline]
          btrfs_get_tree+0xa53/0x1b90 fs/btrfs/super.c:2090
          vfs_get_tree+0x8f/0x380 fs/super.c:1780
          do_new_mount fs/namespace.c:3352 [inline]
          path_mount+0x6e1/0x1f10 fs/namespace.c:3679
          do_mount fs/namespace.c:3692 [inline]
          __do_sys_mount fs/namespace.c:3898 [inline]
          __se_sys_mount fs/namespace.c:3875 [inline]
          __ia32_sys_mount+0x295/0x320 fs/namespace.c:3875
          do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline]
          __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386
          do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411
          entry_SYSENTER_compat_after_hwframe+0x84/0x8e

   -> #1 (btrfs_trans_num_writers){++++}-{0:0}:
          join_transaction+0x148/0xf40 fs/btrfs/transaction.c:314
          start_transaction+0x427/0x1a70 fs/btrfs/transaction.c:700
          btrfs_rebuild_free_space_tree+0xaa/0x480 fs/btrfs/free-space-tree.c:1323
          btrfs_start_pre_rw_mount+0x218/0xf60 fs/btrfs/disk-io.c:2999
          open_ctree+0x41ab/0x52e0 fs/btrfs/disk-io.c:3554
          btrfs_fill_super fs/btrfs/super.c:946 [inline]
          btrfs_get_tree_super fs/btrfs/super.c:1863 [inline]
          btrfs_get_tree+0x11e9/0x1b90 fs/btrfs/super.c:2089
          vfs_get_tree+0x8f/0x380 fs/super.c:1780
          fc_mount+0x16/0xc0 fs/namespace.c:1125
          btrfs_get_tree_subvol fs/btrfs/super.c:2052 [inline]
          btrfs_get_tree+0xa53/0x1b90 fs/btrfs/super.c:2090
          vfs_get_tree+0x8f/0x380 fs/super.c:1780
          do_new_mount fs/namespace.c:3352 [inline]
          path_mount+0x6e1/0x1f10 fs/namespace.c:3679
          do_mount fs/namespace.c:3692 [inline]
          __do_sys_mount fs/namespace.c:3898 [inline]
          __se_sys_mount fs/namespace.c:3875 [inline]
          __ia32_sys_mount+0x295/0x320 fs/namespace.c:3875
          do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline]
          __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386
          do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411
          entry_SYSENTER_compat_after_hwframe+0x84/0x8e

   -> #0 (sb_internal#3){.+.+}-{0:0}:
          check_prev_add kernel/locking/lockdep.c:3134 [inline]
          check_prevs_add kernel/locking/lockdep.c:3253 [inline]
          validate_chain kernel/locking/lockdep.c:3869 [inline]
          __lock_acquire+0x2478/0x3b30 kernel/locking/lockdep.c:5137
          lock_acquire kernel/locking/lockdep.c:5754 [inline]
          lock_acquire+0x1b1/0x560 kernel/locking/lockdep.c:5719
          percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
          __sb_start_write include/linux/fs.h:1655 [inline]
          sb_start_intwrite include/linux/fs.h:1838 [inline]
          start_transaction+0xbc1/0x1a70 fs/btrfs/transaction.c:694
          btrfs_commit_inode_delayed_inode+0x110/0x330 fs/btrfs/delayed-inode.c:1275
          btrfs_evict_inode+0x960/0xe80 fs/btrfs/inode.c:5291
          evict+0x2ed/0x6c0 fs/inode.c:667
          iput_final fs/inode.c:1741 [inline]
          iput.part.0+0x5a8/0x7f0 fs/inode.c:1767
          iput+0x5c/0x80 fs/inode.c:1757
          btrfs_scan_root fs/btrfs/extent_map.c:1118 [inline]
          btrfs_free_extent_maps+0xbd3/0x1320 fs/btrfs/extent_map.c:1189
          super_cache_scan+0x409/0x550 fs/super.c:227
          do_shrink_slab+0x44f/0x11c0 mm/shrinker.c:435
          shrink_slab+0x18a/0x1310 mm/shrinker.c:662
          shrink_one+0x493/0x7c0 mm/vmscan.c:4790
          shrink_many mm/vmscan.c:4851 [inline]
          lru_gen_shrink_node+0x89f/0x1750 mm/vmscan.c:4951
          shrink_node mm/vmscan.c:5910 [inline]
          kswapd_shrink_node mm/vmscan.c:6720 [inline]
          balance_pgdat+0x1105/0x1970 mm/vmscan.c:6911
          kswapd+0x5ea/0xbf0 mm/vmscan.c:7180
          kthread+0x2c1/0x3a0 kernel/kthread.c:389
          ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147
          ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244

   other info that might help us debug this:

   Chain exists of:
     sb_internal#3 --> btrfs_trans_num_extwriters --> fs_reclaim

    Possible unsafe locking scenario:

          CPU0                    CPU1
          ----                    ----
     lock(fs_reclaim);
                                  lock(btrfs_trans_num_extwriters);
                                  lock(fs_reclaim);
     rlock(sb_internal#3);

    *** DEADLOCK ***

   2 locks held by kswapd0/111:
    #0: ffffffff8dd3a9a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0xa88/0x1970 mm/vmscan.c:6924
    #1: ffff88801eae40e0 (&type->s_umount_key#62){++++}-{3:3}, at: super_trylock_shared fs/super.c:562 [inline]
    #1: ffff88801eae40e0 (&type->s_umount_key#62){++++}-{3:3}, at: super_cache_scan+0x96/0x550 fs/super.c:196

   stack backtrace:
   CPU: 0 PID: 111 Comm: kswapd0 Not tainted 6.10.0-rc2-syzkaller-00010-g2ab795141095 #0
   Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
   Call Trace:
    <TASK>
    __dump_stack lib/dump_stack.c:88 [inline]
    dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:114
    check_noncircular+0x31a/0x400 kernel/locking/lockdep.c:2187
    check_prev_add kernel/locking/lockdep.c:3134 [inline]
    check_prevs_add kernel/locking/lockdep.c:3253 [inline]
    validate_chain kernel/locking/lockdep.c:3869 [inline]
    __lock_acquire+0x2478/0x3b30 kernel/locking/lockdep.c:5137
    lock_acquire kernel/locking/lockdep.c:5754 [inline]
    lock_acquire+0x1b1/0x560 kernel/locking/lockdep.c:5719
    percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
    __sb_start_write include/linux/fs.h:1655 [inline]
    sb_start_intwrite include/linux/fs.h:1838 [inline]
    start_transaction+0xbc1/0x1a70 fs/btrfs/transaction.c:694
    btrfs_commit_inode_delayed_inode+0x110/0x330 fs/btrfs/delayed-inode.c:1275
    btrfs_evict_inode+0x960/0xe80 fs/btrfs/inode.c:5291
    evict+0x2ed/0x6c0 fs/inode.c:667
    iput_final fs/inode.c:1741 [inline]
    iput.part.0+0x5a8/0x7f0 fs/inode.c:1767
    iput+0x5c/0x80 fs/inode.c:1757
    btrfs_scan_root fs/btrfs/extent_map.c:1118 [inline]
    btrfs_free_extent_maps+0xbd3/0x1320 fs/btrfs/extent_map.c:1189
    super_cache_scan+0x409/0x550 fs/super.c:227
    do_shrink_slab+0x44f/0x11c0 mm/shrinker.c:435
    shrink_slab+0x18a/0x1310 mm/shrinker.c:662
    shrink_one+0x493/0x7c0 mm/vmscan.c:4790
    shrink_many mm/vmscan.c:4851 [inline]
    lru_gen_shrink_node+0x89f/0x1750 mm/vmscan.c:4951
    shrink_node mm/vmscan.c:5910 [inline]
    kswapd_shrink_node mm/vmscan.c:6720 [inline]
    balance_pgdat+0x1105/0x1970 mm/vmscan.c:6911
    kswapd+0x5ea/0xbf0 mm/vmscan.c:7180
    kthread+0x2c1/0x3a0 kernel/kthread.c:389
    ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147
    ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
    </TASK>

So fix this by using btrfs_add_delayed_iput() so that the final iput is
delegated to the cleaner kthread.

Link: https://lore.kernel.org/linux-btrfs/000000000000892280061a344581@google.com/
Reported-by: syzbot+3dad89b3993a4b275e72@syzkaller.appspotmail.com
Fixes: 956a17d9d050 ("btrfs: add a shrinker for extent maps")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 744e8952abb0..cb74d382a24f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1115,7 +1115,7 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
 
 		min_ino = btrfs_ino(inode) + 1;
 		fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
-		iput(&inode->vfs_inode);
+		btrfs_add_delayed_iput(inode);
 
 		if (*scanned >= nr_to_scan)
 			break;
-- 
cgit 


From b3ebb9b7e92a928344a7a2c1f8514474bfa113cf Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 8 Jul 2024 15:42:44 +0100
Subject: btrfs: stop extent map shrinker if reschedule is needed

The extent map shrinker can be called in a variety of contexts where we
are under memory pressure, and of them is when a task is trying to
allocate memory. For this reason the shrinker is typically called with a
value of struct shrink_control::nr_to_scan that is much smaller than what
we return in the nr_cached_objects callback of struct super_operations
(fs/btrfs/super.c:btrfs_nr_cached_objects()), so that the shrinker does
not take a long time and cause high latencies. However we can still take
a lot of time in the shrinker even for a limited amount of nr_to_scan:

1) When traversing the red black tree that tracks open inodes in a root,
   as for example with millions of open inodes we get a deep tree which
   takes time searching for an inode;

2) Iterating over the extent map tree, which is a red black tree, of an
   inode when doing the rb_next() calls and when removing an extent map
   from the tree, since often that requires rebalancing the red black
   tree;

3) When trying to write lock an inode's extent map tree we may wait for a
   significant amount of time, because there's either another task about
   to do IO and searching for an extent map in the tree or inserting an
   extent map in the tree, and we can have thousands or even millions of
   extent maps for an inode. Furthermore, there can be concurrent calls
   to the shrinker so the lock might be busy simply because there is
   already another task shrinking extent maps for the same inode;

4) We often reschedule if we need to, which further increases latency.

So improve on this by stopping the extent map shrinking code whenever we
need to reschedule and make it skip an inode if we can't immediately lock
its extent map tree.

Reported-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
Reported-by: Andrea Gelmini <andrea.gelmini@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CABXGCsMmmb36ym8hVNGTiU8yfUS_cGvoUmGCcBrGWq9OxTrs+A@mail.gmail.com/
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_map.c | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index cb74d382a24f..887a0e5dc145 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1057,7 +1057,18 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
 	if (!down_read_trylock(&inode->i_mmap_lock))
 		return 0;
 
-	write_lock(&tree->lock);
+	/*
+	 * We want to be fast because we can be called from any path trying to
+	 * allocate memory, so if the lock is busy we don't want to spend time
+	 * waiting for it - either some task is about to do IO for the inode or
+	 * we may have another task shrinking extent maps, here in this code, so
+	 * skip this inode.
+	 */
+	if (!write_trylock(&tree->lock)) {
+		up_read(&inode->i_mmap_lock);
+		return 0;
+	}
+
 	node = rb_first_cached(&tree->map);
 	while (node) {
 		struct extent_map *em;
@@ -1089,12 +1100,14 @@ next:
 			break;
 
 		/*
-		 * Restart if we had to reschedule, and any extent maps that were
-		 * pinned before may have become unpinned after we released the
-		 * lock and took it again.
+		 * Stop if we need to reschedule or there's contention on the
+		 * lock. This is to avoid slowing other tasks trying to take the
+		 * lock and because the shrinker might be called during a memory
+		 * allocation path and we want to avoid taking a very long time
+		 * and slowing down all sorts of tasks.
 		 */
-		if (cond_resched_rwlock_write(&tree->lock))
-			node = rb_first_cached(&tree->map);
+		if (need_resched() || rwlock_needbreak(&tree->lock))
+			break;
 	}
 	write_unlock(&tree->lock);
 	up_read(&inode->i_mmap_lock);
@@ -1120,7 +1133,13 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
 		if (*scanned >= nr_to_scan)
 			break;
 
-		cond_resched();
+		/*
+		 * We may be called from memory allocation paths, so we don't
+		 * want to take too much time and slowdown tasks.
+		 */
+		if (need_resched())
+			break;
+
 		inode = btrfs_find_first_inode(root, min_ino);
 	}
 
@@ -1159,7 +1178,11 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
 	}
 
-	while (scanned < nr_to_scan) {
+	/*
+	 * We may be called from memory allocation paths, so we don't want to
+	 * take too much time and slowdown tasks, so stop if we need reschedule.
+	 */
+	while (scanned < nr_to_scan && !need_resched()) {
 		struct btrfs_root *root;
 		unsigned long count;
 
-- 
cgit 


From 4484940514295b75389f94787f8e179ba6255353 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 8 Jul 2024 15:42:45 +0100
Subject: btrfs: avoid races when tracking progress for extent map shrinking

We store the progress (root and inode numbers) of the extent map shrinker
in fs_info without any synchronization but we can have multiple tasks
calling into the shrinker during memory allocations when there's enough
memory pressure for example.

This can result in a task A reading fs_info->extent_map_shrinker_last_ino
after another task B updates it, and task A reading
fs_info->extent_map_shrinker_last_root before task B updates it, making
task A see an odd state that isn't necessarily harmful but may make it
skip certain inode ranges or do more work than necessary by going over
the same inodes again. These unprotected accesses would also trigger
warnings from tools like KCSAN.

So add a lock to protect access to these progress fields.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c           |  2 ++
 fs/btrfs/extent_map.c        | 84 +++++++++++++++++++++++++++++++++-----------
 fs/btrfs/fs.h                |  1 +
 include/trace/events/btrfs.h | 18 +++++-----
 4 files changed, 76 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 242ada7e47b4..1d93fb02fce2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2856,6 +2856,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
 	if (ret)
 		return ret;
 
+	spin_lock_init(&fs_info->extent_map_shrinker_lock);
+
 	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
 	if (ret)
 		return ret;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 887a0e5dc145..b4c9a6aa118c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1028,7 +1028,14 @@ out_free_pre:
 	return ret;
 }
 
-static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
+struct btrfs_em_shrink_ctx {
+	long nr_to_scan;
+	long scanned;
+	u64 last_ino;
+	u64 last_root;
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
 {
 	const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
 	struct extent_map_tree *tree = &inode->extent_tree;
@@ -1075,7 +1082,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
 
 		em = rb_entry(node, struct extent_map, rb_node);
 		node = rb_next(node);
-		(*scanned)++;
+		ctx->scanned++;
 
 		if (em->flags & EXTENT_FLAG_PINNED)
 			goto next;
@@ -1096,7 +1103,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
 		free_extent_map(em);
 		nr_dropped++;
 next:
-		if (*scanned >= nr_to_scan)
+		if (ctx->scanned >= ctx->nr_to_scan)
 			break;
 
 		/*
@@ -1115,22 +1122,21 @@ next:
 	return nr_dropped;
 }
 
-static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_inode *inode;
 	long nr_dropped = 0;
-	u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
+	u64 min_ino = ctx->last_ino + 1;
 
 	inode = btrfs_find_first_inode(root, min_ino);
 	while (inode) {
-		nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
+		nr_dropped += btrfs_scan_inode(inode, ctx);
 
 		min_ino = btrfs_ino(inode) + 1;
-		fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
+		ctx->last_ino = btrfs_ino(inode);
 		btrfs_add_delayed_iput(inode);
 
-		if (*scanned >= nr_to_scan)
+		if (ctx->scanned >= ctx->nr_to_scan)
 			break;
 
 		/*
@@ -1151,14 +1157,14 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
 		 * inode if there is one or we will find out this was the last
 		 * one and move to the next root.
 		 */
-		fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
+		ctx->last_root = btrfs_root_id(root);
 	} else {
 		/*
 		 * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
 		 * that when processing the next root we start from its first inode.
 		 */
-		fs_info->extent_map_shrinker_last_ino = 0;
-		fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
+		ctx->last_ino = 0;
+		ctx->last_root = btrfs_root_id(root) + 1;
 	}
 
 	return nr_dropped;
@@ -1166,23 +1172,41 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
 
 long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 {
-	const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
-	u64 next_root_id = start_root_id;
+	struct btrfs_em_shrink_ctx ctx;
+	u64 start_root_id;
+	u64 next_root_id;
 	bool cycled = false;
 	long nr_dropped = 0;
-	long scanned = 0;
+
+	ctx.scanned = 0;
+	ctx.nr_to_scan = nr_to_scan;
+
+	/*
+	 * In case we have multiple tasks running this shrinker, make the next
+	 * one start from the next inode in case it starts before we finish.
+	 */
+	spin_lock(&fs_info->extent_map_shrinker_lock);
+	ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+	fs_info->extent_map_shrinker_last_ino++;
+	ctx.last_root = fs_info->extent_map_shrinker_last_root;
+	spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+	start_root_id = ctx.last_root;
+	next_root_id = ctx.last_root;
 
 	if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
+		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+							   nr, ctx.last_root,
+							   ctx.last_ino);
 	}
 
 	/*
 	 * We may be called from memory allocation paths, so we don't want to
 	 * take too much time and slowdown tasks, so stop if we need reschedule.
 	 */
-	while (scanned < nr_to_scan && !need_resched()) {
+	while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
 		struct btrfs_root *root;
 		unsigned long count;
 
@@ -1194,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 			spin_unlock(&fs_info->fs_roots_radix_lock);
 			if (start_root_id > 0 && !cycled) {
 				next_root_id = 0;
-				fs_info->extent_map_shrinker_last_root = 0;
-				fs_info->extent_map_shrinker_last_ino = 0;
+				ctx.last_root = 0;
+				ctx.last_ino = 0;
 				cycled = true;
 				continue;
 			}
@@ -1209,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 			continue;
 
 		if (is_fstree(btrfs_root_id(root)))
-			nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
+			nr_dropped += btrfs_scan_root(root, &ctx);
 
 		btrfs_put_root(root);
 	}
 
+	/*
+	 * In case of multiple tasks running this extent map shrinking code this
+	 * isn't perfect but it's simple and silences things like KCSAN. It's
+	 * not possible to know which task made more progress because we can
+	 * cycle back to the first root and first inode if it's not the first
+	 * time the shrinker ran, see the above logic. Also a task that started
+	 * later may finish ealier than another task and made less progress. So
+	 * make this simple and update to the progress of the last task that
+	 * finished, with the occasional possiblity of having two consecutive
+	 * runs of the shrinker process the same inodes.
+	 */
+	spin_lock(&fs_info->extent_map_shrinker_lock);
+	fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+	fs_info->extent_map_shrinker_last_root = ctx.last_root;
+	spin_unlock(&fs_info->extent_map_shrinker_lock);
+
 	if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
+		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
+							  nr, ctx.last_root,
+							  ctx.last_ino);
 	}
 
 	return nr_dropped;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 89f0650631cd..833dc3fe0a38 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -630,6 +630,7 @@ struct btrfs_fs_info {
 	s32 delalloc_batch;
 
 	struct percpu_counter evictable_extent_maps;
+	spinlock_t extent_map_shrinker_lock;
 	u64 extent_map_shrinker_last_root;
 	u64 extent_map_shrinker_last_ino;
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index d2d94d7c3fb5..0067e8443d38 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -2556,9 +2556,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count,
 
 TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
 
-	TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr),
+	TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr,
+		 u64 last_root_id, u64 last_ino),
 
-	TP_ARGS(fs_info, nr_to_scan, nr),
+	TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino),
 
 	TP_STRUCT__entry_btrfs(
 		__field(	long,	nr_to_scan	)
@@ -2570,8 +2571,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
 	TP_fast_assign_btrfs(fs_info,
 		__entry->nr_to_scan	= nr_to_scan;
 		__entry->nr		= nr;
-		__entry->last_root_id	= fs_info->extent_map_shrinker_last_root;
-		__entry->last_ino	= fs_info->extent_map_shrinker_last_ino;
+		__entry->last_root_id	= last_root_id;
+		__entry->last_ino	= last_ino;
 	),
 
 	TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
@@ -2581,9 +2582,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
 
 TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
 
-	TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr),
+	TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr,
+		 u64 last_root_id, u64 last_ino),
 
-	TP_ARGS(fs_info, nr_dropped, nr),
+	TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino),
 
 	TP_STRUCT__entry_btrfs(
 		__field(	long,	nr_dropped	)
@@ -2595,8 +2597,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
 	TP_fast_assign_btrfs(fs_info,
 		__entry->nr_dropped	= nr_dropped;
 		__entry->nr		= nr;
-		__entry->last_root_id	= fs_info->extent_map_shrinker_last_root;
-		__entry->last_ino	= fs_info->extent_map_shrinker_last_ino;
+		__entry->last_root_id	= last_root_id;
+		__entry->last_ino	= last_ino;
 	),
 
 	TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
-- 
cgit 


From aacd897d4d75adaae3becc2b00d8cdc9a56928d1 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 11 Jul 2024 20:01:31 -0400
Subject: Revert "bcachefs: Mark bch_inode_info as SLAB_ACCOUNT"

This reverts commit 86d81ec5f5f05846c7c6e48ffb964b24cba2e669.

This wasn't tested with memcg enabled, it immediately hits a null ptr
deref in list_lru_add().

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/fs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 74a1e12a7a14..fa1fee05cf8f 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -2073,8 +2073,7 @@ int __init bch2_vfs_init(void)
 {
 	int ret = -ENOMEM;
 
-	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
-				      SLAB_ACCOUNT);
+	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
 	if (!bch2_inode_cache)
 		goto err;
 
-- 
cgit 


From f0f3e5114871330faf3248e92ceaa4f18602e941 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 11 Jul 2024 16:14:11 -0400
Subject: bcachefs; Use trans_unlock_long() when waiting on allocator

not using unlock_long() blocks key cache reclaim, and the allocator may
take awhile

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/io_misc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 4ec979b4b23e..4583c9386e8c 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -125,7 +125,7 @@ err_noprint:
 	bch2_bkey_buf_exit(&old, c);
 
 	if (closure_nr_remaining(&cl) != 1) {
-		bch2_trans_unlock(trans);
+		bch2_trans_unlock_long(trans);
 		closure_sync(&cl);
 	}
 
-- 
cgit 


From f236ea4bca00ce457e165403a50a8938dced9623 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 3 Jul 2024 20:35:36 -0400
Subject: bcachefs: Set PF_MEMALLOC_NOFS when trans->locked

proper lock ordering is: fs_reclaim -> btree node locks

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_iter.c    |  7 ++++---
 fs/bcachefs/btree_locking.c |  8 +++-----
 fs/bcachefs/btree_locking.h | 22 ++++++++++++++++++++++
 fs/bcachefs/btree_types.h   |  1 +
 4 files changed, 30 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 0ed9e6574fcd..19352a08ea20 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -996,7 +996,7 @@ retry_all:
 
 	bch2_trans_unlock(trans);
 	cond_resched();
-	trans->locked = true;
+	trans_set_locked(trans);
 
 	if (unlikely(trans->memory_allocation_failure)) {
 		struct closure cl;
@@ -3089,7 +3089,8 @@ u32 bch2_trans_begin(struct btree_trans *trans)
 		bch2_trans_srcu_unlock(trans);
 
 	trans->last_begin_ip = _RET_IP_;
-	trans->locked  = true;
+
+	trans_set_locked(trans);
 
 	if (trans->restarted) {
 		bch2_btree_path_traverse_all(trans);
@@ -3159,7 +3160,6 @@ got_trans:
 	trans->last_begin_time	= local_clock();
 	trans->fn_idx		= fn_idx;
 	trans->locking_wait.task = current;
-	trans->locked		= true;
 	trans->journal_replay_not_finished =
 		unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
 		atomic_inc_not_zero(&c->journal_keys.ref);
@@ -3193,6 +3193,7 @@ got_trans:
 	trans->srcu_idx		= srcu_read_lock(&c->btree_trans_barrier);
 	trans->srcu_lock_time	= jiffies;
 	trans->srcu_held	= true;
+	trans_set_locked(trans);
 
 	closure_init_stack_release(&trans->ref);
 	return trans;
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 191d9a50378a..c51826fd557f 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -792,7 +792,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
 			return bch2_trans_relock_fail(trans, path, &f, trace);
 	}
 
-	trans->locked = true;
+	trans_set_locked(trans);
 out:
 	bch2_trans_verify_locks(trans);
 	return 0;
@@ -812,16 +812,14 @@ void bch2_trans_unlock_noassert(struct btree_trans *trans)
 {
 	__bch2_trans_unlock(trans);
 
-	trans->locked = false;
-	trans->last_unlock_ip = _RET_IP_;
+	trans_set_unlocked(trans);
 }
 
 void bch2_trans_unlock(struct btree_trans *trans)
 {
 	__bch2_trans_unlock(trans);
 
-	trans->locked = false;
-	trans->last_unlock_ip = _RET_IP_;
+	trans_set_unlocked(trans);
 }
 
 void bch2_trans_unlock_long(struct btree_trans *trans)
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 7f41545b9147..75a6274c7d27 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -193,6 +193,28 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
 
 /* lock: */
 
+static inline void trans_set_locked(struct btree_trans *trans)
+{
+	if (!trans->locked) {
+		trans->locked = true;
+		trans->last_unlock_ip = 0;
+
+		trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0;
+		current->flags |= PF_MEMALLOC_NOFS;
+	}
+}
+
+static inline void trans_set_unlocked(struct btree_trans *trans)
+{
+	if (trans->locked) {
+		trans->locked = false;
+		trans->last_unlock_ip = _RET_IP_;
+
+		if (!trans->pf_memalloc_nofs)
+			current->flags &= ~PF_MEMALLOC_NOFS;
+	}
+}
+
 static inline int __btree_node_lock_nopath(struct btree_trans *trans,
 					 struct btree_bkey_cached_common *b,
 					 enum six_lock_type type,
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 87f485e9c552..48cb1a7d31c5 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -484,6 +484,7 @@ struct btree_trans {
 	bool			lock_may_not_fail:1;
 	bool			srcu_held:1;
 	bool			locked:1;
+	bool			pf_memalloc_nofs:1;
 	bool			write_locked:1;
 	bool			used_mempool:1;
 	bool			in_traverse_all:1;
-- 
cgit 


From 1841027c7de47527ed819a935b7aa340b9171eb5 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 4 Jul 2024 21:02:16 -0400
Subject: bcachefs: bch2_gc_btree() should not use btree_root_lock

btree_root_lock is for the root keys in btree_root, not the pointers to
the nodes themselves; this fixes a lock ordering issue between
btree_root_lock and btree node locks.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/btree_gc.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 7c72a9e6f511..a0deb8266011 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -641,16 +641,30 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
 		target_depth = 0;
 
 	/* root */
-	mutex_lock(&c->btree_root_lock);
-	struct btree *b = bch2_btree_id_root(c, btree)->b;
-	if (!btree_node_fake(b)) {
+	do {
+retry_root:
+		bch2_trans_begin(trans);
+
+		struct btree_iter iter;
+		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
+					  0, bch2_btree_id_root(c, btree)->b->c.level, 0);
+		struct btree *b = bch2_btree_iter_peek_node(&iter);
+		ret = PTR_ERR_OR_ZERO(b);
+		if (ret)
+			goto err_root;
+
+		if (b != btree_node_root(c, b)) {
+			bch2_trans_iter_exit(trans, &iter);
+			goto retry_root;
+		}
+
 		gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
-		ret = lockrestart_do(trans,
-			bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
-					 NULL, NULL, bkey_i_to_s_c(&b->key), initial));
+		struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+		ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
 		level = b->c.level;
-	}
-	mutex_unlock(&c->btree_root_lock);
+err_root:
+		bch2_trans_iter_exit(trans, &iter);
+	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 
 	if (ret)
 		return ret;
-- 
cgit