10 files changed, 148 insertions, 42 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 1a66be33bb04..60066822b532 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1924,8 +1924,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 next:
 		if (ret) {
 			/* Refcount held by the reclaim_bgs list after splice. */
-			btrfs_get_block_group(bg);
-			list_add_tail(&bg->bg_list, &retry_list);
+			spin_lock(&fs_info->unused_bgs_lock);
+			/*
+			 * This block group might be added to the unused list
+			 * during the above process. Move it back to the
+			 * reclaim list otherwise.
+			 */
+			if (list_empty(&bg->bg_list)) {
+				btrfs_get_block_group(bg);
+				list_add_tail(&bg->bg_list, &retry_list);
+			}
+			spin_unlock(&fs_info->unused_bgs_lock);
 		}
 		btrfs_put_block_group(bg);
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 38cdb8875e8e..cabb558dbdaa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2856,6 +2856,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
 	if (ret)
 		return ret;
 
+	spin_lock_init(&fs_info->extent_map_shrinker_lock);
+
 	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
 	if (ret)
 		return ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f688fab55251..958155cc43a8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3553,7 +3553,7 @@ err:
 	for (int i = 0; i < num_folios; i++) {
 		if (eb->folios[i]) {
 			detach_extent_buffer_folio(eb, eb->folios[i]);
-			__folio_put(eb->folios[i]);
+			folio_put(eb->folios[i]);
 		}
 	}
 	__free_extent_buffer(eb);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 744e8952abb0..b4c9a6aa118c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1028,7 +1028,14 @@ out_free_pre:
 	return ret;
 }
 
-static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
+struct btrfs_em_shrink_ctx {	/* State carried through one extent map shrinker run. */
+	long nr_to_scan;	/* Scan budget: scanning stops once scanned reaches it. */
+	long scanned;		/* Extent maps visited so far, whether dropped or kept. */
+	u64 last_ino;		/* Number of the last inode scanned; the next lookup starts at +1. */
+	u64 last_root;		/* Root id to resume scanning from. */
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
 {
 	const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
 	struct extent_map_tree *tree = &inode->extent_tree;
@@ -1057,14 +1064,25 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
 	if (!down_read_trylock(&inode->i_mmap_lock))
 		return 0;
 
-	write_lock(&tree->lock);
+	/*
+	 * We want to be fast because we can be called from any path trying to
+	 * allocate memory, so if the lock is busy we don't want to spend time
+	 * waiting for it - either some task is about to do IO for the inode or
+	 * we may have another task shrinking extent maps, here in this code, so
+	 * skip this inode.
+	 */
+	if (!write_trylock(&tree->lock)) {
+		up_read(&inode->i_mmap_lock);
+		return 0;
+	}
+
 	node = rb_first_cached(&tree->map);
 	while (node) {
 		struct extent_map *em;
 
 		em = rb_entry(node, struct extent_map, rb_node);
 		node = rb_next(node);
-		(*scanned)++;
+		ctx->scanned++;
 
 		if (em->flags & EXTENT_FLAG_PINNED)
 			goto next;
@@ -1085,16 +1103,18 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
 		free_extent_map(em);
 		nr_dropped++;
 next:
-		if (*scanned >= nr_to_scan)
+		if (ctx->scanned >= ctx->nr_to_scan)
 			break;
 
 		/*
-		 * Restart if we had to reschedule, and any extent maps that were
-		 * pinned before may have become unpinned after we released the
-		 * lock and took it again.
+		 * Stop if we need to reschedule or there's contention on the
+		 * lock. This is to avoid slowing other tasks trying to take the
+		 * lock and because the shrinker might be called during a memory
+		 * allocation path and we want to avoid taking a very long time
+		 * and slowing down all sorts of tasks.
 		 */
-		if (cond_resched_rwlock_write(&tree->lock))
-			node = rb_first_cached(&tree->map);
+		if (need_resched() || rwlock_needbreak(&tree->lock))
+			break;
 	}
 	write_unlock(&tree->lock);
 	up_read(&inode->i_mmap_lock);
@@ -1102,25 +1122,30 @@ next:
 	return nr_dropped;
 }
 
-static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_inode *inode;
 	long nr_dropped = 0;
-	u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
+	u64 min_ino = ctx->last_ino + 1;
 
 	inode = btrfs_find_first_inode(root, min_ino);
 	while (inode) {
-		nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
+		nr_dropped += btrfs_scan_inode(inode, ctx);
 
 		min_ino = btrfs_ino(inode) + 1;
-		fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
-		iput(&inode->vfs_inode);
+		ctx->last_ino = btrfs_ino(inode);
+		btrfs_add_delayed_iput(inode);
 
-		if (*scanned >= nr_to_scan)
+		if (ctx->scanned >= ctx->nr_to_scan)
+			break;
+
+		/*
+		 * We may be called from memory allocation paths, so we don't
+		 * want to take too much time and slow down tasks.
+		 */
+		if (need_resched())
 			break;
 
-		cond_resched();
 		inode = btrfs_find_first_inode(root, min_ino);
 	}
 
@@ -1132,14 +1157,14 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
 		 * inode if there is one or we will find out this was the last
 		 * one and move to the next root.
 		 */
-		fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
+		ctx->last_root = btrfs_root_id(root);
 	} else {
 		/*
 		 * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
 		 * that when processing the next root we start from its first inode.
 		 */
-		fs_info->extent_map_shrinker_last_ino = 0;
-		fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
+		ctx->last_ino = 0;
+		ctx->last_root = btrfs_root_id(root) + 1;
 	}
 
 	return nr_dropped;
@@ -1147,19 +1172,41 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
 
 long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 {
-	const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
-	u64 next_root_id = start_root_id;
+	struct btrfs_em_shrink_ctx ctx;
+	u64 start_root_id;
+	u64 next_root_id;
 	bool cycled = false;
 	long nr_dropped = 0;
-	long scanned = 0;
+
+	ctx.scanned = 0;
+	ctx.nr_to_scan = nr_to_scan;
+
+	/*
+	 * In case we have multiple tasks running this shrinker, make the next
+	 * one start from the next inode in case it starts before we finish.
+	 */
+	spin_lock(&fs_info->extent_map_shrinker_lock);
+	ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+	fs_info->extent_map_shrinker_last_ino++;
+	ctx.last_root = fs_info->extent_map_shrinker_last_root;
+	spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+	start_root_id = ctx.last_root;
+	next_root_id = ctx.last_root;
 
 	if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
+		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+							   nr, ctx.last_root,
+							   ctx.last_ino);
 	}
 
-	while (scanned < nr_to_scan) {
+	/*
+	 * We may be called from memory allocation paths, so we don't want to
+	 * take too much time and slow down tasks, so stop if we need to reschedule.
+	 */
+	while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
 		struct btrfs_root *root;
 		unsigned long count;
 
@@ -1171,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 			spin_unlock(&fs_info->fs_roots_radix_lock);
 			if (start_root_id > 0 && !cycled) {
 				next_root_id = 0;
-				fs_info->extent_map_shrinker_last_root = 0;
-				fs_info->extent_map_shrinker_last_ino = 0;
+				ctx.last_root = 0;
+				ctx.last_ino = 0;
 				cycled = true;
 				continue;
 			}
@@ -1186,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 			continue;
 
 		if (is_fstree(btrfs_root_id(root)))
-			nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
+			nr_dropped += btrfs_scan_root(root, &ctx);
 
 		btrfs_put_root(root);
 	}
 
+	/*
+	 * In case of multiple tasks running this extent map shrinking code this
+	 * isn't perfect but it's simple and silences things like KCSAN. It's
+	 * not possible to know which task made more progress because we can
+	 * cycle back to the first root and first inode if it's not the first
+	 * time the shrinker ran, see the above logic. Also a task that started
+	 * later may finish earlier than another task and made less progress. So
+	 * make this simple and update to the progress of the last task that
+	 * finished, with the occasional possibility of having two consecutive
+	 * runs of the shrinker process the same inodes.
+	 */
+	spin_lock(&fs_info->extent_map_shrinker_lock);
+	fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+	fs_info->extent_map_shrinker_last_root = ctx.last_root;
+	spin_unlock(&fs_info->extent_map_shrinker_lock);
+
 	if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
+		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
+							  nr, ctx.last_root,
+							  ctx.last_ino);
 	}
 
 	return nr_dropped;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 89f0650631cd..833dc3fe0a38 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -630,6 +630,7 @@ struct btrfs_fs_info {
 	s32 delalloc_batch;
 
 	struct percpu_counter evictable_extent_maps;
+	spinlock_t extent_map_shrinker_lock;
 	u64 extent_map_shrinker_last_root;
 	u64 extent_map_shrinker_last_ino;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 753db965f7c0..d62c96f00ff8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5587,7 +5587,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
 	args.ino = ino;
 	args.root = root;
 
-	inode = iget5_locked(s, hashval, btrfs_find_actor,
+	inode = iget5_locked_rcu(s, hashval, btrfs_find_actor,
 			     btrfs_init_locked_inode,
 			     (void *)&args);
 	return inode;
@@ -10385,7 +10385,7 @@ out_unlock:
 out_folios:
 	for (i = 0; i < nr_folios; i++) {
 		if (folios[i])
-			__folio_put(folios[i]);
+			folio_put(folios[i]);
 	}
 	kvfree(folios);
 out:
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index efd5d6e9589e..6ad524b894fc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4627,7 +4627,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
 		goto out_iov;
 
 	init_sync_kiocb(&kiocb, file);
-	ret = kiocb_set_rw_flags(&kiocb, 0);
+	ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
 	if (ret)
 		goto out_iov;
 	kiocb.ki_pos = pos;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index bf0f81d59b6b..39a15cca58ca 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3062,8 +3062,6 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
 			       struct btrfs_qgroup_inherit *inherit,
 			       size_t size)
 {
-	if (!btrfs_qgroup_enabled(fs_info))
-		return 0;
 	if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
 		return -EOPNOTSUPP;
 	if (size < sizeof(*inherit) || size > PAGE_SIZE)
@@ -3085,6 +3083,14 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
 		return -EINVAL;
 
 	/*
+	 * Skip the inherit source qgroups check if qgroup is not enabled.
+	 * Qgroup can still be later enabled causing problems, but in that case
+	 * btrfs_qgroup_inherit() would just ignore those invalid ones.
+	 */
+	if (!btrfs_qgroup_enabled(fs_info))
+		return 0;
+
+	/*
 	 * Now check all the remaining qgroups, they should all:
 	 *
 	 * - Exist
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index cf531255ab76..9522a8b79d22 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
 	u32 item_size = btrfs_item_size(leaf, slot);
 	unsigned long end, ptr;
 	u64 offset, flags, count;
-	int type, ret;
+	int type;
+	int ret = 0;
 
 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 	flags = btrfs_extent_flags(leaf, ei);
@@ -486,7 +487,11 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
 						  key->objectid, key->offset);
 			break;
 		case BTRFS_EXTENT_OWNER_REF_KEY:
-			WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+			if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) {
+				btrfs_err(fs_info,
+			  "found extent owner ref without simple quotas enabled");
+				ret = -EINVAL;
+			}
 			break;
 		default:
 			btrfs_err(fs_info, "invalid key type in iref");
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d620323d08ea..ae8c56442549 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -373,11 +373,18 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
 	 * "optimal" chunk size based on the fs size.  However when we actually
 	 * allocate the chunk we will strip this down further, making it no more
 	 * than 10% of the disk or 1G, whichever is smaller.
+	 *
+	 * On the zoned mode, we need to use zone_size (=
+	 * data_sinfo->chunk_size) as it is.
 	 */
 	data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
-	data_chunk_size = min(data_sinfo->chunk_size,
-			      mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
-	data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+	if (!btrfs_is_zoned(fs_info)) {
+		data_chunk_size = min(data_sinfo->chunk_size,
+				      mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+		data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+	} else {
+		data_chunk_size = data_sinfo->chunk_size;
+	}
 
 	/*
 	 * Since data allocations immediately use block groups as part of the
@@ -405,6 +412,17 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
 		avail >>= 3;
 	else
 		avail >>= 1;
+
+	/*
+	 * On the zoned mode, we always allocate one zone as one chunk.
+	 * Returning non-zone size aligned bytes here will result in
+	 * less pressure for the async metadata reclaim process, and it
+	 * will over-commit too much leading to ENOSPC. Align down to the
+	 * zone size to avoid that.
+	 */
+	if (btrfs_is_zoned(fs_info))
+		avail = ALIGN_DOWN(avail, fs_info->zone_size);
+
 	return avail;
 }