aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/block-group.c13
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--fs/btrfs/extent_map.c123
-rw-r--r--fs/btrfs/fs.h1
-rw-r--r--fs/btrfs/inode.c4
-rw-r--r--fs/btrfs/ioctl.c2
-rw-r--r--fs/btrfs/qgroup.c10
-rw-r--r--fs/btrfs/ref-verify.c9
-rw-r--r--fs/btrfs/space-info.c24
10 files changed, 148 insertions, 42 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 1a66be33bb04..60066822b532 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1924,8 +1924,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
next:
if (ret) {
/* Refcount held by the reclaim_bgs list after splice. */
- btrfs_get_block_group(bg);
- list_add_tail(&bg->bg_list, &retry_list);
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * This block group might be added to the unused list
+ * during the above process. Move it back to the
+ * reclaim list otherwise.
+ */
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ list_add_tail(&bg->bg_list, &retry_list);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
}
btrfs_put_block_group(bg);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 38cdb8875e8e..cabb558dbdaa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2856,6 +2856,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
+ spin_lock_init(&fs_info->extent_map_shrinker_lock);
+
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f688fab55251..958155cc43a8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3553,7 +3553,7 @@ err:
for (int i = 0; i < num_folios; i++) {
if (eb->folios[i]) {
detach_extent_buffer_folio(eb, eb->folios[i]);
- __folio_put(eb->folios[i]);
+ folio_put(eb->folios[i]);
}
}
__free_extent_buffer(eb);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 744e8952abb0..b4c9a6aa118c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1028,7 +1028,14 @@ out_free_pre:
return ret;
}
-static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
+struct btrfs_em_shrink_ctx {
+ long nr_to_scan;
+ long scanned;
+ u64 last_ino;
+ u64 last_root;
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
@@ -1057,14 +1064,25 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
if (!down_read_trylock(&inode->i_mmap_lock))
return 0;
- write_lock(&tree->lock);
+ /*
+ * We want to be fast because we can be called from any path trying to
+ * allocate memory, so if the lock is busy we don't want to spend time
+ * waiting for it - either some task is about to do IO for the inode or
+ * we may have another task shrinking extent maps, here in this code, so
+ * skip this inode.
+ */
+ if (!write_trylock(&tree->lock)) {
+ up_read(&inode->i_mmap_lock);
+ return 0;
+ }
+
node = rb_first_cached(&tree->map);
while (node) {
struct extent_map *em;
em = rb_entry(node, struct extent_map, rb_node);
node = rb_next(node);
- (*scanned)++;
+ ctx->scanned++;
if (em->flags & EXTENT_FLAG_PINNED)
goto next;
@@ -1085,16 +1103,18 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
free_extent_map(em);
nr_dropped++;
next:
- if (*scanned >= nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan)
break;
/*
- * Restart if we had to reschedule, and any extent maps that were
- * pinned before may have become unpinned after we released the
- * lock and took it again.
+ * Stop if we need to reschedule or there's contention on the
+ * lock. This is to avoid slowing other tasks trying to take the
+ * lock and because the shrinker might be called during a memory
+ * allocation path and we want to avoid taking a very long time
+ * and slowing down all sorts of tasks.
*/
- if (cond_resched_rwlock_write(&tree->lock))
- node = rb_first_cached(&tree->map);
+ if (need_resched() || rwlock_needbreak(&tree->lock))
+ break;
}
write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
@@ -1102,25 +1122,30 @@ next:
return nr_dropped;
}
-static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode;
long nr_dropped = 0;
- u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
+ u64 min_ino = ctx->last_ino + 1;
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
- nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
+ nr_dropped += btrfs_scan_inode(inode, ctx);
min_ino = btrfs_ino(inode) + 1;
- fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
- iput(&inode->vfs_inode);
+ ctx->last_ino = btrfs_ino(inode);
+ btrfs_add_delayed_iput(inode);
- if (*scanned >= nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan)
+ break;
+
+ /*
+ * We may be called from memory allocation paths, so we don't
+ * want to take too much time and slowdown tasks.
+ */
+ if (need_resched())
break;
- cond_resched();
inode = btrfs_find_first_inode(root, min_ino);
}
@@ -1132,14 +1157,14 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
* inode if there is one or we will find out this was the last
* one and move to the next root.
*/
- fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
+ ctx->last_root = btrfs_root_id(root);
} else {
/*
* No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
* that when processing the next root we start from its first inode.
*/
- fs_info->extent_map_shrinker_last_ino = 0;
- fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
+ ctx->last_ino = 0;
+ ctx->last_root = btrfs_root_id(root) + 1;
}
return nr_dropped;
@@ -1147,19 +1172,41 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
- const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
- u64 next_root_id = start_root_id;
+ struct btrfs_em_shrink_ctx ctx;
+ u64 start_root_id;
+ u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
- long scanned = 0;
+
+ ctx.scanned = 0;
+ ctx.nr_to_scan = nr_to_scan;
+
+ /*
+ * In case we have multiple tasks running this shrinker, make the next
+ * one start from the next inode in case it starts before we finish.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+ fs_info->extent_map_shrinker_last_ino++;
+ ctx.last_root = fs_info->extent_map_shrinker_last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+ start_root_id = ctx.last_root;
+ next_root_id = ctx.last_root;
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+ nr, ctx.last_root,
+ ctx.last_ino);
}
- while (scanned < nr_to_scan) {
+ /*
+ * We may be called from memory allocation paths, so we don't want to
+ * take too much time and slowdown tasks, so stop if we need reschedule.
+ */
+ while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
struct btrfs_root *root;
unsigned long count;
@@ -1171,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
spin_unlock(&fs_info->fs_roots_radix_lock);
if (start_root_id > 0 && !cycled) {
next_root_id = 0;
- fs_info->extent_map_shrinker_last_root = 0;
- fs_info->extent_map_shrinker_last_ino = 0;
+ ctx.last_root = 0;
+ ctx.last_ino = 0;
cycled = true;
continue;
}
@@ -1186,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
continue;
if (is_fstree(btrfs_root_id(root)))
- nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
+ nr_dropped += btrfs_scan_root(root, &ctx);
btrfs_put_root(root);
}
+ /*
+ * In case of multiple tasks running this extent map shrinking code this
+ * isn't perfect but it's simple and silences things like KCSAN. It's
+ * not possible to know which task made more progress because we can
+ * cycle back to the first root and first inode if it's not the first
+ * time the shrinker ran, see the above logic. Also a task that started
+ * later may finish ealier than another task and made less progress. So
+ * make this simple and update to the progress of the last task that
+ * finished, with the occasional possiblity of having two consecutive
+ * runs of the shrinker process the same inodes.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+ fs_info->extent_map_shrinker_last_root = ctx.last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
+ trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
+ nr, ctx.last_root,
+ ctx.last_ino);
}
return nr_dropped;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 89f0650631cd..833dc3fe0a38 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -630,6 +630,7 @@ struct btrfs_fs_info {
s32 delalloc_batch;
struct percpu_counter evictable_extent_maps;
+ spinlock_t extent_map_shrinker_lock;
u64 extent_map_shrinker_last_root;
u64 extent_map_shrinker_last_ino;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 753db965f7c0..d62c96f00ff8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5587,7 +5587,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
args.ino = ino;
args.root = root;
- inode = iget5_locked(s, hashval, btrfs_find_actor,
+ inode = iget5_locked_rcu(s, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;
@@ -10385,7 +10385,7 @@ out_unlock:
out_folios:
for (i = 0; i < nr_folios; i++) {
if (folios[i])
- __folio_put(folios[i]);
+ folio_put(folios[i]);
}
kvfree(folios);
out:
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index efd5d6e9589e..6ad524b894fc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4627,7 +4627,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
goto out_iov;
init_sync_kiocb(&kiocb, file);
- ret = kiocb_set_rw_flags(&kiocb, 0);
+ ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
if (ret)
goto out_iov;
kiocb.ki_pos = pos;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index bf0f81d59b6b..39a15cca58ca 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3062,8 +3062,6 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_inherit *inherit,
size_t size)
{
- if (!btrfs_qgroup_enabled(fs_info))
- return 0;
if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
return -EOPNOTSUPP;
if (size < sizeof(*inherit) || size > PAGE_SIZE)
@@ -3085,6 +3083,14 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
return -EINVAL;
/*
+ * Skip the inherit source qgroups check if qgroup is not enabled.
+ * Qgroup can still be later enabled causing problems, but in that case
+ * btrfs_qgroup_inherit() would just ignore those invalid ones.
+ */
+ if (!btrfs_qgroup_enabled(fs_info))
+ return 0;
+
+ /*
* Now check all the remaining qgroups, they should all:
*
* - Exist
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index cf531255ab76..9522a8b79d22 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
u32 item_size = btrfs_item_size(leaf, slot);
unsigned long end, ptr;
u64 offset, flags, count;
- int type, ret;
+ int type;
+ int ret = 0;
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
@@ -486,7 +487,11 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
key->objectid, key->offset);
break;
case BTRFS_EXTENT_OWNER_REF_KEY:
- WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) {
+ btrfs_err(fs_info,
+ "found extent owner ref without simple quotas enabled");
+ ret = -EINVAL;
+ }
break;
default:
btrfs_err(fs_info, "invalid key type in iref");
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d620323d08ea..ae8c56442549 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -373,11 +373,18 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
* "optimal" chunk size based on the fs size. However when we actually
* allocate the chunk we will strip this down further, making it no more
* than 10% of the disk or 1G, whichever is smaller.
+ *
+ * On the zoned mode, we need to use zone_size (=
+ * data_sinfo->chunk_size) as it is.
*/
data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
- data_chunk_size = min(data_sinfo->chunk_size,
- mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
- data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+ if (!btrfs_is_zoned(fs_info)) {
+ data_chunk_size = min(data_sinfo->chunk_size,
+ mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+ data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+ } else {
+ data_chunk_size = data_sinfo->chunk_size;
+ }
/*
* Since data allocations immediately use block groups as part of the
@@ -405,6 +412,17 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
avail >>= 3;
else
avail >>= 1;
+
+ /*
+ * On the zoned mode, we always allocate one zone as one chunk.
+ * Returning non-zone size alingned bytes here will result in
+ * less pressure for the async metadata reclaim process, and it
+ * will over-commit too much leading to ENOSPC. Align down to the
+ * zone size to avoid that.
+ */
+ if (btrfs_is_zoned(fs_info))
+ avail = ALIGN_DOWN(avail, fs_info->zone_size);
+
return avail;
}