From 68d99ab0e9221ef54506f827576c5a914680eeaf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Mar 2023 09:13:05 +0900 Subject: btrfs: fix fast csum implementation detection The BTRFS_FS_CSUM_IMPL_FAST flag is currently set whenever a non-generic crc32c is detected, which is the incorrect check if the file system uses a different checksumming algorithm. Refactor the code to only check this if crc32c is actually used. Note that in an ideal world the information if an algorithm is hardware accelerated or not should be provided by the crypto API instead, but that's left for another day. CC: stable@vger.kernel.org # 5.4.x: c8a5f8ca9a9c: btrfs: print checksum type and implementation at mount time CC: stable@vger.kernel.org # 5.4.x Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b53f0e30ce2b..9e1596bb208d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2250,6 +2250,20 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) fs_info->csum_shash = csum_shash; + /* + * Check if the checksum implementation is a fast accelerated one. + * As-is this is a bit of a hack and should be replaced once the csum + * implementations provide that information themselves. + */ + switch (csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); + break; + default: + break; + } + btrfs_info(fs_info, "using %s (%s) checksum algorithm", btrfs_super_csum_name(csum_type), crypto_shash_driver_name(csum_shash)); -- cgit From 6989627db074a3db0ca297657bcb8709d8c888c0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Feb 2023 11:57:23 -0500 Subject: btrfs: drop root refs properly when orphan cleanup fails When we mount the file system we do something like this: while (1) { lookup fs roots; for (i = 0; i < num_roots; i++) { ret = btrfs_orphan_cleanup(roots[i]); if (ret) break; btrfs_put_root(roots[i]); } } for (; i < num_roots; i++) btrfs_put_root(roots[i]); As you can see if we break in that inner loop we just go back to the outer loop and lose the fact that we have to drop references on the remaining roots we looked up. Fix this by making an out label and jumping to that on error so we don't leak a reference to the roots we looked up. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9e1596bb208d..d9f66f411c02 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4404,12 +4404,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) root_objectid = gang[i]->root_key.objectid; err = btrfs_orphan_cleanup(gang[i]); if (err) - break; + goto out; btrfs_put_root(gang[i]); } root_objectid++; } - +out: /* release the uncleaned roots due to error */ for (; i < ret; i++) { if (gang[i]) -- cgit From dcb2137c8411d9940ebdfc93bb4de20b7f8fcf42 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 Feb 2023 19:10:22 +0100 Subject: btrfs: move all btree inode initialization into btrfs_init_btree_inode Move the remaining code that deals with initializing the btree inode into btrfs_init_btree_inode instead of splitting it between that helpers and its only caller. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d9f66f411c02..851792edf01f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2125,11 +2125,16 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->reloc_cancel_req, 0); } -static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) +static int btrfs_init_btree_inode(struct super_block *sb) { - struct inode *inode = fs_info->btree_inode; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID, fs_info->tree_root); + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return -ENOMEM; inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; set_nlink(inode, 1); @@ -2140,6 +2145,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) */ inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &btree_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, @@ -2152,6 +2158,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); __insert_inode_hash(inode, hash); + fs_info->btree_inode = inode; + + return 0; } static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) @@ -3365,13 +3374,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail; } - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; + ret = btrfs_init_btree_inode(sb); + if (ret) { + err = ret; goto fail; } - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - btrfs_init_btree_inode(fs_info); invalidate_bdev(fs_devices->latest_dev->bdev); -- cgit From 4871c33baf56b13559f1e7cb7c065df07c3f068e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 28 Feb 2023 08:44:30 +0800 Subject: btrfs: open_ctree() error handling cleanup Currently open_ctree() still uses two variables for error handling, err and ret. This can be confusing and missing some errors and does not conform to current coding style. This patch will fix the problems by: - Use only ret for error handling - Add proper ret assignment Originally we rely on the default value (-EINVAL) of err to handle errors, but that doesn't really reflects the error. This will change it use the correct error number for the following call sites: * subpage_info allocation * btrfs_free_extra_devids() * btrfs_check_rw_degradable() * cleaner_kthread allocation * transaction_kthread allocation - Add an extra ASSERT() To make sure we error out instead of returning 0. Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 65 ++++++++++++++++++++++++++---------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 851792edf01f..8aff348369d7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3353,14 +3353,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device struct btrfs_root *tree_root; struct btrfs_root *chunk_root; int ret; - int err = -EINVAL; int level; ret = init_mount_fs_info(fs_info, sb); - if (ret) { - err = ret; + if (ret) goto fail; - } /* These need to be init'ed before we start creating inodes and such. */ tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, @@ -3370,15 +3367,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device GFP_KERNEL); fs_info->chunk_root = chunk_root; if (!tree_root || !chunk_root) { - err = -ENOMEM; + ret = -ENOMEM; goto fail; } ret = btrfs_init_btree_inode(sb); - if (ret) { - err = ret; + if (ret) goto fail; - } invalidate_bdev(fs_devices->latest_dev->bdev); @@ -3387,7 +3382,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); if (IS_ERR(disk_super)) { - err = PTR_ERR(disk_super); + ret = PTR_ERR(disk_super); goto fail_alloc; } @@ -3399,7 +3394,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (!btrfs_supported_super_csum(csum_type)) { btrfs_err(fs_info, "unsupported checksum algorithm: %u", csum_type); - err = -EINVAL; + ret = -EINVAL; btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -3408,7 +3403,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ret = btrfs_init_csum_hash(fs_info, csum_type); if (ret) { - err = ret; btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -3419,7 +3413,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ if (btrfs_check_super_csum(fs_info, disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); - err = -EINVAL; + ret = -EINVAL; btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -3449,12 +3443,15 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ret = btrfs_validate_mount_super(fs_info); if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); - err = -EINVAL; + ret = -EINVAL; goto fail_alloc; } - if (!btrfs_super_root(disk_super)) + if (!btrfs_super_root(disk_super)) { + btrfs_err(fs_info, "invalid superblock tree root bytenr"); + ret = -EINVAL; goto fail_alloc; + } /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) @@ -3481,16 +3478,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->stripesize = stripesize; ret = btrfs_parse_options(fs_info, options, sb->s_flags); - if (ret) { - err = ret; + if (ret) goto fail_alloc; - } ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); - if (ret < 0) { - err = ret; + if (ret < 0) goto fail_alloc; - } if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; @@ -3510,17 +3503,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); - if (!subpage_info) + if (!subpage_info) { + ret = -ENOMEM; goto fail_alloc; + } btrfs_init_subpage_info(subpage_info, sectorsize); fs_info->subpage_info = subpage_info; } ret = btrfs_init_workqueues(fs_info); - if (ret) { - err = ret; + if (ret) goto fail_sb_buffer; - } sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); @@ -3566,6 +3559,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_free_extra_devids(fs_devices); if (!fs_devices->latest_dev->bdev) { btrfs_err(fs_info, "failed to read devices"); + ret = -EIO; goto fail_tree_roots; } @@ -3581,8 +3575,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ret = btrfs_get_dev_zone_info_all_devices(fs_info); if (ret) { btrfs_err(fs_info, - "zoned: failed to read device zone info: %d", - ret); + "zoned: failed to read device zone info: %d", ret); goto fail_block_groups; } @@ -3661,19 +3654,24 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writable mount is not allowed due to too many missing devices"); + ret = -EINVAL; goto fail_sysfs; } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, "btrfs-cleaner"); - if (IS_ERR(fs_info->cleaner_kthread)) + if (IS_ERR(fs_info->cleaner_kthread)) { + ret = PTR_ERR(fs_info->cleaner_kthread); goto fail_sysfs; + } fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, "btrfs-transaction"); - if (IS_ERR(fs_info->transaction_kthread)) + if (IS_ERR(fs_info->transaction_kthread)) { + ret = PTR_ERR(fs_info->transaction_kthread); goto fail_cleaner; + } if (!btrfs_test_opt(fs_info, NOSSD) && !fs_info->fs_devices->rotating) { @@ -3718,16 +3716,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device !btrfs_test_opt(fs_info, NOLOGREPLAY)) { btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); - if (ret) { - err = ret; + if (ret) goto fail_qgroup; - } } fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); if (IS_ERR(fs_info->fs_root)) { - err = PTR_ERR(fs_info->fs_root); - btrfs_warn(fs_info, "failed to read fs tree: %d", err); + ret = PTR_ERR(fs_info->fs_root); + btrfs_warn(fs_info, "failed to read fs tree: %d", ret); fs_info->fs_root = NULL; goto fail_qgroup; } @@ -3804,7 +3800,8 @@ fail_alloc: iput(fs_info->btree_inode); fail: btrfs_close_devices(fs_info->fs_devices); - return err; + ASSERT(ret < 0); + return ret; } ALLOW_ERROR_INJECTION(open_ctree, ERRNO); -- cgit From ce4cf3793e72a4ca2eeebc5db5d04f25b42a59db Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 2 Mar 2023 21:30:48 +0800 Subject: btrfs: remove redundant clearing of NODISCARD If no discard mount option is specified, including the NODISCARD option, we make the async discard the default option then we don't have to call the clear_opt again to clear the NODISCARD flag. Though this makes no difference, that the call is redundant has been pointed out several times so we better remove it. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8aff348369d7..991ff26fb8c6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3689,7 +3689,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->fs_devices->discardable) { btrfs_set_and_info(fs_info, DISCARD_ASYNC, "auto enabling async discard"); - btrfs_clear_opt(fs_info->mount_opt, NODISCARD); } #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -- cgit From 0b5485391deff35c6633d7cbff5cf3a8229fbe9e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 1 Mar 2023 21:47:08 +0100 Subject: btrfs: locking: use atomic for DREW lock writers The DREW lock uses percpu variable to track lock counters and for that it needs to allocate the structure. In btrfs_read_tree_root() or btrfs_init_fs_root() this may add another error case or requires the NOFS scope protection. One way is to preallocate the structure as was suggested in https://lore.kernel.org/linux-btrfs/20221214021125.28289-1-robbieko@synology.com/ We may avoid the allocation altogether if we don't use the percpu variables but an atomic for the writer counter. This should not make any difference, the DREW lock is used for truncate and NOCOW writes along with other IO operations. The percpu counter for writers has been there since the original commit 8257b2dc3c1a1057 "Btrfs: introduce btrfs_{start, end}_nocow_write() for each subvolume". The reason could be to avoid hammering the same cacheline from all the readers but then the writers do that anyway. Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 +----------- fs/btrfs/locking.c | 25 ++++++------------------- fs/btrfs/locking.h | 5 ++--- 3 files changed, 9 insertions(+), 33 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 991ff26fb8c6..1b1b9e8197d6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1341,17 +1341,8 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { int ret; - unsigned int nofs_flag; - /* - * We might be called under a transaction (e.g. indirect backref - * resolution) which could deadlock if it triggers memory reclaim - */ - nofs_flag = memalloc_nofs_save(); - ret = btrfs_drew_lock_init(&root->snapshot_lock); - memalloc_nofs_restore(nofs_flag); - if (ret) - goto fail; + btrfs_drew_lock_init(&root->snapshot_lock); if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root)) { @@ -2065,7 +2056,6 @@ void btrfs_put_root(struct btrfs_root *root) WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); - btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG spin_lock(&root->fs_info->fs_roots_radix_lock); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 870528d87526..3a496b0d3d2b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -325,24 +325,12 @@ struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root) * acquire the lock. */ -int btrfs_drew_lock_init(struct btrfs_drew_lock *lock) +void btrfs_drew_lock_init(struct btrfs_drew_lock *lock) { - int ret; - - ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL); - if (ret) - return ret; - atomic_set(&lock->readers, 0); + atomic_set(&lock->writers, 0); init_waitqueue_head(&lock->pending_readers); init_waitqueue_head(&lock->pending_writers); - - return 0; -} - -void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock) -{ - percpu_counter_destroy(&lock->writers); } /* Return true if acquisition is successful, false otherwise */ @@ -351,10 +339,10 @@ bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock) if (atomic_read(&lock->readers)) return false; - percpu_counter_inc(&lock->writers); + atomic_inc(&lock->writers); /* Ensure writers count is updated before we check for pending readers */ - smp_mb(); + smp_mb__after_atomic(); if (atomic_read(&lock->readers)) { btrfs_drew_write_unlock(lock); return false; @@ -374,7 +362,7 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) { - percpu_counter_dec(&lock->writers); + atomic_dec(&lock->writers); cond_wake_up(&lock->pending_readers); } @@ -390,8 +378,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) */ smp_mb__after_atomic(); - wait_event(lock->pending_readers, - percpu_counter_sum(&lock->writers) == 0); + wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0); } void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 11c2269b4b6f..edb9b4a0dba1 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -195,13 +195,12 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) struct btrfs_drew_lock { atomic_t readers; - struct percpu_counter writers; + atomic_t writers; wait_queue_head_t pending_writers; wait_queue_head_t pending_readers; }; -int btrfs_drew_lock_init(struct btrfs_drew_lock *lock); -void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock); +void btrfs_drew_lock_init(struct btrfs_drew_lock *lock); void btrfs_drew_write_lock(struct btrfs_drew_lock *lock); bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock); void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); -- cgit From a8fdc05172d0e52ad7812c07be8f753d63779539 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 21 Mar 2023 11:13:49 +0000 Subject: btrfs: remove obsolete delayed ref throttling logic when truncating items We have this logic encapsulated in btrfs_should_throttle_delayed_refs() where we try to estimate if running the current amount of delayed references we have will take more than half a second, and if so, the caller btrfs_should_throttle_delayed_refs() should do something to prevent more and more delayed refs from being accumulated. This logic was added in commit 0a2b2a844af6 ("Btrfs: throttle delayed refs better") and then further refined in commit a79b7d4b3e81 ("Btrfs: async delayed refs"). The idea back then was that the caller of btrfs_should_throttle_delayed_refs() would release its transaction handle (by calling btrfs_end_transaction()) when that function returned true, then btrfs_end_transaction() would trigger an async job to run delayed references in a workqueue, and later start/join a transaction again and do more work. However we don't run delayed references asynchronously anymore, that was removed in commit db2462a6ad3d ("btrfs: don't run delayed refs in the end transaction logic"). That makes the logic that tries to estimate how long we will take to run our current delayed references, at btrfs_should_throttle_delayed_refs(), pointless as we don't take any action to run delayed references anymore. We do have other type of throttling, which consists of checking the size and reserved space of the delayed and global block reserves, as well as if fluhsing delayed references for the current transaction was already started, etc - this is all done by btrfs_should_end_transaction(), and the only user of btrfs_should_throttle_delayed_refs() does periodically call btrfs_should_end_transaction(). So remove btrfs_should_throttle_delayed_refs() and the infrastructure that keeps track of the average time used for running delayed references, as well as adapting btrfs_truncate_inode_items() to call btrfs_check_space_for_delayed_refs() instead. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 16 ---------------- fs/btrfs/delayed-ref.h | 1 - fs/btrfs/disk-io.c | 1 - fs/btrfs/extent-tree.c | 27 ++------------------------- fs/btrfs/fs.h | 1 - fs/btrfs/inode-item.c | 12 +++++------- 6 files changed, 7 insertions(+), 51 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 3fdee91d8079..bf2ce51e5086 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -53,22 +53,6 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) return ret; } -bool btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) -{ - u64 num_entries = - atomic_read(&trans->transaction->delayed_refs.num_entries); - u64 avg_runtime; - u64 val; - - smp_mb(); - avg_runtime = trans->fs_info->avg_delayed_ref_runtime; - val = num_entries * avg_runtime; - if (val >= NSEC_PER_SEC / 2) - return true; - - return btrfs_check_space_for_delayed_refs(trans->fs_info); -} - /* * Release a ref head's reservation. * diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 316fed159d49..6cf1adc9a01a 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -385,7 +385,6 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *src, u64 num_bytes); -bool btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1b1b9e8197d6..1122ed8427b2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2965,7 +2965,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) atomic64_set(&fs_info->free_chunk_space, 0); fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6b6c59e6805c..5cd289de4e92 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1894,8 +1894,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( } static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *locked_ref, - unsigned long *run_refs) + struct btrfs_delayed_ref_head *locked_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; @@ -1917,7 +1916,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, return -EAGAIN; } - (*run_refs)++; ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); @@ -1981,10 +1979,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *locked_ref = NULL; - ktime_t start = ktime_get(); int ret; unsigned long count = 0; - unsigned long actual_count = 0; delayed_refs = &trans->transaction->delayed_refs; do { @@ -2014,8 +2010,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, - &actual_count); + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); if (ret < 0 && ret != -EAGAIN) { /* * Error, btrfs_run_delayed_refs_for_head already @@ -2046,24 +2041,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, cond_resched(); } while ((nr != -1 && count < nr) || locked_ref); - /* - * We don't want to include ref heads since we can have empty ref heads - * and those will drastically skew our runtime down since we just do - * accounting, no actual extent tree updates. - */ - if (actual_count > 0) { - u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); - u64 avg; - - /* - * We weigh the current average higher than our current runtime - * to avoid large swings in the average. - */ - spin_lock(&delayed_refs->lock); - avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; - fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ - spin_unlock(&delayed_refs->lock); - } return 0; } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index abe5b3475a9d..492436e1a59e 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -407,7 +407,6 @@ struct btrfs_fs_info { * Must be written and read while holding btrfs_fs_info::commit_root_sem. */ u64 last_reloc_trans; - u64 avg_delayed_ref_runtime; /* * This is updated to the current trans every time a full commit is diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b27c2c560083..4c322b720a80 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -527,7 +527,7 @@ search_again: while (1) { u64 clear_start = 0, clear_len = 0, extent_start = 0; - bool should_throttle = false; + bool refill_delayed_refs_rsv = false; fi = NULL; leaf = path->nodes[0]; @@ -685,10 +685,8 @@ delete: btrfs_abort_transaction(trans, ret); break; } - if (be_nice) { - if (btrfs_should_throttle_delayed_refs(trans)) - should_throttle = true; - } + if (be_nice && btrfs_check_space_for_delayed_refs(fs_info)) + refill_delayed_refs_rsv = true; } if (found_type == BTRFS_INODE_ITEM_KEY) @@ -696,7 +694,7 @@ delete: if (path->slots[0] == 0 || path->slots[0] != pending_del_slot || - should_throttle) { + refill_delayed_refs_rsv) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -719,7 +717,7 @@ delete: * actually allocate, so just bail if we're short and * let the normal reservation dance happen higher up. */ - if (should_throttle) { + if (refill_delayed_refs_rsv) { ret = btrfs_delayed_refs_rsv_refill(fs_info, BTRFS_RESERVE_NO_FLUSH); if (ret) { -- cgit From bfd3ea946faa8b653025bf56e4bb5428ee6fde05 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 27 Mar 2023 17:53:07 +0800 Subject: btrfs: move last_flush_error to write_dev_flush and wait_dev_flush We parallelize the flush command across devices using our own code, write_dev_flush() sends the flush command to each device and wait_dev_flush() waits for the flush to complete on all devices. Errors from each device are recorded at device->last_flush_error and reset to BLK_STS_OK in write_dev_flush() and to the error, if any, in wait_dev_flush(). These functions are called from barrier_all_devices(). This patch consolidates the use of device->last_flush_error in write_dev_flush() and wait_dev_flush() to remove it from barrier_all_devices(). Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1122ed8427b2..0cb53762474a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4086,6 +4086,8 @@ static void write_dev_flush(struct btrfs_device *device) { struct bio *bio = &device->flush_bio; + device->last_flush_error = BLK_STS_OK; + #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* * When a disk has write caching disabled, we skip submission of a bio @@ -4125,6 +4127,11 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); wait_for_completion_io(&device->flush_wait); + if (bio->bi_status) { + device->last_flush_error = bio->bi_status; + btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); + } + return bio->bi_status; } @@ -4159,7 +4166,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info) continue; write_dev_flush(dev); - dev->last_flush_error = BLK_STS_OK; } /* wait for all the barriers */ @@ -4175,12 +4181,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info) continue; ret = wait_dev_flush(dev); - if (ret) { - dev->last_flush_error = ret; - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_FLUSH_ERRS); + if (ret) errors_wait++; - } } if (errors_wait) { -- cgit From de38a206ff74cc8b2d1e253d8f7b2979d55069de Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 27 Mar 2023 17:53:08 +0800 Subject: btrfs: open code check_barrier_error() check_barrier_error() is almost a single line function, and just calls btrfs_check_rw_degradable(). Instead, open code it. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0cb53762474a..bd0e2c595fd6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4135,13 +4135,6 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) return bio->bi_status; } -static int check_barrier_error(struct btrfs_fs_info *fs_info) -{ - if (!btrfs_check_rw_degradable(fs_info, NULL)) - return -EIO; - return 0; -} - /* * send an empty flush down to each device in parallel, * then wait for them @@ -4185,14 +4178,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info) errors_wait++; } - if (errors_wait) { - /* - * At some point we need the status of all disks - * to arrive at the volume status. So error checking - * is being pushed to a separate loop. - */ - return check_barrier_error(info); - } + /* + * Checks last_flush_error of disks in order to determine the device + * state. + */ + if (errors_wait && !btrfs_check_rw_degradable(info, NULL)) + return -EIO; + return 0; } -- cgit From 1b465784dc331bc70ae42ad4a4035ed60251e316 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 27 Mar 2023 17:53:09 +0800 Subject: btrfs: change wait_dev_flush() return type to bool The flush error code is maintained in btrfs_device::last_flush_error, so there is no point in returning it in wait_dev_flush() when it is not being used. Instead, we can return a boolean value. Note that even though btrfs_device::last_flush_error may not be used, we will keep it for now. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bd0e2c595fd6..fb4f88faeace 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4116,13 +4116,14 @@ static void write_dev_flush(struct btrfs_device *device) /* * If the flush bio has been submitted by write_dev_flush, wait for it. + * Return true for any error, and false otherwise. */ -static blk_status_t wait_dev_flush(struct btrfs_device *device) +static bool wait_dev_flush(struct btrfs_device *device) { struct bio *bio = &device->flush_bio; if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) - return BLK_STS_OK; + return false; clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); wait_for_completion_io(&device->flush_wait); @@ -4130,9 +4131,10 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) if (bio->bi_status) { device->last_flush_error = bio->bi_status; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); + return true; } - return bio->bi_status; + return false; } /* @@ -4144,7 +4146,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info) struct list_head *head; struct btrfs_device *dev; int errors_wait = 0; - blk_status_t ret; lockdep_assert_held(&info->fs_devices->device_list_mutex); /* send down all the barriers */ @@ -4173,8 +4174,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; - ret = wait_dev_flush(dev); - if (ret) + if (wait_dev_flush(dev)) errors_wait++; } -- cgit From 7e812f2054b8eeaca2c61ae9dc8986e9190e4dd6 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 27 Mar 2023 17:53:10 +0800 Subject: btrfs: use test_and_clear_bit() in wait_dev_flush() The function wait_dev_flush() tests for the BTRFS_DEV_STATE_FLUSH_SENT bit and then clears it separately. Instead, use test_and_clear_bit(). Though we don't need to do the atomic test and clear, it's following a common pattern. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fb4f88faeace..59ea049fe7ee 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4122,10 +4122,9 @@ static bool wait_dev_flush(struct btrfs_device *device) { struct bio *bio = &device->flush_bio; - if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) + if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return false; - clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); wait_for_completion_io(&device->flush_wait); if (bio->bi_status) { -- cgit From 1d6a4fc85717677e00fefffd847a50fc5928ce69 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Apr 2023 14:13:05 +0800 Subject: btrfs: make clear_cache mount option to rebuild FST without disabling it Previously clear_cache mount option would simply disable free-space-tree feature temporarily then re-enable it to rebuild the whole free space tree. But this is problematic for block-group-tree feature, as we have an artificial dependency on free-space-tree feature. If we go the existing method, after clearing the free-space-tree feature, we would flip the filesystem to read-only mode, as we detect a super block write with block-group-tree but no free-space-tree feature. This patch would change the behavior by properly rebuilding the free space tree without disabling this feature, thus allowing clear_cache mount option to work with block group tree. Now we can mount a filesystem with block-group-tree feature and clear_mount option: $ mkfs.btrfs -O block-group-tree /dev/test/scratch1 -f $ sudo mount /dev/test/scratch1 /mnt/btrfs -o clear_cache $ sudo dmesg -t | head -n 5 BTRFS info (device dm-1): force clearing of disk cache BTRFS info (device dm-1): using free space tree BTRFS info (device dm-1): auto enabling async discard BTRFS info (device dm-1): rebuilding free space tree BTRFS info (device dm-1): checking UUID tree CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 25 ++++++++++++++++------- fs/btrfs/free-space-tree.c | 50 +++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/free-space-tree.h | 3 ++- fs/btrfs/super.c | 3 +-- 4 files changed, 70 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 59ea049fe7ee..fbf9006c6234 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3121,23 +3121,34 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) { int ret; const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); - bool clear_free_space_tree = false; + bool rebuild_free_space_tree = false; if (btrfs_test_opt(fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - clear_free_space_tree = true; + rebuild_free_space_tree = true; } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { btrfs_warn(fs_info, "free space tree is invalid"); - clear_free_space_tree = true; + rebuild_free_space_tree = true; } - if (clear_free_space_tree) { - btrfs_info(fs_info, "clearing free space tree"); - ret = btrfs_clear_free_space_tree(fs_info); + if (rebuild_free_space_tree) { + btrfs_info(fs_info, "rebuilding free space tree"); + ret = btrfs_rebuild_free_space_tree(fs_info); if (ret) { btrfs_warn(fs_info, - "failed to clear free space tree: %d", ret); + "failed to rebuild free space tree: %d", ret); + goto out; + } + } + + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, "disabling free space tree"); + ret = btrfs_delete_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to disable free space tree: %d", ret); goto out; } } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 4d155a48ec59..b21da1446f2a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1252,7 +1252,7 @@ out: return ret; } -int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) +int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; @@ -1298,6 +1298,54 @@ abort: return ret; } +int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key); + struct rb_node *node; + int ret; + + trans = btrfs_start_transaction(free_space_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + + ret = clear_free_space_tree(trans, free_space_root); + if (ret) + goto abort; + + node = rb_first_cached(&fs_info->block_group_cache_tree); + while (node) { + struct btrfs_block_group *block_group; + + block_group = rb_entry(node, struct btrfs_block_group, + cache_node); + ret = populate_free_space_tree(trans, block_group); + if (ret) + goto abort; + node = rb_next(node); + } + + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); + clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + + ret = btrfs_commit_transaction(trans); + clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + return ret; +abort: + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; +} + static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index dc2463e4cfe3..6d5551d0ced8 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -18,7 +18,8 @@ struct btrfs_caching_control; void set_free_space_tree_thresholds(struct btrfs_block_group *block_group); int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); -int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info); +int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info); +int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info); int load_free_space_tree(struct btrfs_caching_control *caching_ctl); int add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0f2f915e42b0..ec18e2210602 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -828,8 +828,7 @@ out: ret = -EINVAL; } if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && - (btrfs_test_opt(info, CLEAR_CACHE) || - !btrfs_test_opt(info, FREE_SPACE_TREE))) { + !btrfs_test_opt(info, FREE_SPACE_TREE)) { btrfs_err(info, "cannot disable free space tree with block-group-tree feature"); ret = -EINVAL; } -- cgit From 597441b3436a43011f31ce71dc0a6c0bf5ce958a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 11 May 2023 12:45:59 -0400 Subject: btrfs: use nofs when cleaning up aborted transactions Our CI system caught a lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 6.3.0-rc7+ #1167 Not tainted ------------------------------------------------------ kswapd0/46 is trying to acquire lock: ffff8c6543abd650 (sb_internal#2){++++}-{0:0}, at: btrfs_commit_inode_delayed_inode+0x5f/0x120 but task is already holding lock: ffffffffabe61b40 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x4aa/0x7a0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0xa5/0xe0 kmem_cache_alloc+0x31/0x2c0 alloc_extent_state+0x1d/0xd0 __clear_extent_bit+0x2e0/0x4f0 try_release_extent_mapping+0x216/0x280 btrfs_release_folio+0x2e/0x90 invalidate_inode_pages2_range+0x397/0x470 btrfs_cleanup_dirty_bgs+0x9e/0x210 btrfs_cleanup_one_transaction+0x22/0x760 btrfs_commit_transaction+0x3b7/0x13a0 create_subvol+0x59b/0x970 btrfs_mksubvol+0x435/0x4f0 __btrfs_ioctl_snap_create+0x11e/0x1b0 btrfs_ioctl_snap_create_v2+0xbf/0x140 btrfs_ioctl+0xa45/0x28f0 __x64_sys_ioctl+0x88/0xc0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc -> #0 (sb_internal#2){++++}-{0:0}: __lock_acquire+0x1435/0x21a0 lock_acquire+0xc2/0x2b0 start_transaction+0x401/0x730 btrfs_commit_inode_delayed_inode+0x5f/0x120 btrfs_evict_inode+0x292/0x3d0 evict+0xcc/0x1d0 inode_lru_isolate+0x14d/0x1e0 __list_lru_walk_one+0xbe/0x1c0 list_lru_walk_one+0x58/0x80 prune_icache_sb+0x39/0x60 super_cache_scan+0x161/0x1f0 do_shrink_slab+0x163/0x340 shrink_slab+0x1d3/0x290 shrink_node+0x300/0x720 balance_pgdat+0x35c/0x7a0 kswapd+0x205/0x410 kthread+0xf0/0x120 ret_from_fork+0x29/0x50 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(sb_internal#2); lock(fs_reclaim); lock(sb_internal#2); *** DEADLOCK *** 3 locks held by kswapd0/46: #0: ffffffffabe61b40 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x4aa/0x7a0 #1: ffffffffabe50270 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x113/0x290 #2: ffff8c6543abd0e0 (&type->s_umount_key#44){++++}-{3:3}, at: super_cache_scan+0x38/0x1f0 stack backtrace: CPU: 0 PID: 46 Comm: kswapd0 Not tainted 6.3.0-rc7+ #1167 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack_lvl+0x58/0x90 check_noncircular+0xd6/0x100 ? save_trace+0x3f/0x310 ? add_lock_to_list+0x97/0x120 __lock_acquire+0x1435/0x21a0 lock_acquire+0xc2/0x2b0 ? btrfs_commit_inode_delayed_inode+0x5f/0x120 start_transaction+0x401/0x730 ? btrfs_commit_inode_delayed_inode+0x5f/0x120 btrfs_commit_inode_delayed_inode+0x5f/0x120 btrfs_evict_inode+0x292/0x3d0 ? lock_release+0x134/0x270 ? __pfx_wake_bit_function+0x10/0x10 evict+0xcc/0x1d0 inode_lru_isolate+0x14d/0x1e0 __list_lru_walk_one+0xbe/0x1c0 ? __pfx_inode_lru_isolate+0x10/0x10 ? __pfx_inode_lru_isolate+0x10/0x10 list_lru_walk_one+0x58/0x80 prune_icache_sb+0x39/0x60 super_cache_scan+0x161/0x1f0 do_shrink_slab+0x163/0x340 shrink_slab+0x1d3/0x290 shrink_node+0x300/0x720 balance_pgdat+0x35c/0x7a0 kswapd+0x205/0x410 ? __pfx_autoremove_wake_function+0x10/0x10 ? __pfx_kswapd+0x10/0x10 kthread+0xf0/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x29/0x50 This happens because when we abort the transaction in the transaction commit path we call invalidate_inode_pages2_range on our block group cache inodes (if we have space cache v1) and any delalloc inodes we may have. The plain invalidate_inode_pages2_range() call passes through GFP_KERNEL, which makes sense in most cases, but not here. Wrap these two invalidate callees with memalloc_nofs_save/memalloc_nofs_restore to make sure we don't end up with the fs reclaim dependency under the transaction dependency. CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fbf9006c6234..6de6dcf2743e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4936,7 +4936,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) */ inode = igrab(&btrfs_inode->vfs_inode); if (inode) { + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); invalidate_inode_pages2(inode->i_mapping); + memalloc_nofs_restore(nofs_flag); iput(inode); } spin_lock(&root->delalloc_lock); @@ -5042,7 +5046,12 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) inode = cache->io_ctl.inode; if (inode) { + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); invalidate_inode_pages2(inode->i_mapping); + memalloc_nofs_restore(nofs_flag); + BTRFS_I(inode)->generation = 0; cache->io_ctl.inode = NULL; iput(inode); -- cgit From 5ad9b4719fc9bc4715c7e19875a962095b0577e7 Mon Sep 17 00:00:00 2001 From: pengfuyuan Date: Tue, 23 May 2023 15:09:55 +0800 Subject: btrfs: fix csum_tree_block page iteration to avoid tripping on -Werror=array-bounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling on a MIPS 64-bit machine we get these warnings: In file included from ./arch/mips/include/asm/cacheflush.h:13, from ./include/linux/cacheflush.h:5, from ./include/linux/highmem.h:8, from ./include/linux/bvec.h:10, from ./include/linux/blk_types.h:10, from ./include/linux/blkdev.h:9, from fs/btrfs/disk-io.c:7: fs/btrfs/disk-io.c: In function ‘csum_tree_block’: fs/btrfs/disk-io.c:100:34: error: array subscript 1 is above array bounds of ‘struct page *[1]’ [-Werror=array-bounds] 100 | kaddr = page_address(buf->pages[i]); | ~~~~~~~~~~^~~ ./include/linux/mm.h:2135:48: note: in definition of macro ‘page_address’ 2135 | #define page_address(page) lowmem_page_address(page) | ^~~~ cc1: all warnings being treated as errors We can check if i overflows to solve the problem. However, this doesn't make much sense, since i == 1 and num_pages == 1 doesn't execute the body of the loop. In addition, i < num_pages can also ensure that buf->pages[i] will not cross the boundary. Unfortunately, this doesn't help with the problem observed here: gcc still complains. To fix this add a compile-time condition for the extent buffer page array size limit, which would eventually lead to eliminating the whole for loop. CC: stable@vger.kernel.org # 5.10+ Signed-off-by: pengfuyuan Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6de6dcf2743e..2b1b227505f3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -96,7 +96,7 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, first_page_part - BTRFS_CSUM_SIZE); - for (i = 1; i < num_pages; i++) { + for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { kaddr = page_address(buf->pages[i]); crypto_shash_update(shash, kaddr, PAGE_SIZE); } -- cgit From 917ac77846b907dfdbd878688a9a61236ad6c51e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 26 May 2023 20:30:20 +0800 Subject: btrfs: subpage: fix a crash in metadata repair path [BUG] Test case btrfs/027 would crash with subpage (64K page size, 4K sectorsize) with the following dying messages: debug: map_length=16384 length=65536 type=metadata|raid6(0x104) assertion failed: map_length >= length, in fs/btrfs/volumes.c:8093 ------------[ cut here ]------------ kernel BUG at fs/btrfs/messages.c:259! Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 Call trace: btrfs_assertfail+0x28/0x2c [btrfs] btrfs_map_repair_block+0x150/0x2b8 [btrfs] btrfs_repair_io_failure+0xd4/0x31c [btrfs] btrfs_read_extent_buffer+0x150/0x16c [btrfs] read_tree_block+0x38/0xbc [btrfs] read_tree_root_path+0xfc/0x1bc [btrfs] btrfs_get_root_ref.part.0+0xd4/0x3a8 [btrfs] open_ctree+0xa30/0x172c [btrfs] btrfs_mount_root+0x3c4/0x4a4 [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xec vfs_kern_mount.part.0+0x90/0xd4 vfs_kern_mount+0x14/0x28 btrfs_mount+0x114/0x418 [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xec path_mount+0x3e0/0xb64 __arm64_sys_mount+0x200/0x2d8 invoke_syscall+0x48/0x114 el0_svc_common.constprop.0+0x60/0x11c do_el0_svc+0x38/0x98 el0_svc+0x40/0xa8 el0t_64_sync_handler+0xf4/0x120 el0t_64_sync+0x190/0x194 Code: aa0403e2 b0fff060 91010000 959c2024 (d4210000) [CAUSE] In btrfs/027 we test RAID6 with missing devices, in this particular case, we're repairing a metadata at the end of a data stripe. But at btrfs_repair_io_failure(), we always pass a full PAGE for repair, and for subpage case this can cross stripe boundary and lead to the above BUG_ON(). This metadata repair code is always there, since the introduction of subpage support, but this can trigger BUG_ON() after the bio split ability at btrfs_map_bio(). [FIX] Instead of passing the old PAGE_SIZE, we calculate the correct length based on the eb size and page size for both regular and subpage cases. CC: stable@vger.kernel.org # 6.3+ Reviewed-by: Christoph Hellwig Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2b1b227505f3..88e6d1072a35 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -242,7 +242,6 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - u64 start = eb->start; int i, num_pages = num_extent_pages(eb); int ret = 0; @@ -251,12 +250,14 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; + u64 start = max_t(u64, eb->start, page_offset(p)); + u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE); + u32 len = end - start; - ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE, - start, p, start - page_offset(p), mirror_num); + ret = btrfs_repair_io_failure(fs_info, 0, start, len, + start, p, offset_in_page(start), mirror_num); if (ret) break; - start += PAGE_SIZE; } return ret; -- cgit From 745806fb4554f334e6406fa82b328562aa48f08f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 11 Jun 2023 08:09:13 +0800 Subject: btrfs: do not ASSERT() on duplicated global roots [BUG] Syzbot reports a reproducible ASSERT() when using rescue=usebackuproot mount option on a corrupted fs. The full report can be found here: https://syzkaller.appspot.com/bug?extid=c4614eae20a166c25bf0 BTRFS error (device loop0: state C): failed to load root csum assertion failed: !tmp, in fs/btrfs/disk-io.c:1103 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.h:3664! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 3608 Comm: syz-executor356 Not tainted 6.0.0-rc7-syzkaller-00029-g3800a713b607 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/26/2022 RIP: 0010:assertfail+0x1a/0x1c fs/btrfs/ctree.h:3663 RSP: 0018:ffffc90003aaf250 EFLAGS: 00010246 RAX: 0000000000000032 RBX: 0000000000000000 RCX: f21c13f886638400 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff888021c640a0 R08: ffffffff816bd38d R09: ffffed10173667f1 R10: ffffed10173667f1 R11: 1ffff110173667f0 R12: dffffc0000000000 R13: ffff8880229c21f7 R14: ffff888021c64060 R15: ffff8880226c0000 FS: 0000555556a73300(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055a2637d7a00 CR3: 00000000709c4000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_global_root_insert+0x1a7/0x1b0 fs/btrfs/disk-io.c:1103 load_global_roots_objectid+0x482/0x8c0 fs/btrfs/disk-io.c:2467 load_global_roots fs/btrfs/disk-io.c:2501 [inline] btrfs_read_roots fs/btrfs/disk-io.c:2528 [inline] init_tree_roots+0xccb/0x203c fs/btrfs/disk-io.c:2939 open_ctree+0x1e53/0x33df fs/btrfs/disk-io.c:3574 btrfs_fill_super+0x1c6/0x2d0 fs/btrfs/super.c:1456 btrfs_mount_root+0x885/0x9a0 fs/btrfs/super.c:1824 legacy_get_tree+0xea/0x180 fs/fs_context.c:610 vfs_get_tree+0x88/0x270 fs/super.c:1530 fc_mount fs/namespace.c:1043 [inline] vfs_kern_mount+0xc9/0x160 fs/namespace.c:1073 btrfs_mount+0x3d3/0xbb0 fs/btrfs/super.c:1884 [CAUSE] Since the introduction of global roots, we handle csum/extent/free-space-tree roots as global roots, even if no extent-tree-v2 feature is enabled. So for regular csum/extent/fst roots, we load them into fs_info::global_root_tree rb tree. And we should not expect any conflicts in that rb tree, thus we have an ASSERT() inside btrfs_global_root_insert(). But rescue=usebackuproot can break the assumption, as we will try to load those trees again and again as long as we have bad roots and have backup roots slot remaining. So in that case we can have conflicting roots in the rb tree, and triggering the ASSERT() crash. [FIX] We can safely remove that ASSERT(), as the caller will properly put the offending root. To make further debugging easier, also add two explicit error messages: - Error message for conflicting global roots - Error message when using backup roots slot Reported-by: syzbot+a694851c6ab28cbcfb9c@syzkaller.appspotmail.com Fixes: abed4aaae4f7 ("btrfs: track the csum, extent, and free space trees in a rb tree") CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 88e6d1072a35..dabc79c1af1b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -996,13 +996,18 @@ int btrfs_global_root_insert(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *tmp; + int ret = 0; write_lock(&fs_info->global_root_lock); tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp); write_unlock(&fs_info->global_root_lock); - ASSERT(!tmp); - return tmp ? -EEXIST : 0; + if (tmp) { + ret = -EEXIST; + btrfs_warn(fs_info, "global root %llu %llu already exists", + root->root_key.objectid, root->root_key.offset); + } + return ret; } void btrfs_global_root_delete(struct btrfs_root *root) @@ -2842,6 +2847,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* We can't trust the free space cache either */ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + btrfs_warn(fs_info, "try to load backup roots slot %d", i); ret = read_backup_root(fs_info, i); backup_index = ret; if (ret < 0) -- cgit