From fe816d0f1d4c31c4c31d42ca78a87660565fc800 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 27 Apr 2018 12:21:53 +0300 Subject: btrfs: Fix delalloc inodes invalidation during transaction abort When a transaction is aborted btrfs_cleanup_transaction is called to cleanup all the various in-flight bits and pieces which migth be active. One of those is delalloc inodes - inodes which have dirty pages which haven't been persisted yet. Currently the process of freeing such delalloc inodes in exceptional circumstances such as transaction abort boiled down to calling btrfs_invalidate_inodes whose sole job is to invalidate the dentries for all inodes related to a root. This is in fact wrong and insufficient since such delalloc inodes will likely have pending pages or ordered-extents and will be linked to the sb->s_inode_list. This means that unmounting a btrfs instance with an aborted transaction could potentially lead inodes/their pages visible to the system long after their superblock has been freed. This in turn leads to a "use-after-free" situation once page shrink is triggered. This situation could be simulated by running generic/019 which would cause such inodes to be left hanging, followed by generic/176 which causes memory pressure and page eviction which lead to touching the freed super block instance. This situation is additionally detected by the unmount code of VFS with the following message: "VFS: Busy inodes after unmount of Self-destruct in 5 seconds. Have a nice day..." Additionally btrfs hits WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); in free_fs_root for the same reason. This patch aims to rectify the sitaution by doing the following: 1. Change btrfs_destroy_delalloc_inodes so that it calls invalidate_inode_pages2 for every inode on the delalloc list, this ensures that all the pages of the inode are released. This function boils down to calling btrfs_releasepage. During test I observed cases where inodes on the delalloc list were having an i_count of 0, so this necessitates using igrab to be sure we are working on a non-freed inode. 2. Since calling btrfs_releasepage might queue delayed iputs move the call out to btrfs_cleanup_transaction in btrfs_error_commit_super before calling run_delayed_iputs for the last time. This is necessary to ensure that delayed iputs are run. Note: this patch is tagged for 4.14 stable but the fix applies to older versions too but needs to be backported manually due to conflicts. CC: stable@vger.kernel.org # 4.14.x: 2b8773313494: btrfs: Split btrfs_del_delalloc_inode into 2 functions CC: stable@vger.kernel.org # 4.14.x Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ add comment to igrab ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 60caa68c3618..c3504b4d281b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3818,6 +3818,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); btrfs_free_qgroup_config(fs_info); + ASSERT(list_empty(&fs_info->delalloc_roots)); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { btrfs_info(fs_info, "at unmount delalloc count %lld", @@ -4125,15 +4126,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(fs_info); + mutex_lock(&fs_info->cleaner_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_mutex); down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); - - /* cleanup FS via transaction */ - btrfs_cleanup_transaction(fs_info); } static void btrfs_destroy_ordered_extents(struct btrfs_root *root) @@ -4258,19 +4259,23 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { + struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - - list_del_init(&btrfs_inode->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &btrfs_inode->runtime_flags); + __btrfs_del_delalloc_inode(root, btrfs_inode); spin_unlock(&root->delalloc_lock); - btrfs_invalidate_inodes(btrfs_inode->root); - + /* + * Make sure we get a live inode and that it'll not disappear + * meanwhile. + */ + inode = igrab(&btrfs_inode->vfs_inode); + if (inode) { + invalidate_inode_pages2(inode->i_mapping); + iput(inode); + } spin_lock(&root->delalloc_lock); } - spin_unlock(&root->delalloc_lock); } @@ -4286,7 +4291,6 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - list_del_init(&root->delalloc_root); root = btrfs_grab_fs_root(root); BUG_ON(!root); spin_unlock(&fs_info->delalloc_root_lock); -- cgit From dccdb07bc996e9c8de80d06813163ca08288bf73 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 21 Mar 2018 00:20:05 +0100 Subject: btrfs: kill btrfs_fs_info::volume_mutex Mutual exclusion of device add/rm and balance was done by the volume mutex up to version 3.7. The commit 5ac00addc7ac091109 ("Btrfs: disallow mutually exclusive admin operations from user mode") added a bit that essentially tracked the same information. The status bit has an advantage over a mutex that it can be set without restrictions of function context, so it started to be used in the mount-time resuming of balance or device replace. But we don't really need to track the same information in two ways. 1) After the previous cleanups, the main ioctl handlers for add/del/resize copy the EXCL_OP bit next to the volume mutex, here it's clearly safe. 2) Resuming balance during mount or after rw remount will set only the EXCL_OP bit and the volume_mutex is held in the kernel thread that calls btrfs_balance. 3) Resuming device replace during mount or after rw remount is done after balance and is excluded by the EXCL_OP bit. It does not take the volume_mutex at all and completely relies on the EXCL_OP bit. 4) The resuming of balance and dev-replace cannot hapen at the same time as the ioctls cannot be started in parallel. Nevertheless, a crafted image could trigger that and a warning is printed. 5) Balance is normally excluded by EXCL_OP and also uses own mutex to protect against concurrent access to its status data. There's some trickery to maintain the right lock nesting in case we need to reexamine the status in btrfs_ioctl_balance. The volume_mutex is removed and the unlock/lock sequence is left in place as we might expect other waiters to proceed. 6) Similar to 5, the unlock/lock sequence is kept in btrfs_cancel_balance to allow waiters to continue. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 1 - fs/btrfs/extent-tree.c | 2 +- fs/btrfs/ioctl.c | 17 ++++------------- fs/btrfs/volumes.c | 29 +++++++---------------------- 5 files changed, 12 insertions(+), 38 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index de86f2217816..add0e5a3f415 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -838,7 +838,6 @@ struct btrfs_fs_info { struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex chunk_mutex; - struct mutex volume_mutex; /* * this is taken to make sure we don't set block groups ro after diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c3504b4d281b..49a990c8493e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2601,7 +2601,6 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); - mutex_init(&fs_info->volume_mutex); mutex_init(&fs_info->ro_block_group_mutex); init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index df79340332ad..60e65df8134d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4124,7 +4124,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * returns target flags in extended format or 0 if restripe for this * chunk_type is not in progress * - * should be called with either volume_mutex or balance_lock held + * should be called with balance_lock held */ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6c759f2d1301..c690092e8380 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1457,7 +1457,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } - mutex_lock(&fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -1565,7 +1564,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, out_free: kfree(vol_args); out: - mutex_unlock(&fs_info->volume_mutex); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); mnt_drop_write_file(file); return ret; @@ -2432,7 +2430,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; - mutex_lock(&fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2447,7 +2444,6 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - mutex_unlock(&fs_info->volume_mutex); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return ret; } @@ -2480,7 +2476,6 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } - mutex_lock(&fs_info->volume_mutex); if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); @@ -2488,7 +2483,6 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - mutex_unlock(&fs_info->volume_mutex); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); if (!ret) { @@ -2524,7 +2518,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } - mutex_lock(&fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { @@ -2539,7 +2532,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - mutex_unlock(&fs_info->volume_mutex); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out_drop_write: mnt_drop_write_file(file); @@ -4358,7 +4350,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) again: if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { - mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); need_unlock = true; goto locked; @@ -4375,8 +4366,10 @@ again: /* this is either (2) or (3) */ if (!atomic_read(&fs_info->balance_running)) { mutex_unlock(&fs_info->balance_mutex); - if (!mutex_trylock(&fs_info->volume_mutex)) - goto again; + /* + * Lock released to allow other waiters to continue, + * we'll reexamine the status again. + */ mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl && @@ -4387,7 +4380,6 @@ again: } mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); goto again; } else { /* this is (2) */ @@ -4480,7 +4472,6 @@ out_bargs: kfree(bargs); out_unlock: mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); if (need_unlock) clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9e5d27dd00b7..0216bc86f476 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -167,12 +167,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * may be used to exclude some operations from running concurrently without any * modifications to the list (see write_all_supers) * - * volume_mutex - * ------------ - * coarse lock owned by a mounted filesystem; used to exclude some operations - * that cannot run in parallel and affect the higher-level properties of the - * filesystem like: device add/deleting/resize/replace, or balance - * * balance_mutex * ------------- * protects balance structures (status, state) and context accessed from @@ -3206,9 +3200,8 @@ static void update_balance_args(struct btrfs_balance_control *bctl) } /* - * Should be called with both balance and volume mutexes held to - * serialize other volume operations (add_dev/rm_dev/resize) with - * restriper. Same goes for reset_balance_state. + * Should be called with balance mutex held to protect against checking the + * balance status or progress. Same goes for reset_balance_state. */ static void set_balance_control(struct btrfs_balance_control *bctl) { @@ -3785,7 +3778,7 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, } /* - * Should be called with both balance and volume mutexes held + * Should be called with balance mutexe held */ int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) @@ -3951,16 +3944,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; - mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { btrfs_info(fs_info, "continuing balance"); ret = btrfs_balance(fs_info->balance_ctl, NULL); } - mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); return ret; } @@ -4054,13 +4043,9 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "cannot set exclusive op status to balance, resume manually"); - mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); - set_balance_control(bctl); - mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); out: btrfs_free_path(path); return ret; @@ -4117,17 +4102,17 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) atomic_read(&fs_info->balance_running) == 0); mutex_lock(&fs_info->balance_mutex); } else { - /* reset_balance_state needs volume_mutex */ mutex_unlock(&fs_info->balance_mutex); - mutex_lock(&fs_info->volume_mutex); + /* + * Lock released to allow other waiters to continue, we'll + * reexamine the status again. + */ mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { reset_balance_state(fs_info); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } - - mutex_unlock(&fs_info->volume_mutex); } BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); -- cgit From 3009a62f3b18230a000d1a91e9a676036487e834 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 21 Mar 2018 01:31:04 +0100 Subject: btrfs: track running balance in a simpler way Currently fs_info::balance_running is 0 or 1 and does not use the semantics of atomics. The pause and cancel check for 0, that can happen only after __btrfs_balance exits for whatever reason. Parallel calls to balance ioctl may enter btrfs_ioctl_balance multiple times but will block on the balance_mutex that protects the fs_info::flags bit. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 ++++++- fs/btrfs/disk-io.c | 1 - fs/btrfs/ioctl.c | 6 +++--- fs/btrfs/volumes.c | 18 ++++++++++-------- 4 files changed, 19 insertions(+), 13 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index add0e5a3f415..8fdc97312b61 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -739,6 +739,12 @@ struct btrfs_delayed_root; */ #define BTRFS_FS_NEED_ASYNC_COMMIT 17 +/* + * Indicate that balance has been set up from the ioctl and is in the main + * phase. The fs_info::balance_ctl is initialized. + */ +#define BTRFS_FS_BALANCE_RUNNING 18 + struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; @@ -1003,7 +1009,6 @@ struct btrfs_fs_info { /* restriper state */ spinlock_t balance_lock; struct mutex balance_mutex; - atomic_t balance_running; atomic_t balance_pause_req; atomic_t balance_cancel_req; struct btrfs_balance_control *balance_ctl; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 49a990c8493e..7503ff1dd6f0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2164,7 +2164,6 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) { spin_lock_init(&fs_info->balance_lock); mutex_init(&fs_info->balance_mutex); - atomic_set(&fs_info->balance_running, 0); atomic_set(&fs_info->balance_pause_req, 0); atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c690092e8380..ffb224b1c051 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4312,7 +4312,7 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, bargs->flags = bctl->flags; - if (atomic_read(&fs_info->balance_running)) + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) bargs->state |= BTRFS_BALANCE_STATE_RUNNING; if (atomic_read(&fs_info->balance_pause_req)) bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; @@ -4364,7 +4364,7 @@ again: mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { /* this is either (2) or (3) */ - if (!atomic_read(&fs_info->balance_running)) { + if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { mutex_unlock(&fs_info->balance_mutex); /* * Lock released to allow other waiters to continue, @@ -4373,7 +4373,7 @@ again: mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl && - !atomic_read(&fs_info->balance_running)) { + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { /* this is (3) */ need_unlock = false; goto locked; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0216bc86f476..ed230247ae5d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3907,13 +3907,14 @@ int btrfs_balance(struct btrfs_balance_control *bctl, spin_unlock(&fs_info->balance_lock); } - atomic_inc(&fs_info->balance_running); + ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); + set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); - atomic_dec(&fs_info->balance_running); + clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); if (bargs) { memset(bargs, 0, sizeof(*bargs)); @@ -4061,16 +4062,16 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) return -ENOTCONN; } - if (atomic_read(&fs_info->balance_running)) { + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { atomic_inc(&fs_info->balance_pause_req); mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); /* we are good with balance_ctl ripped off from under us */ - BUG_ON(atomic_read(&fs_info->balance_running)); + BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_pause_req); } else { ret = -ENOTCONN; @@ -4096,10 +4097,10 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) * if we are running just wait and return, balance item is * deleted in btrfs_balance in this case */ - if (atomic_read(&fs_info->balance_running)) { + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { mutex_unlock(&fs_info->balance_mutex); wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); mutex_lock(&fs_info->balance_mutex); } else { mutex_unlock(&fs_info->balance_mutex); @@ -4115,7 +4116,8 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) } } - BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); + BUG_ON(fs_info->balance_ctl || + test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); atomic_dec(&fs_info->balance_cancel_req); mutex_unlock(&fs_info->balance_mutex); return 0; -- cgit From 41a6e8913cdff6b30ac53a24641259d117c0b101 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 25 Apr 2018 19:01:43 +0800 Subject: btrfs: move btrfs_raid_group values to btrfs_raid_attr table Add a new member struct btrfs_raid_attr::bg_flag so that btrfs_raid_array can maintain the bit map flag of the raid type, and so we can drop btrfs_raid_group. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/volumes.c | 19 ++++++++----------- fs/btrfs/volumes.h | 2 +- 4 files changed, 11 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7503ff1dd6f0..47dbbe496253 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3521,7 +3521,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { if (raid_type == BTRFS_RAID_SINGLE) continue; - if (!(flags & btrfs_raid_group[raid_type])) + if (!(flags & btrfs_raid_array[raid_type].bg_flag)) continue; min_tolerated = min(min_tolerated, btrfs_raid_array[raid_type]. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ed262d44ff8a..fdd6ac9ee2c6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4180,7 +4180,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) /* First, mask out the RAID levels which aren't possible */ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { if (num_devices >= btrfs_raid_array[raid_type].devs_min) - allowed |= btrfs_raid_group[raid_type]; + allowed |= btrfs_raid_array[raid_type].bg_flag; } allowed &= flags; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6abd0ebb3fea..4fc22a696206 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -41,6 +41,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_increment = 2, .ncopies = 2, .raid_name = "raid10", + .bg_flag = BTRFS_BLOCK_GROUP_RAID10, }, [BTRFS_RAID_RAID1] = { .sub_stripes = 1, @@ -51,6 +52,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_increment = 2, .ncopies = 2, .raid_name = "raid1", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1, }, [BTRFS_RAID_DUP] = { .sub_stripes = 1, @@ -61,6 +63,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_increment = 1, .ncopies = 2, .raid_name = "dup", + .bg_flag = BTRFS_BLOCK_GROUP_DUP, }, [BTRFS_RAID_RAID0] = { .sub_stripes = 1, @@ -71,6 +74,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_increment = 1, .ncopies = 1, .raid_name = "raid0", + .bg_flag = BTRFS_BLOCK_GROUP_RAID0, }, [BTRFS_RAID_SINGLE] = { .sub_stripes = 1, @@ -81,6 +85,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_increment = 1, .ncopies = 1, .raid_name = "single", + .bg_flag = 0, }, [BTRFS_RAID_RAID5] = { .sub_stripes = 1, @@ -91,6 +96,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_increment = 1, .ncopies = 2, .raid_name = "raid5", + .bg_flag = BTRFS_BLOCK_GROUP_RAID5, }, [BTRFS_RAID_RAID6] = { .sub_stripes = 1, @@ -101,6 +107,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_increment = 1, .ncopies = 3, .raid_name = "raid6", + .bg_flag = BTRFS_BLOCK_GROUP_RAID6, }, }; @@ -112,16 +119,6 @@ const char *get_raid_name(enum btrfs_raid_types type) return btrfs_raid_array[type].raid_name; } -const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, - [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, - [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, - [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0, - [BTRFS_RAID_SINGLE] = 0, - [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5, - [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, -}; - /* * Table to convert BTRFS_RAID_* to the error code if minimum number of devices * condition is not met. Zero means there's no corresponding @@ -1899,7 +1896,7 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, } while (read_seqretry(&fs_info->profiles_lock, seq)); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { - if (!(all_avail & btrfs_raid_group[i])) + if (!(all_avail & btrfs_raid_array[i].bg_flag)) continue; if (num_devices < btrfs_raid_array[i].devs_min) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 9056a900aace..b26f53462e8d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -330,11 +330,11 @@ struct btrfs_raid_attr { int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ const char raid_name[8]; /* name of the raid */ + u64 bg_flag; /* block group flag of the raid */ }; extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES]; -extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; struct map_lookup { u64 type; -- cgit From 21a852b01820bdb543df2728cf2f39ecf565255d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 11 May 2018 13:35:25 +0800 Subject: btrfs: Move btrfs_check_super_valid() to avoid forward declaration Move btrfs_check_super_valid() before its single caller to avoid forward declaration. Though such code motion is not recommended as it pollutes git history, in this case the following patches would need to add new forward declarations for static functions that we want to avoid. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 299 ++++++++++++++++++++++++++--------------------------- 1 file changed, 149 insertions(+), 150 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 47dbbe496253..6dff0028d69a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,7 +55,6 @@ static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); -static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); @@ -2441,6 +2440,155 @@ out: return ret; } +static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *sb = fs_info->super_copy; + u64 nodesize = btrfs_super_nodesize(sb); + u64 sectorsize = btrfs_super_sectorsize(sb); + int ret = 0; + + if (btrfs_super_magic(sb) != BTRFS_MAGIC) { + btrfs_err(fs_info, "no valid FS found"); + ret = -EINVAL; + } + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { + btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); + ret = -EINVAL; + } + if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "tree_root level too big: %d >= %d", + btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "chunk_root level too big: %d >= %d", + btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { + btrfs_err(fs_info, "log_root level too big: %d >= %d", + btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + + /* + * Check sectorsize and nodesize first, other check will need it. + * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. + */ + if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { + btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); + ret = -EINVAL; + } + /* Only PAGE SIZE is supported yet */ + if (sectorsize != PAGE_SIZE) { + btrfs_err(fs_info, + "sectorsize %llu not supported yet, only support %lu", + sectorsize, PAGE_SIZE); + ret = -EINVAL; + } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { + btrfs_err(fs_info, "invalid nodesize %llu", nodesize); + ret = -EINVAL; + } + if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { + btrfs_err(fs_info, "invalid leafsize %u, should be %llu", + le32_to_cpu(sb->__unused_leafsize), nodesize); + ret = -EINVAL; + } + + /* Root alignment check */ + if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { + btrfs_warn(fs_info, "tree_root block unaligned: %llu", + btrfs_super_root(sb)); + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { + btrfs_warn(fs_info, "chunk_root block unaligned: %llu", + btrfs_super_chunk_root(sb)); + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { + btrfs_warn(fs_info, "log_root block unaligned: %llu", + btrfs_super_log_root(sb)); + ret = -EINVAL; + } + + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { + btrfs_err(fs_info, + "dev_item UUID does not match fsid: %pU != %pU", + fs_info->fsid, sb->dev_item.fsid); + ret = -EINVAL; + } + + /* + * Hint to catch really bogus numbers, bitflips or so, more exact checks are + * done later + */ + if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { + btrfs_err(fs_info, "bytes_used is too small %llu", + btrfs_super_bytes_used(sb)); + ret = -EINVAL; + } + if (!is_power_of_2(btrfs_super_stripesize(sb))) { + btrfs_err(fs_info, "invalid stripesize %u", + btrfs_super_stripesize(sb)); + ret = -EINVAL; + } + if (btrfs_super_num_devices(sb) > (1UL << 31)) + btrfs_warn(fs_info, "suspicious number of devices: %llu", + btrfs_super_num_devices(sb)); + if (btrfs_super_num_devices(sb) == 0) { + btrfs_err(fs_info, "number of devices is 0"); + ret = -EINVAL; + } + + if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { + btrfs_err(fs_info, "super offset mismatch %llu != %u", + btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); + ret = -EINVAL; + } + + /* + * Obvious sys_chunk_array corruptions, it must hold at least one key + * and one chunk + */ + if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + btrfs_err(fs_info, "system chunk array too big %u > %u", + btrfs_super_sys_array_size(sb), + BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + ret = -EINVAL; + } + if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)) { + btrfs_err(fs_info, "system chunk array too small %u < %zu", + btrfs_super_sys_array_size(sb), + sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)); + ret = -EINVAL; + } + + /* + * The generation is a global counter, we'll trust it more than the others + * but it's still possible that it's the one that's wrong. + */ + if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) + btrfs_warn(fs_info, + "suspicious: generation < chunk_root_generation: %llu < %llu", + btrfs_super_generation(sb), + btrfs_super_chunk_root_generation(sb)); + if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) + && btrfs_super_cache_generation(sb) != (u64)-1) + btrfs_warn(fs_info, + "suspicious: generation < cache_generation: %llu < %llu", + btrfs_super_generation(sb), + btrfs_super_cache_generation(sb)); + + return ret; +} + int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -3973,155 +4121,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, level, first_key); } -static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) -{ - struct btrfs_super_block *sb = fs_info->super_copy; - u64 nodesize = btrfs_super_nodesize(sb); - u64 sectorsize = btrfs_super_sectorsize(sb); - int ret = 0; - - if (btrfs_super_magic(sb) != BTRFS_MAGIC) { - btrfs_err(fs_info, "no valid FS found"); - ret = -EINVAL; - } - if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { - btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", - btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); - ret = -EINVAL; - } - if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "tree_root level too big: %d >= %d", - btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); - ret = -EINVAL; - } - if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "chunk_root level too big: %d >= %d", - btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); - ret = -EINVAL; - } - if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "log_root level too big: %d >= %d", - btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); - ret = -EINVAL; - } - - /* - * Check sectorsize and nodesize first, other check will need it. - * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. - */ - if (!is_power_of_2(sectorsize) || sectorsize < 4096 || - sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { - btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); - ret = -EINVAL; - } - /* Only PAGE SIZE is supported yet */ - if (sectorsize != PAGE_SIZE) { - btrfs_err(fs_info, - "sectorsize %llu not supported yet, only support %lu", - sectorsize, PAGE_SIZE); - ret = -EINVAL; - } - if (!is_power_of_2(nodesize) || nodesize < sectorsize || - nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { - btrfs_err(fs_info, "invalid nodesize %llu", nodesize); - ret = -EINVAL; - } - if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { - btrfs_err(fs_info, "invalid leafsize %u, should be %llu", - le32_to_cpu(sb->__unused_leafsize), nodesize); - ret = -EINVAL; - } - - /* Root alignment check */ - if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { - btrfs_warn(fs_info, "tree_root block unaligned: %llu", - btrfs_super_root(sb)); - ret = -EINVAL; - } - if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { - btrfs_warn(fs_info, "chunk_root block unaligned: %llu", - btrfs_super_chunk_root(sb)); - ret = -EINVAL; - } - if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { - btrfs_warn(fs_info, "log_root block unaligned: %llu", - btrfs_super_log_root(sb)); - ret = -EINVAL; - } - - if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { - btrfs_err(fs_info, - "dev_item UUID does not match fsid: %pU != %pU", - fs_info->fsid, sb->dev_item.fsid); - ret = -EINVAL; - } - - /* - * Hint to catch really bogus numbers, bitflips or so, more exact checks are - * done later - */ - if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { - btrfs_err(fs_info, "bytes_used is too small %llu", - btrfs_super_bytes_used(sb)); - ret = -EINVAL; - } - if (!is_power_of_2(btrfs_super_stripesize(sb))) { - btrfs_err(fs_info, "invalid stripesize %u", - btrfs_super_stripesize(sb)); - ret = -EINVAL; - } - if (btrfs_super_num_devices(sb) > (1UL << 31)) - btrfs_warn(fs_info, "suspicious number of devices: %llu", - btrfs_super_num_devices(sb)); - if (btrfs_super_num_devices(sb) == 0) { - btrfs_err(fs_info, "number of devices is 0"); - ret = -EINVAL; - } - - if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { - btrfs_err(fs_info, "super offset mismatch %llu != %u", - btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); - ret = -EINVAL; - } - - /* - * Obvious sys_chunk_array corruptions, it must hold at least one key - * and one chunk - */ - if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { - btrfs_err(fs_info, "system chunk array too big %u > %u", - btrfs_super_sys_array_size(sb), - BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); - ret = -EINVAL; - } - if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) - + sizeof(struct btrfs_chunk)) { - btrfs_err(fs_info, "system chunk array too small %u < %zu", - btrfs_super_sys_array_size(sb), - sizeof(struct btrfs_disk_key) - + sizeof(struct btrfs_chunk)); - ret = -EINVAL; - } - - /* - * The generation is a global counter, we'll trust it more than the others - * but it's still possible that it's the one that's wrong. - */ - if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) - btrfs_warn(fs_info, - "suspicious: generation < chunk_root_generation: %llu < %llu", - btrfs_super_generation(sb), - btrfs_super_chunk_root_generation(sb)); - if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) - && btrfs_super_cache_generation(sb) != (u64)-1) - btrfs_warn(fs_info, - "suspicious: generation < cache_generation: %llu < %llu", - btrfs_super_generation(sb), - btrfs_super_cache_generation(sb)); - - return ret; -} - static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { /* cleanup FS via transaction */ -- cgit From 069ec957c35e03ba3beb40973379899cfdbf1ee1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 11 May 2018 13:35:26 +0800 Subject: btrfs: Refactor btrfs_check_super_valid Refactor btrfs_check_super_valid: 1) Rename it to btrfs_validate_mount_super() Now it's more obvious when the function should be called. 2) Extract core check routine into validate_super() Later write time check can reuse it, and if needed, we could also use validate_super() to check each super block. 3) Add more comments about btrfs_validate_mount_super() Mostly about what it doesn't check and when it should be called. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba [ rename to validate_super ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6dff0028d69a..eff867370036 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2440,9 +2440,19 @@ out: return ret; } -static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) +/* + * Real super block validation + * NOTE: super csum type and incompat features will not be checked here. + * + * @sb: super block to check + * @mirror_num: the super block number to check its bytenr: + * 0 the primary (1st) sb + * 1, 2 2nd and 3rd backup copy + * -1 skip bytenr check + */ +static int validate_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb, int mirror_num) { - struct btrfs_super_block *sb = fs_info->super_copy; u64 nodesize = btrfs_super_nodesize(sb); u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; @@ -2545,7 +2555,8 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) ret = -EINVAL; } - if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { + if (mirror_num >= 0 && + btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { btrfs_err(fs_info, "super offset mismatch %llu != %u", btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); ret = -EINVAL; @@ -2589,6 +2600,16 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) return ret; } +/* + * Validation of super block at mount time. + * Some checks already done early at mount time, like csum type and incompat + * flags will be skipped. + */ +static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) +{ + return validate_super(fs_info, fs_info->super_copy, 0); +} + int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -2814,7 +2835,7 @@ int open_ctree(struct super_block *sb, memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - ret = btrfs_check_super_valid(fs_info); + ret = btrfs_validate_mount_super(fs_info); if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; -- cgit From 75cb857d2618cca810b8bf13ba5b2ceaaf26ba3d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 11 May 2018 13:35:27 +0800 Subject: btrfs: Do super block verification before writing it to disk There are already 2 reports about strangely corrupted super blocks, where csum still matches but extra garbage gets slipped into super block. The corruption would looks like: ------ superblock: bytenr=65536, device=/dev/sdc1 --------------------------------------------------------- csum_type 41700 (INVALID) csum 0x3b252d3a [match] bytenr 65536 flags 0x1 ( WRITTEN ) magic _BHRfS_M [match] ... incompat_flags 0x5b22400000000169 ( MIXED_BACKREF | COMPRESS_LZO | BIG_METADATA | EXTENDED_IREF | SKINNY_METADATA | unknown flag: 0x5b22400000000000 ) ... ------ Or ------ superblock: bytenr=65536, device=/dev/mapper/x --------------------------------------------------------- csum_type 35355 (INVALID) csum_size 32 csum 0xf0dbeddd [match] bytenr 65536 flags 0x1 ( WRITTEN ) magic _BHRfS_M [match] ... incompat_flags 0x176d200000000169 ( MIXED_BACKREF | COMPRESS_LZO | BIG_METADATA | EXTENDED_IREF | SKINNY_METADATA | unknown flag: 0x176d200000000000 ) ------ Obviously, csum_type and incompat_flags get some garbage, but its csum still matches, which means kernel calculates the csum based on corrupted super block memory. And after manually fixing these values, the filesystem is completely healthy without any problem exposed by btrfs check. Although the cause is still unknown, at least detect it and prevent further corruption. Both reports have same symptoms, there's an overwrite on offset 192 of the superblock, by 4 bytes. The superblock structure is not allocated or freed and stays in the memory for the whole filesystem lifetime, so it's not a use-after-free kind of error on someone else's leaked page. As a vague point for the problable cause is mentioning of other system freezing related to graphic card drivers. Reported-by: Ken Swenson Reported-by: Ben Parsons <9parsonsb@gmail.com> Signed-off-by: Qu Wenruo Reviewed-by: David Sterba [ add brief analysis of the reports ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eff867370036..a16385091572 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2610,6 +2610,41 @@ static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) return validate_super(fs_info, fs_info->super_copy, 0); } +/* + * Validation of super block at write time. + * Some checks like bytenr check will be skipped as their values will be + * overwritten soon. + * Extra checks like csum type and incompat flags will be done here. + */ +static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, + struct btrfs_super_block *sb) +{ + int ret; + + ret = validate_super(fs_info, sb, -1); + if (ret < 0) + goto out; + if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) { + ret = -EUCLEAN; + btrfs_err(fs_info, "invalid csum type, has %u want %u", + btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); + goto out; + } + if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "invalid incompat flags, has 0x%llx valid mask 0x%llx", + btrfs_super_incompat_flags(sb), + (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP); + goto out; + } +out: + if (ret < 0) + btrfs_err(fs_info, + "super block corruption detected before writing it to disk"); + return ret; +} + int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -3770,6 +3805,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + ret = btrfs_validate_write_super(fs_info, sb); + if (ret < 0) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + btrfs_handle_fs_error(fs_info, -EUCLEAN, + "unexpected superblock corruption detected"); + return -EUCLEAN; + } + ret = write_dev_supers(dev, sb, max_mirrors); if (ret) total_errors++; -- cgit From a575ceeb1338e7eae6d14e223b077b3c6fd3bb6b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 11 May 2018 13:13:38 -0700 Subject: Btrfs: get rid of unused orphan infrastructure Now that we don't keep long-standing reservations for orphan items, root->orphan_block_rsv isn't used. We can git rid of it, along with: - root->orphan_lock, which was used to protect root->orphan_block_rsv - root->orphan_inodes, which was used as a refcount for root->orphan_block_rsv - BTRFS_INODE_ORPHAN_META_RESERVED, which was used to track reservations in root->orphan_block_rsv - btrfs_orphan_commit_root(), which was the last user of any of these and does nothing else Reviewed-by: Nikolay Borisov Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 1 - fs/btrfs/ctree.h | 8 -------- fs/btrfs/disk-io.c | 9 --------- fs/btrfs/extent-tree.c | 38 -------------------------------------- fs/btrfs/inode.c | 43 +------------------------------------------ fs/btrfs/transaction.c | 1 - 6 files changed, 1 insertion(+), 99 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index cb7dc0aa4253..4807cde0313d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -20,7 +20,6 @@ * new data the application may have written before commit. */ #define BTRFS_INODE_ORDERED_DATA_CLOSE 0 -#define BTRFS_INODE_ORPHAN_META_RESERVED 1 #define BTRFS_INODE_DUMMY 2 #define BTRFS_INODE_IN_DEFRAG 3 #define BTRFS_INODE_HAS_ASYNC_EXTENT 5 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 71aecf0b7cf5..bbb358143ded 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1223,9 +1223,6 @@ struct btrfs_root { spinlock_t log_extents_lock[2]; struct list_head logged_list[2]; - spinlock_t orphan_lock; - atomic_t orphan_inodes; - struct btrfs_block_rsv *orphan_block_rsv; int orphan_cleanup_state; spinlock_t inode_lock; @@ -2768,9 +2765,6 @@ void btrfs_delalloc_release_space(struct inode *inode, void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); -int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); -void btrfs_orphan_release_metadata(struct btrfs_inode *inode); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, @@ -3228,8 +3222,6 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); -void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a16385091572..d8d3b73680ef 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1184,7 +1184,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); root->block_rsv = NULL; - root->orphan_block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); @@ -1194,7 +1193,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->logged_list[0]); INIT_LIST_HEAD(&root->logged_list[1]); - spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); @@ -1215,7 +1213,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); - atomic_set(&root->orphan_inodes, 0); refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); root->log_transid = 0; @@ -3884,8 +3881,6 @@ static void free_fs_root(struct btrfs_root *root) { iput(root->ino_cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); - btrfs_free_block_rsv(root->fs_info, root->orphan_block_rsv); - root->orphan_block_rsv = NULL; if (root->anon_dev) free_anon_bdev(root->anon_dev); if (root->subv_writers) @@ -3976,7 +3971,6 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) void close_ctree(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root = fs_info->tree_root; int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); @@ -4072,9 +4066,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) btrfs_free_stripe_hash_table(fs_info); btrfs_free_ref_cache(fs_info); - __btrfs_free_block_rsv(root->orphan_block_rsv); - root->orphan_block_rsv = NULL; - while (!list_empty(&fs_info->pinned_chunks)) { struct extent_map *em; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 47edb3af3f9f..ccf2690f7ca1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5949,44 +5949,6 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) trans->chunk_bytes_reserved = 0; } -/* Can only return 0 or -ENOSPC */ -int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; - /* - * We always use trans->block_rsv here as we will have reserved space - * for our orphan when starting the transaction, using get_block_rsv() - * here will sometimes make us choose the wrong block rsv as we could be - * doing a reloc inode for a non refcounted root. - */ - struct btrfs_block_rsv *src_rsv = trans->block_rsv; - struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; - - /* - * We need to hold space in order to delete our orphan item once we've - * added it, so this takes the reservation so we can release it later - * when we are truly done with the orphan item. - */ - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - - trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), - num_bytes, 1); - return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); -} - -void btrfs_orphan_release_metadata(struct btrfs_inode *inode) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - - trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), - num_bytes, 0); - btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes); -} - /* * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation * root: the root of the parent directory diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f8af40d02595..b247ea31c436 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3297,42 +3297,6 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->delayed_iput_lock); } -/* - * This is called in transaction commit time. If there are no orphan - * files in the subvolume, it removes orphan item and frees block_rsv - * structure. - */ -void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv; - - if (atomic_read(&root->orphan_inodes) || - root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) - return; - - spin_lock(&root->orphan_lock); - if (atomic_read(&root->orphan_inodes)) { - spin_unlock(&root->orphan_lock); - return; - } - - if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { - spin_unlock(&root->orphan_lock); - return; - } - - block_rsv = root->orphan_block_rsv; - root->orphan_block_rsv = NULL; - spin_unlock(&root->orphan_lock); - - if (block_rsv) { - WARN_ON(block_rsv->size > 0); - btrfs_free_block_rsv(fs_info, block_rsv); - } -} - /* * This creates an orphan entry for the given inode in case something goes wrong * in the middle of an unlink. @@ -3526,12 +3490,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; - if (root->orphan_block_rsv) - btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, - (u64)-1); - - if (root->orphan_block_rsv || - test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { + if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ff841abb756e..2544acc33045 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1245,7 +1245,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); - btrfs_orphan_commit_root(trans, root); btrfs_save_ino_cache(root, trans); -- cgit From ff76a864cc94d055b4239fb2cd13e8e633dc7aac Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 18 May 2018 10:59:35 +0800 Subject: Btrfs: add parent_transid parameter to veirfy_level_key As verify_level_key() is checked after verify_parent_transid(), i.e. if (verify_parent_transid()) ret = -EIO; else if (verify_level_key()) ret = -EUCLEAN; if parent_transid is 0, verify_parent_transid() skips verifying parent_transid and considers eb as valid, and if verify_level_key() reports something wrong, we're not going to know if it's caused by corrupted metadata or non-checkecd eb (e.g. stale eb). The stale eb can be from an outdated raid1 mirror after a degraded mount, see eg "btrfs: fix reading stale metadata blocks after degraded raid1 mounts" (02a3307aa9c20b4f66262) for more details. @parent_transid is able to tell whether the eb's generation has been verified by the caller. Signed-off-by: Liu Bo Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d8d3b73680ef..205092dc9390 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -415,7 +415,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, static int verify_level_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int level, - struct btrfs_key *first_key) + struct btrfs_key *first_key, u64 parent_transid) { int found_level; struct btrfs_key found_key; @@ -453,10 +453,11 @@ static int verify_level_key(struct btrfs_fs_info *fs_info, if (ret) { WARN_ON(1); btrfs_err(fs_info, -"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)", - eb->start, first_key->objectid, first_key->type, - first_key->offset, found_key.objectid, - found_key.type, found_key.offset); +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, parent_transid, first_key->objectid, + first_key->type, first_key->offset, + found_key.objectid, found_key.type, + found_key.offset); } #endif return ret; @@ -492,7 +493,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, parent_transid, 0)) ret = -EIO; else if (verify_level_key(fs_info, eb, level, - first_key)) + first_key, parent_transid)) ret = -EUCLEAN; else break; -- cgit