From 313b085851c13ca08320372a05a7047ea25d3dd4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:18:26 -0400 Subject: btrfs: move btrfs_scratch_superblocks into btrfs_dev_replace_finishing We need to move the closing of the src_device out of all the device replace locking, but we definitely want to zero out the superblock before we commit the last time to make sure the device is properly removed. Handle this by pushing btrfs_scratch_superblocks into btrfs_dev_replace_finishing, and then later on we'll move the src_device closing and freeing stuff where we need it to be. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index db93909b25e0..7cf48aeb6f14 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -745,6 +745,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* replace the sysfs entry */ btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); btrfs_sysfs_update_devid(tgt_device); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) + btrfs_scratch_superblocks(fs_info, src_device->bdev, + src_device->name->str); btrfs_rm_dev_replace_free_srcdev(src_device); /* write back the superblocks */ -- cgit From a466c85edc6fbe845facc8f57c408c544f42899e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:18:27 -0400 Subject: btrfs: move btrfs_rm_dev_replace_free_srcdev outside of all locks When closing and freeing the source device we could end up doing our final blkdev_put() on the bdev, which will grab the bd_mutex. As such we want to be holding as few locks as possible, so move this call outside of the dev_replace->lock_finishing_cancel_unmount lock. Since we're modifying the fs_devices we need to make sure we're holding the uuid_mutex here, so take that as well. There's a report from syzbot probably hitting one of the cases where the bd_mutex and device_list_mutex are taken in the wrong order, however it's not with device replace, like this patch fixes. As there's no reproducer available so far, we can't verify the fix. https://lore.kernel.org/lkml/000000000000fc04d105afcf86d7@google.com/ dashboard link: https://syzkaller.appspot.com/bug?extid=84a0634dc5d21d488419 WARNING: possible circular locking dependency detected 5.9.0-rc5-syzkaller #0 Not tainted ------------------------------------------------------ syz-executor.0/6878 is trying to acquire lock: ffff88804c17d780 (&bdev->bd_mutex){+.+.}-{3:3}, at: blkdev_put+0x30/0x520 fs/block_dev.c:1804 but task is already holding lock: ffff8880908cfce0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: close_fs_devices.part.0+0x2e/0x800 fs/btrfs/volumes.c:1159 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #4 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __mutex_lock_common kernel/locking/mutex.c:956 [inline] __mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103 btrfs_finish_chunk_alloc+0x281/0xf90 fs/btrfs/volumes.c:5255 btrfs_create_pending_block_groups+0x2f3/0x700 fs/btrfs/block-group.c:2109 __btrfs_end_transaction+0xf5/0x690 fs/btrfs/transaction.c:916 find_free_extent_update_loop fs/btrfs/extent-tree.c:3807 [inline] find_free_extent+0x23b7/0x2e60 fs/btrfs/extent-tree.c:4127 btrfs_reserve_extent+0x166/0x460 fs/btrfs/extent-tree.c:4206 cow_file_range+0x3de/0x9b0 fs/btrfs/inode.c:1063 btrfs_run_delalloc_range+0x2cf/0x1410 fs/btrfs/inode.c:1838 writepage_delalloc+0x150/0x460 fs/btrfs/extent_io.c:3439 __extent_writepage+0x441/0xd00 fs/btrfs/extent_io.c:3653 extent_write_cache_pages.constprop.0+0x69d/0x1040 fs/btrfs/extent_io.c:4249 extent_writepages+0xcd/0x2b0 fs/btrfs/extent_io.c:4370 do_writepages+0xec/0x290 mm/page-writeback.c:2352 __writeback_single_inode+0x125/0x1400 fs/fs-writeback.c:1461 writeback_sb_inodes+0x53d/0xf40 fs/fs-writeback.c:1721 wb_writeback+0x2ad/0xd40 fs/fs-writeback.c:1894 wb_do_writeback fs/fs-writeback.c:2039 [inline] wb_workfn+0x2dc/0x13e0 fs/fs-writeback.c:2080 process_one_work+0x94c/0x1670 kernel/workqueue.c:2269 worker_thread+0x64c/0x1120 kernel/workqueue.c:2415 kthread+0x3b5/0x4a0 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 -> #3 (sb_internal#2){.+.+}-{0:0}: percpu_down_read include/linux/percpu-rwsem.h:51 [inline] __sb_start_write+0x234/0x470 fs/super.c:1672 sb_start_intwrite include/linux/fs.h:1690 [inline] start_transaction+0xbe7/0x1170 fs/btrfs/transaction.c:624 find_free_extent_update_loop fs/btrfs/extent-tree.c:3789 [inline] find_free_extent+0x25e1/0x2e60 fs/btrfs/extent-tree.c:4127 btrfs_reserve_extent+0x166/0x460 fs/btrfs/extent-tree.c:4206 cow_file_range+0x3de/0x9b0 fs/btrfs/inode.c:1063 btrfs_run_delalloc_range+0x2cf/0x1410 fs/btrfs/inode.c:1838 writepage_delalloc+0x150/0x460 fs/btrfs/extent_io.c:3439 __extent_writepage+0x441/0xd00 fs/btrfs/extent_io.c:3653 extent_write_cache_pages.constprop.0+0x69d/0x1040 fs/btrfs/extent_io.c:4249 extent_writepages+0xcd/0x2b0 fs/btrfs/extent_io.c:4370 do_writepages+0xec/0x290 mm/page-writeback.c:2352 __writeback_single_inode+0x125/0x1400 fs/fs-writeback.c:1461 writeback_sb_inodes+0x53d/0xf40 fs/fs-writeback.c:1721 wb_writeback+0x2ad/0xd40 fs/fs-writeback.c:1894 wb_do_writeback fs/fs-writeback.c:2039 [inline] wb_workfn+0x2dc/0x13e0 fs/fs-writeback.c:2080 process_one_work+0x94c/0x1670 kernel/workqueue.c:2269 worker_thread+0x64c/0x1120 kernel/workqueue.c:2415 kthread+0x3b5/0x4a0 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 -> #2 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}: __flush_work+0x60e/0xac0 kernel/workqueue.c:3041 wb_shutdown+0x180/0x220 mm/backing-dev.c:355 bdi_unregister+0x174/0x590 mm/backing-dev.c:872 del_gendisk+0x820/0xa10 block/genhd.c:933 loop_remove drivers/block/loop.c:2192 [inline] loop_control_ioctl drivers/block/loop.c:2291 [inline] loop_control_ioctl+0x3b1/0x480 drivers/block/loop.c:2257 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl fs/ioctl.c:739 [inline] __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:739 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (loop_ctl_mutex){+.+.}-{3:3}: __mutex_lock_common kernel/locking/mutex.c:956 [inline] __mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103 lo_open+0x19/0xd0 drivers/block/loop.c:1893 __blkdev_get+0x759/0x1aa0 fs/block_dev.c:1507 blkdev_get fs/block_dev.c:1639 [inline] blkdev_open+0x227/0x300 fs/block_dev.c:1753 do_dentry_open+0x4b9/0x11b0 fs/open.c:817 do_open fs/namei.c:3251 [inline] path_openat+0x1b9a/0x2730 fs/namei.c:3368 do_filp_open+0x17e/0x3c0 fs/namei.c:3395 do_sys_openat2+0x16d/0x420 fs/open.c:1168 do_sys_open fs/open.c:1184 [inline] __do_sys_open fs/open.c:1192 [inline] __se_sys_open fs/open.c:1188 [inline] __x64_sys_open+0x119/0x1c0 fs/open.c:1188 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&bdev->bd_mutex){+.+.}-{3:3}: check_prev_add kernel/locking/lockdep.c:2496 [inline] check_prevs_add kernel/locking/lockdep.c:2601 [inline] validate_chain kernel/locking/lockdep.c:3218 [inline] __lock_acquire+0x2a96/0x5780 kernel/locking/lockdep.c:4426 lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006 __mutex_lock_common kernel/locking/mutex.c:956 [inline] __mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103 blkdev_put+0x30/0x520 fs/block_dev.c:1804 btrfs_close_bdev fs/btrfs/volumes.c:1117 [inline] btrfs_close_bdev fs/btrfs/volumes.c:1107 [inline] btrfs_close_one_device fs/btrfs/volumes.c:1133 [inline] close_fs_devices.part.0+0x1a4/0x800 fs/btrfs/volumes.c:1161 close_fs_devices fs/btrfs/volumes.c:1193 [inline] btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179 close_ctree+0x688/0x6cb fs/btrfs/disk-io.c:4149 generic_shutdown_super+0x144/0x370 fs/super.c:464 kill_anon_super+0x36/0x60 fs/super.c:1108 btrfs_kill_super+0x38/0x50 fs/btrfs/super.c:2265 deactivate_locked_super+0x94/0x160 fs/super.c:335 deactivate_super+0xad/0xd0 fs/super.c:366 cleanup_mnt+0x3a3/0x530 fs/namespace.c:1118 task_work_run+0xdd/0x190 kernel/task_work.c:141 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_user_mode_loop kernel/entry/common.c:163 [inline] exit_to_user_mode_prepare+0x1e1/0x200 kernel/entry/common.c:190 syscall_exit_to_user_mode+0x7e/0x2e0 kernel/entry/common.c:265 entry_SYSCALL_64_after_hwframe+0x44/0xa9 other info that might help us debug this: Chain exists of: &bdev->bd_mutex --> sb_internal#2 --> &fs_devs->device_list_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fs_devs->device_list_mutex); lock(sb_internal#2); lock(&fs_devs->device_list_mutex); lock(&bdev->bd_mutex); *** DEADLOCK *** 3 locks held by syz-executor.0/6878: #0: ffff88809070c0e0 (&type->s_umount_key#70){++++}-{3:3}, at: deactivate_super+0xa5/0xd0 fs/super.c:365 #1: ffffffff8a5b37a8 (uuid_mutex){+.+.}-{3:3}, at: btrfs_close_devices+0x23/0x1f0 fs/btrfs/volumes.c:1178 #2: ffff8880908cfce0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: close_fs_devices.part.0+0x2e/0x800 fs/btrfs/volumes.c:1159 stack backtrace: CPU: 0 PID: 6878 Comm: syz-executor.0 Not tainted 5.9.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x198/0x1fd lib/dump_stack.c:118 check_noncircular+0x324/0x3e0 kernel/locking/lockdep.c:1827 check_prev_add kernel/locking/lockdep.c:2496 [inline] check_prevs_add kernel/locking/lockdep.c:2601 [inline] validate_chain kernel/locking/lockdep.c:3218 [inline] __lock_acquire+0x2a96/0x5780 kernel/locking/lockdep.c:4426 lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006 __mutex_lock_common kernel/locking/mutex.c:956 [inline] __mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103 blkdev_put+0x30/0x520 fs/block_dev.c:1804 btrfs_close_bdev fs/btrfs/volumes.c:1117 [inline] btrfs_close_bdev fs/btrfs/volumes.c:1107 [inline] btrfs_close_one_device fs/btrfs/volumes.c:1133 [inline] close_fs_devices.part.0+0x1a4/0x800 fs/btrfs/volumes.c:1161 close_fs_devices fs/btrfs/volumes.c:1193 [inline] btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179 close_ctree+0x688/0x6cb fs/btrfs/disk-io.c:4149 generic_shutdown_super+0x144/0x370 fs/super.c:464 kill_anon_super+0x36/0x60 fs/super.c:1108 btrfs_kill_super+0x38/0x50 fs/btrfs/super.c:2265 deactivate_locked_super+0x94/0x160 fs/super.c:335 deactivate_super+0xad/0xd0 fs/super.c:366 cleanup_mnt+0x3a3/0x530 fs/namespace.c:1118 task_work_run+0xdd/0x190 kernel/task_work.c:141 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_user_mode_loop kernel/entry/common.c:163 [inline] exit_to_user_mode_prepare+0x1e1/0x200 kernel/entry/common.c:190 syscall_exit_to_user_mode+0x7e/0x2e0 kernel/entry/common.c:265 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x460027 RSP: 002b:00007fff59216328 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 0000000000076035 RCX: 0000000000460027 RDX: 0000000000403188 RSI: 0000000000000002 RDI: 00007fff592163d0 RBP: 0000000000000333 R08: 0000000000000000 R09: 000000000000000b R10: 0000000000000005 R11: 0000000000000246 R12: 00007fff59217460 R13: 0000000002df2a60 R14: 0000000000000000 R15: 00007fff59217460 Signed-off-by: Josef Bacik [ add syzbot reference ] Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 3 ++- fs/btrfs/volumes.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7cf48aeb6f14..62aece1bb8ec 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -748,7 +748,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) btrfs_scratch_superblocks(fs_info, src_device->bdev, src_device->name->str); - btrfs_rm_dev_replace_free_srcdev(src_device); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -757,6 +756,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + btrfs_rm_dev_replace_free_srcdev(src_device); + return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 01ebdb69fbdb..1997a7d67f22 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2224,6 +2224,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) struct btrfs_fs_info *fs_info = srcdev->fs_info; struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; + mutex_lock(&uuid_mutex); + btrfs_close_bdev(srcdev); synchronize_rcu(); btrfs_free_device(srcdev); @@ -2252,6 +2254,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) close_fs_devices(fs_devices); free_fs_devices(fs_devices); } + mutex_unlock(&uuid_mutex); } void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) -- cgit From 4c8f353272dd1262013873990c0fafd0e3c8f274 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 23 Sep 2020 15:30:16 +0100 Subject: btrfs: fix filesystem corruption after a device replace We use a device's allocation state tree to track ranges in a device used for allocated chunks, and we set ranges in this tree when allocating a new chunk. However after a device replace operation, we were not setting the allocated ranges in the new device's allocation state tree, so that tree is empty after a device replace. This means that a fitrim operation after a device replace will trim the device ranges that have allocated chunks and extents, as we trim every range for which there is not a range marked in the device's allocation state tree. It is also important during chunk allocation, since the device's allocation state is used to determine if a range is already allocated when allocating a new chunk. This is trivial to reproduce and the following script triggers the bug: $ cat reproducer.sh #!/bin/bash DEV1="/dev/sdg" DEV2="/dev/sdh" DEV3="/dev/sdi" wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null # Create a raid1 test fs on 2 devices. mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null mount $DEV1 /mnt/btrfs xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo echo "Starting to replace $DEV1 with $DEV3" btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs echo echo "Running fstrim" fstrim /mnt/btrfs echo echo "Unmounting filesystem" umount /mnt/btrfs echo "Mounting filesystem in degraded mode using $DEV3 only" wipefs -a $DEV1 $DEV2 &> /dev/null mount -o degraded $DEV3 /mnt/btrfs if [ $? -ne 0 ]; then dmesg | tail echo echo "Failed to mount in degraded mode" exit 1 fi echo echo "File foo data (expected all bytes = 0xab):" od -A d -t x1 /mnt/btrfs/foo umount /mnt/btrfs When running the reproducer: $ ./replace-test.sh wrote 10485760/10485760 bytes at offset 0 10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec) Starting to replace /dev/sdg with /dev/sdi Running fstrim Unmounting filesystem Mounting filesystem in degraded mode using /dev/sdi only mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error. [19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started [19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished [19582.208293] BTRFS info (device sdi): allowing degraded mounts [19582.208298] BTRFS info (device sdi): disk space caching is enabled [19582.208301] BTRFS info (device sdi): has skinny extents [19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing [19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed [19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0 [19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5 [19582.231576] BTRFS error (device sdi): open_ctree failed Failed to mount in degraded mode So fix by setting all allocated ranges in the replace target device when the replace operation is finishing, when we are holding the chunk mutex and we can not race with new chunk allocations. A test case for fstests follows soon. Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree") CC: stable@vger.kernel.org # 5.2+ Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 62aece1bb8ec..e4a1c6afe35d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -599,6 +599,37 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) wake_up(&fs_info->dev_replace.replace_wait); } +/* + * When finishing the device replace, before swapping the source device with the + * target device we must update the chunk allocation state in the target device, + * as it is empty because replace works by directly copying the chunks and not + * through the normal chunk allocation path. + */ +static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_state *cached_state = NULL; + u64 start = 0; + u64 found_start; + u64 found_end; + int ret = 0; + + lockdep_assert_held(&srcdev->fs_info->chunk_mutex); + + while (!find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { + ret = set_extent_bits(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED); + if (ret) + break; + start = found_end + 1; + } + + free_extent_state(cached_state); + return ret; +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -673,8 +704,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, dev_replace->time_stopped = ktime_get_real_seconds(); dev_replace->item_needs_writeback = 1; - /* replace old device with new one in mapping tree */ + /* + * Update allocation state in the new device and replace the old device + * with the new one in the mapping tree. + */ if (!scrub_ret) { + scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device); + if (scrub_ret) + goto error; btrfs_dev_replace_update_device_in_mapping_tree(fs_info, src_device, tgt_device); @@ -685,6 +722,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); +error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); -- cgit From b49121393f583635b78abae4037fa1e772b2a042 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:12 -0400 Subject: btrfs: change nr to u64 in btrfs_start_delalloc_roots We have btrfs_wait_ordered_roots() which takes a u64 for nr, but btrfs_start_delalloc_roots() that takes an int for nr, which makes using them in conjunction, especially for something like (u64)-1, annoying and inconsistent. Fix btrfs_start_delalloc_roots() to take a u64 for nr and adjust start_delalloc_inodes() and it's callers appropriately. This means we've adjusted start_delalloc_inodes() to take a pointer of nr since we want to preserve the ability for start-delalloc_inodes() to return an error, so simply make it do the nr adjusting as necessary. Part of adjusting the callers to this means changing btrfs_writeback_inodes_sb_nr() to take a u64 for items. This may be confusing because it seems unrelated, but the caller of btrfs_writeback_inodes_sb_nr() already passes in a u64, it's just the function variable that needs to be changed. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/inode.c | 26 ++++++++++---------------- fs/btrfs/ioctl.c | 2 +- fs/btrfs/space-info.c | 2 +- 5 files changed, 14 insertions(+), 20 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9a72896bed2e..e2ec4f067857 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2956,7 +2956,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e4a1c6afe35d..61d800a149b0 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -661,7 +661,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index df08ace5d914..48b8b5f09f47 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9387,7 +9387,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) +static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -9427,9 +9427,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); - ret++; - if (nr != -1 && ret >= nr) - goto out; + if (*nr != U64_MAX) { + (*nr)--; + if (*nr == 0) + goto out; + } cond_resched(); spin_lock(&root->delalloc_lock); } @@ -9454,18 +9456,15 @@ out: int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + u64 nr = U64_MAX; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1, true); - if (ret > 0) - ret = 0; - return ret; + return start_delalloc_inodes(root, &nr, true); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr) { struct btrfs_root *root; struct list_head splice; @@ -9488,15 +9487,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr, false); + ret = start_delalloc_inodes(root, &nr, false); btrfs_put_root(root); if (ret < 0) goto out; - - if (nr != -1) { - nr -= ret; - WARN_ON(nr < 0); - } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2d9109d9e98f..a5355a16eabb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4897,7 +4897,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 475968ccbd1d..266ff14e4c8c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -477,7 +477,7 @@ again: } static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, int nr_items) + unsigned long nr_pages, u64 nr_items) { struct super_block *sb = fs_info->sb; -- cgit From c3e1f96c37d0f863ac6ddcafac6921fd46f1aa37 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 25 Aug 2020 10:02:32 -0500 Subject: btrfs: enumerate the type of exclusive operation in progress Instead of using a flag bit for exclusive operation, use a variable to store which exclusive operation is being performed. Introduce an API to start and finish an exclusive operation. This would enable another way for tools to check which operation is running on why starting an exclusive operation failed. The followup patch adds a sysfs_notify() to alert userspace when the state changes, so userspace can perform select() on it to get notified of the change. This would enable us to enqueue a command which will wait for current exclusive operation to complete before issuing the next exclusive operation. This has been done synchronously as opposed to a background process, or else error collection (if any) will become difficult. Reviewed-by: Nikolay Borisov Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba [ update comments ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 24 +++++++++++++++++++----- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/inode.c | 14 +++++++------- fs/btrfs/ioctl.c | 44 +++++++++++++++++++++++++++----------------- fs/btrfs/volumes.c | 20 ++++++++++---------- 5 files changed, 65 insertions(+), 41 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af2bd059daae..98c5f6178efc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -541,11 +541,6 @@ enum { BTRFS_FS_QUOTA_OVERRIDE, /* Used to record internally whether fs has been frozen */ BTRFS_FS_FROZEN, - /* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ - BTRFS_FS_EXCL_OP, /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. @@ -566,6 +561,19 @@ enum { BTRFS_FS_DISCARD_RUNNING, }; +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -937,6 +945,9 @@ struct btrfs_fs_info { */ int send_in_progress; + /* Type of exclusive operation running */ + unsigned long exclusive_operation; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -3032,6 +3043,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); /* file.c */ int __init btrfs_auto_defrag_init(void); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 61d800a149b0..14901ca2508d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -1025,7 +1025,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * should never allow both to start and pause. We don't want to allow * dev-replace to start anyway. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; @@ -1062,7 +1062,7 @@ static int btrfs_dev_replace_kthread(void *data) ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret && ret != -ECANCELED); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a66145aac710..cce6f8789a4e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10015,14 +10015,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, /* * Balance or device remove/replace/resize can move stuff around from - * under us. The EXCL_OP flag makes sure they aren't running/won't run - * concurrently while we are mapping the swap extents, and - * fs_info->swapfile_pins prevents them from running while the swap file - * is active and moving the extents. Note that this also prevents a - * concurrent device add which isn't actually necessary, but it's not + * under us. The exclop protection makes sure they aren't running/won't + * run concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap + * file is active and moving the extents. Note that this also prevents + * a concurrent device add which isn't actually necessary, but it's not * really worth the trouble to allow it. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); return -EBUSY; @@ -10168,7 +10168,7 @@ out: if (ret) btrfs_swap_deactivate(file); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (ret) return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b91444e810a5..f30231596fe0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -378,6 +378,17 @@ static int check_xflags(unsigned int flags) return 0; } +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); +} + /* * Set the xflags from the internal inode flags. The remaining items of fsxattr * are zeroed. @@ -1639,7 +1650,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1753,7 +1764,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, out_free: kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); mnt_drop_write_file(file); return ret; } @@ -3127,7 +3138,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; vol_args = memdup_user(arg, sizeof(*vol_args)); @@ -3144,7 +3155,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -3173,7 +3184,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -3184,7 +3195,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -3215,7 +3226,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -3233,7 +3244,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out_drop_write: mnt_drop_write_file(file); @@ -3737,11 +3748,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -3952,7 +3963,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { mutex_lock(&fs_info->balance_mutex); need_unlock = true; goto locked; @@ -3998,7 +4009,6 @@ again: } locked: - BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4053,10 +4063,10 @@ locked: do_balance: /* - * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to - * btrfs_balance. bctl is freed in reset_balance_state, or, if - * restriper was paused all the way until unmount, in free_fs_info. - * The flag should be cleared after reset_balance_state. + * Ownership of bctl and exclusive operation goes to btrfs_balance. + * bctl is freed in reset_balance_state, or, if restriper was paused + * all the way until unmount, in free_fs_info. The flag should be + * cleared after reset_balance_state. */ need_unlock = false; @@ -4075,7 +4085,7 @@ out_bargs: out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c86ffad04641..cbebb5d06773 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -291,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * balance_mutex * * - * Exclusive operations, BTRFS_FS_EXCL_OP - * ====================================== + * Exclusive operations + * ==================== * * Maintains the exclusivity of the following operations that apply to the * whole filesystem and cannot run in parallel. @@ -318,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * - system power-cycle and filesystem mounted as read-only * - filesystem or device errors leading to forced read-only * - * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. - * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * The status of exclusive operation is set and cleared atomically. + * During the course of Paused state, fs_info::exclusive_operation remains set. * A device operation in Paused or Running state can be canceled or resumed * either by ioctl (Balance only) or when remounted as read-write. - * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * The exclusive status is cleared when the device operation is canceled or * completed. */ @@ -4033,7 +4033,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, /* * rw_devices will not change at the moment, device add/delete/replace - * are excluded by EXCL_OP + * are exclusive */ num_devices = fs_info->fs_devices->rw_devices; @@ -4169,7 +4169,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if ((ret && ret != -ECANCELED && ret != -ENOSPC) || balance_need_close(fs_info)) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } wake_up(&fs_info->balance_wait_q); @@ -4180,7 +4180,7 @@ out: reset_balance_state(fs_info); else kfree(bctl); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -4282,7 +4282,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) * is in a paused state and must have fs_info::balance_ctl properly * set up. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); @@ -4364,7 +4364,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) if (fs_info->balance_ctl) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); btrfs_info(fs_info, "balance: canceled"); } } -- cgit From c6a5d954950c5031444173ad2195efc163afcac9 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:22 +0800 Subject: btrfs: fix replace of seed device If you replace a seed device in a sprouted fs, it appears to have successfully replaced the seed device, but if you look closely, it didn't. Here is an example. $ mkfs.btrfs /dev/sda $ btrfstune -S1 /dev/sda $ mount /dev/sda /btrfs $ btrfs device add /dev/sdb /btrfs $ umount /btrfs $ btrfs device scan --forget $ mount -o device=/dev/sda /dev/sdb /btrfs $ btrfs replace start -f /dev/sda /dev/sdc /btrfs $ echo $? 0 BTRFS info (device sdb): dev_replace from /dev/sda (devid 1) to /dev/sdc started BTRFS info (device sdb): dev_replace from /dev/sda (devid 1) to /dev/sdc finished $ btrfs fi show Label: none uuid: ab2c88b7-be81-4a7e-9849-c3666e7f9f4f Total devices 2 FS bytes used 256.00KiB devid 1 size 3.00GiB used 520.00MiB path /dev/sdc devid 2 size 3.00GiB used 896.00MiB path /dev/sdb Label: none uuid: 10bd3202-0415-43af-96a8-d5409f310a7e Total devices 1 FS bytes used 128.00KiB devid 1 size 3.00GiB used 536.00MiB path /dev/sda So as per the replace start command and kernel log replace was successful. Now let's try to clean mount. $ umount /btrfs $ btrfs device scan --forget $ mount -o device=/dev/sdc /dev/sdb /btrfs mount: /btrfs: wrong fs type, bad option, bad superblock on /dev/sdb, missing codepage or helper program, or other error. [ 636.157517] BTRFS error (device sdc): failed to read chunk tree: -2 [ 636.180177] BTRFS error (device sdc): open_ctree failed That's because per dev items it is still looking for the original seed device. $ btrfs inspect-internal dump-tree -d /dev/sdb item 0 key (DEV_ITEMS DEV_ITEM 1) itemoff 16185 itemsize 98 devid 1 total_bytes 3221225472 bytes_used 545259520 io_align 4096 io_width 4096 sector_size 4096 type 0 generation 6 start_offset 0 dev_group 0 seek_speed 0 bandwidth 0 uuid 59368f50-9af2-4b17-91da-8a783cc418d4 <--- seed uuid fsid 10bd3202-0415-43af-96a8-d5409f310a7e <--- seed fsid item 1 key (DEV_ITEMS DEV_ITEM 2) itemoff 16087 itemsize 98 devid 2 total_bytes 3221225472 bytes_used 939524096 io_align 4096 io_width 4096 sector_size 4096 type 0 generation 0 start_offset 0 dev_group 0 seek_speed 0 bandwidth 0 uuid 56a0a6bc-4630-4998-8daf-3c3030c4256a <- sprout uuid fsid ab2c88b7-be81-4a7e-9849-c3666e7f9f4f <- sprout fsid But the replaced target has the following uuid+fsid in its superblock which doesn't match with the expected uuid+fsid in its devitem. $ btrfs in dump-super /dev/sdc | egrep '^generation|dev_item.uuid|dev_item.fsid|devid' generation 20 dev_item.uuid 59368f50-9af2-4b17-91da-8a783cc418d4 dev_item.fsid ab2c88b7-be81-4a7e-9849-c3666e7f9f4f [match] dev_item.devid 1 So if you provide the original seed device the mount shall be successful. Which so long happening in the test case btrfs/163. $ btrfs device scan --forget $ mount -o device=/dev/sda /dev/sdb /btrfs Fix in this patch: If a seed is not sprouted then there is no replacement of it, because of its read-only filesystem with a read-only device. Similarly, in the case of a sprouted filesystem, the seed device is still read only. So, mark it as you can't replace a seed device, you can only add a new device and then delete the seed device. If replace is attempted then returns -EINVAL. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 14901ca2508d..bbb3d5ec9ca8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -230,7 +230,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, int ret = 0; *device_out = NULL; - if (fs_info->fs_devices->seeding) { + if (srcdev->fs_devices->seeding) { btrfs_err(fs_info, "the filesystem is a seed filesystem!"); return -EINVAL; } -- cgit From cd36da2e7ec67f0da9c1efc8a87ad31bd9242ec3 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:26 +0800 Subject: btrfs: simplify parameters of btrfs_sysfs_add_devices_dir When we add a device we need to add it to sysfs, so instead of using the btrfs_sysfs_add_devices_dir() fs_devices argument to specify whether to add a device or all of fs_devices, call the helper function directly btrfs_sysfs_add_device() and thus make it non-static. Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/sysfs.c | 11 ++++------- fs/btrfs/sysfs.h | 3 +-- fs/btrfs/volumes.c | 2 +- 4 files changed, 7 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index bbb3d5ec9ca8..c176375c2c02 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -512,7 +512,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device); + ret = btrfs_sysfs_add_device(tgt_device); if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 66fbb82b0bd5..bc341560dc69 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1303,7 +1303,7 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -static int btrfs_sysfs_add_device(struct btrfs_device *device) +int btrfs_sysfs_add_device(struct btrfs_device *device) { int ret; unsigned int nofs_flag; @@ -1352,13 +1352,10 @@ out: return ret; } -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device) +static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) { int ret; - - if (device) - return btrfs_sysfs_add_device(device); + struct btrfs_device *device; list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_sysfs_add_device(device); @@ -1456,7 +1453,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); + error = btrfs_sysfs_add_fs_devices(fs_devs); if (error) return error; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 1431dbc3c2ef..6296c4f6b330 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -14,8 +14,7 @@ enum btrfs_feature_set { char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); const char *btrfs_feature_set_name(enum btrfs_feature_set set); -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); +int btrfs_sysfs_add_device(struct btrfs_device *device); void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 13b394d401f9..2aa7fd89efbf 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2608,7 +2608,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_unlock(&fs_info->chunk_mutex); /* Add sysfs device entry */ - btrfs_sysfs_add_devices_dir(fs_devices, device); + btrfs_sysfs_add_device(device); mutex_unlock(&fs_devices->device_list_mutex); -- cgit From 53f8a74cbeffd45a95de5dc6d16584aadb682a31 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:27 +0800 Subject: btrfs: split and refactor btrfs_sysfs_remove_devices_dir Similar to btrfs_sysfs_add_devices_dir()'s refactoring, split btrfs_sysfs_remove_devices_dir() so that we don't have to use the device argument to indicate whether to free all devices or just one device. Export btrfs_sysfs_remove_device() as device operations outside of sysfs.c now calls this instead of btrfs_sysfs_remove_devices_dir(). btrfs_sysfs_remove_devices_dir() is renamed to btrfs_sysfs_remove_fs_devices() to suite its new role. Now, no one outside of sysfs.c calls btrfs_sysfs_remove_fs_devices() so it is redeclared s static. And the same function had to be moved before its first caller. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/sysfs.c | 27 +++++++++++---------------- fs/btrfs/sysfs.h | 3 +-- fs/btrfs/volumes.c | 8 ++++---- 4 files changed, 17 insertions(+), 23 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index c176375c2c02..db4d4a53ab73 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -781,7 +781,7 @@ error: mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* replace the sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); + btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) btrfs_scratch_superblocks(fs_info, src_device->bdev, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index bc341560dc69..7aed0712f183 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -972,6 +972,14 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } +static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_sysfs_remove_device(device); +} + void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; @@ -999,7 +1007,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(fsid_kobj, btrfs_attrs); - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); + btrfs_sysfs_remove_fs_devices(fs_info->fs_devices); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { @@ -1186,7 +1194,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, return 0; } -static void btrfs_sysfs_remove_device(struct btrfs_device *device) +void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct hd_struct *disk; struct kobject *disk_kobj; @@ -1212,19 +1220,6 @@ static void btrfs_sysfs_remove_device(struct btrfs_device *device) } } -/* When @device is NULL, remove all devices link */ -void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device) -{ - if (device) { - btrfs_sysfs_remove_device(device); - return; - } - - list_for_each_entry(device, &fs_devices->devices, dev_list) - btrfs_sysfs_remove_device(device); -} - static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -1459,7 +1454,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_sysfs_remove_devices_dir(fs_devs, NULL); + btrfs_sysfs_remove_fs_devices(fs_devs); return error; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 6296c4f6b330..bacef43f7267 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -15,8 +15,7 @@ enum btrfs_feature_set { char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); const char *btrfs_feature_set_name(enum btrfs_feature_set set); int btrfs_sysfs_add_device(struct btrfs_device *device); -void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device); +void btrfs_sysfs_remove_device(struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2aa7fd89efbf..79fe9822196f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2041,7 +2041,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, } int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid) + u64 devid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2145,7 +2145,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (device->bdev) { cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -2246,7 +2246,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev); + btrfs_sysfs_remove_device(tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; @@ -2682,7 +2682,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_sysfs: - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); -- cgit From 1888709d71806c0bdf008ea471ce9569b8efa79b Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:32 +0800 Subject: btrfs: remove tmp variable for list traversal in btrfs_init_dev_replace_tgtdev In the function btrfs_init_dev_replace_tgtdev(), the local variable devices is used only once, we can remove it. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index db4d4a53ab73..500e45f7aa3f 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -224,7 +224,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_device *device; struct block_device *bdev; - struct list_head *devices; struct rcu_string *name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -244,8 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); -- cgit From 0725c0c9351d9854a61bc68f0a59d9f91d75abbd Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:36 +0800 Subject: btrfs: move btrfs_dev_replace_update_device_in_mapping_tree to drop declaration The function is short and simple, we can get rid of the declaration as it's not necessary for a static function. Move it before its first caller. No functional changes. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 56 +++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) (limited to 'fs/btrfs/dev-replace.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 500e45f7aa3f..4a0243cb9d97 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -64,10 +64,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev); static int btrfs_dev_replace_kthread(void *data); int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) @@ -628,6 +624,32 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, return ret; } +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = em->map_lookup; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -797,32 +819,6 @@ error: return 0; } -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev) -{ - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; - u64 start = 0; - int i; - - write_lock(&em_tree->lock); - do { - em = lookup_extent_mapping(em_tree, start, (u64)-1); - if (!em) - break; - map = em->map_lookup; - for (i = 0; i < map->num_stripes; i++) - if (srcdev == map->stripes[i].dev) - map->stripes[i].dev = tgtdev; - start = em->start + em->len; - free_extent_map(em); - } while (start); - write_unlock(&em_tree->lock); -} - /* * Read progress of device replace status according to the state and last * stored position. The value format is the same as for -- cgit