From 34d9700702f4042ce10d68a092ab7f79575e7a3b Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 16 Mar 2016 16:43:06 +0800 Subject: btrfs: rename btrfs_std_error to btrfs_handle_fs_error btrfs_std_error() handles errors, puts FS into readonly mode (as of now). So its good idea to rename it to btrfs_handle_fs_error(). Signed-off-by: Anand Jain Reviewed-by: David Sterba [ edit changelog ] Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 84e060eb0de8..c0520a94a333 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9058,7 +9058,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err, NULL); + btrfs_handle_fs_error(root->fs_info, err, NULL); return err; } -- cgit From 6719afdcf808b38145095b2e485c0f90bfc29722 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 22 Jun 2014 14:30:09 +0200 Subject: Btrfs: Refactor btrfs_lock_cluster() to kill compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fs/btrfs/extent-tree.c: In function ‘btrfs_lock_cluster’: fs/btrfs/extent-tree.c:6399: warning: ‘used_bg’ may be used uninitialized in this function - Replace "again: ... goto again;" by standard C "while (1) { ... }", - Move block not processed during the first iteration of the loop to the end of the loop, which allows to kill the "locked" variable, Signed-off-by: Geert Uytterhoeven Reviewed-and-Tested-by: Miao Xie [ the compilation warning has been fixed by other patch, now we want to clean up the function ] Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c0520a94a333..b66f460f779a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7025,36 +7025,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, int delalloc) { struct btrfs_block_group_cache *used_bg = NULL; - bool locked = false; -again: + spin_lock(&cluster->refill_lock); - if (locked) { - if (used_bg == cluster->block_group) + while (1) { + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) return used_bg; - up_read(&used_bg->data_rwsem); - btrfs_put_block_group(used_bg); - } + btrfs_get_block_group(used_bg); - used_bg = cluster->block_group; - if (!used_bg) - return NULL; + if (!delalloc) + return used_bg; - if (used_bg == block_group) - return used_bg; + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; - btrfs_get_block_group(used_bg); + spin_unlock(&cluster->refill_lock); - if (!delalloc) - return used_bg; + down_read(&used_bg->data_rwsem); - if (down_read_trylock(&used_bg->data_rwsem)) - return used_bg; + spin_lock(&cluster->refill_lock); + if (used_bg == cluster->block_group) + return used_bg; - spin_unlock(&cluster->refill_lock); - down_read(&used_bg->data_rwsem); - locked = true; - goto again; + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } } static inline void -- cgit From 8eb0dfdbda3f56bf7d248ed87fcc383df114ecbb Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Sun, 8 May 2016 15:08:00 +0200 Subject: btrfs: fix int32 overflow in shrink_delalloc(). UBSAN: Undefined behaviour in fs/btrfs/extent-tree.c:4623:21 signed integer overflow: 10808 * 262144 cannot be represented in type 'int [8]' If 8192<=items<16384, we request a writeback of an insane number of pages which is benign (everything will be written). But if items>=16384, the space reservation won't be enough. Signed-off-by: Adam Borowski Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 84e060eb0de8..391f576789e9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4620,7 +4620,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(root, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; -- cgit From 578def7c50f236432ba140d35bb7ca4ef0a1b20b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 26 Apr 2016 15:36:38 +0100 Subject: Btrfs: don't wait for unrelated IO to finish before relocation Before the relocation process of a block group starts, it sets the block group to readonly mode, then flushes all delalloc writes and then finally it waits for all ordered extents to complete. This last step includes waiting for ordered extents destinated at extents allocated in other block groups, making us waste unecessary time. So improve this by waiting only for ordered extents that fall into the block group's range. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik Reviewed-by: Liu Bo --- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/extent-tree.c | 11 +++++++---- fs/btrfs/ioctl.c | 2 +- fs/btrfs/ordered-data.c | 26 +++++++++++++++++++------- fs/btrfs/ordered-data.h | 6 ++++-- fs/btrfs/relocation.c | 4 +++- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 2 +- 8 files changed, 38 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 26bcb487f958..3371f9e546d9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -403,7 +403,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, if (ret) btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -495,7 +495,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 84e060eb0de8..251452a2b72c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4141,7 +4141,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } trans = btrfs_join_transaction(root); @@ -4583,7 +4583,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, */ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_ordered_roots(root->fs_info, nr_items); + btrfs_wait_ordered_roots(root->fs_info, nr_items, + 0, (u64)-1); } } @@ -4632,7 +4633,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (trans) return; if (wait_ordered) - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); return; } @@ -4671,7 +4673,8 @@ skip_async: loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); } else { time_left = schedule_timeout_killable(1); if (time_left) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5a23806ae418..697cc336bd1c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -681,7 +681,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; - btrfs_wait_ordered_extents(root, -1); + btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0de7da5a610d..559170464d7c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -661,14 +661,15 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len) { - struct list_head splice, works; + LIST_HEAD(splice); + LIST_HEAD(skipped); + LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; int count = 0; - - INIT_LIST_HEAD(&splice); - INIT_LIST_HEAD(&works); + const u64 range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); @@ -676,6 +677,14 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) while (!list_empty(&splice) && nr) { ordered = list_first_entry(&splice, struct btrfs_ordered_extent, root_extent_list); + + if (range_end <= ordered->start || + ordered->start + ordered->disk_len <= range_start) { + list_move_tail(&ordered->root_extent_list, &skipped); + cond_resched_lock(&root->ordered_extent_lock); + continue; + } + list_move_tail(&ordered->root_extent_list, &root->ordered_extents); atomic_inc(&ordered->refs); @@ -694,6 +703,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) nr--; count++; } + list_splice_tail(&skipped, &root->ordered_extents); list_splice_tail(&splice, &root->ordered_extents); spin_unlock(&root->ordered_extent_lock); @@ -708,7 +718,8 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) return count; } -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; @@ -728,7 +739,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - done = btrfs_wait_ordered_extents(root, nr); + done = btrfs_wait_ordered_extents(root, nr, + range_start, range_len); btrfs_put_fs_root(root); spin_lock(&fs_info->ordered_root_lock); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 23c96059cef2..8ef12623d65c 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -197,8 +197,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len); +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct inode *inode, struct list_head *logged_list, const loff_t start, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 08ef890deca6..30f77ed60133 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4259,7 +4259,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) err = ret; goto out; } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, + rc->block_group->key.objectid, + rc->block_group->key.offset); while (1) { mutex_lock(&fs_info->cleaner_mutex); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 00b8f37cc306..89d134794d47 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1160,7 +1160,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 43885e51b882..f0bb54a77314 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1821,7 +1821,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } static inline void -- cgit From 9cfa3e34e20e6798a671236000d9e97c8aa5d318 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 26 Apr 2016 15:39:32 +0100 Subject: Btrfs: don't do unnecessary delalloc flushes when relocating Before we start the actual relocation process of a block group, we do calls to flush delalloc of all inodes and then wait for ordered extents to complete. However we do these flush calls just to make sure we don't race with concurrent tasks that have actually already started to run delalloc and have allocated an extent from the block group we want to relocate, right before we set it to readonly mode, but have not yet created the respective ordered extents. The flush calls make us wait for such concurrent tasks because they end up calling filemap_fdatawrite_range() (through btrfs_start_delalloc_roots() -> __start_delalloc_inodes() -> btrfs_alloc_delalloc_work() -> btrfs_run_delalloc_work()) which ends up serializing us with those tasks due to attempts to lock the same pages (and the delalloc flush procedure calls the allocator and creates the ordered extents before unlocking the pages). These flushing calls not only make us waste time (cpu, IO) but also reduce the chances of writing larger extents (applications might be writing to contiguous ranges and we flush before they finish dirtying the whole ranges). So make sure we don't flush delalloc and just wait for concurrent tasks that have already started flushing delalloc and have allocated an extent from the block group we are about to relocate. This change also ends up fixing a race with direct IO writes that makes relocation not wait for direct IO ordered extents. This race is illustrated by the following diagram: CPU 1 CPU 2 btrfs_relocate_block_group(bg X) starts direct IO write, target inode currently has no ordered extents ongoing nor dirty pages (delalloc regions), therefore the root for our inode is not in the list fs_info->ordered_roots btrfs_direct_IO() __blockdev_direct_IO() btrfs_get_blocks_direct() btrfs_lock_extent_direct() locks range in the io tree btrfs_new_extent_direct() btrfs_reserve_extent() --> extent allocated from bg X btrfs_inc_block_group_ro(bg X) btrfs_start_delalloc_roots() __start_delalloc_inodes() --> does nothing, no dealloc ranges in the inode's io tree so the inode's root is not in the list fs_info->delalloc_roots btrfs_wait_ordered_roots() --> does not find the inode's root in the list fs_info->ordered_roots --> ends up not waiting for the direct IO write started by the task at CPU 2 relocate_block_group(rc->stage == MOVE_DATA_EXTENTS) prepare_to_relocate() btrfs_commit_transaction() iterates the extent tree, using its commit root and moves extents into new locations btrfs_add_ordered_extent_dio() --> now a ordered extent is created and added to the list root->ordered_extents and the root added to the list fs_info->ordered_roots --> this is too late and the task at CPU 1 already started the relocation btrfs_commit_transaction() btrfs_finish_ordered_io() btrfs_alloc_reserved_file_extent() --> adds delayed data reference for the extent allocated from bg X relocate_block_group(rc->stage == UPDATE_DATA_PTRS) prepare_to_relocate() btrfs_commit_transaction() --> delayed refs are run, so an extent item for the allocated extent from bg X is added to extent tree --> commit roots are switched, so the next scan in the extent tree will see the extent item sees the extent in the extent tree When this happens the relocation produces the following warning when it finishes: [ 7260.832836] ------------[ cut here ]------------ [ 7260.834653] WARNING: CPU: 5 PID: 6765 at fs/btrfs/relocation.c:4318 btrfs_relocate_block_group+0x245/0x2a1 [btrfs]() [ 7260.838268] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc [ 7260.850935] CPU: 5 PID: 6765 Comm: btrfs Not tainted 4.5.0-rc6-btrfs-next-28+ #1 [ 7260.852998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 7260.852998] 0000000000000000 ffff88020bf57bc0 ffffffff812648b3 0000000000000000 [ 7260.852998] 0000000000000009 ffff88020bf57bf8 ffffffff81051608 ffffffffa03c1b2d [ 7260.852998] ffff8800b2bbb800 0000000000000000 ffff8800b17bcc58 ffff8800399dd000 [ 7260.852998] Call Trace: [ 7260.852998] [] dump_stack+0x67/0x90 [ 7260.852998] [] warn_slowpath_common+0x99/0xb2 [ 7260.852998] [] ? btrfs_relocate_block_group+0x245/0x2a1 [btrfs] [ 7260.852998] [] warn_slowpath_null+0x1a/0x1c [ 7260.852998] [] btrfs_relocate_block_group+0x245/0x2a1 [btrfs] [ 7260.852998] [] btrfs_relocate_chunk.isra.29+0x66/0xdb [btrfs] [ 7260.852998] [] btrfs_balance+0xde1/0xe4e [btrfs] [ 7260.852998] [] ? debug_smp_processor_id+0x17/0x19 [ 7260.852998] [] btrfs_ioctl_balance+0x255/0x2d3 [btrfs] [ 7260.852998] [] btrfs_ioctl+0x11e0/0x1dff [btrfs] [ 7260.852998] [] ? handle_mm_fault+0x443/0xd63 [ 7260.852998] [] ? _raw_spin_unlock+0x31/0x44 [ 7260.852998] [] ? arch_local_irq_save+0x9/0xc [ 7260.852998] [] vfs_ioctl+0x18/0x34 [ 7260.852998] [] do_vfs_ioctl+0x550/0x5be [ 7260.852998] [] ? __fget_light+0x4d/0x71 [ 7260.852998] [] SyS_ioctl+0x57/0x79 [ 7260.852998] [] entry_SYSCALL_64_fastpath+0x12/0x6b [ 7260.893268] ---[ end trace eb7803b24ebab8ad ]--- This is because at the end of the first stage, in relocate_block_group(), we commit the current transaction, which makes delayed refs run, the commit roots are switched and so the second stage will find the extent item that the ordered extent added to the delayed refs. But this extent was not moved (ordered extent completed after first stage finished), so at the end of the relocation our block group item still has a positive used bytes counter, triggering a warning at the end of btrfs_relocate_block_group(). Later on when trying to read the extent contents from disk we hit a BUG_ON() due to the inability to map a block with a logical address that belongs to the block group we relocated and is no longer valid, resulting in the following trace: [ 7344.885290] BTRFS critical (device sdi): unable to find logical 12845056 len 4096 [ 7344.887518] ------------[ cut here ]------------ [ 7344.888431] kernel BUG at fs/btrfs/inode.c:1833! [ 7344.888431] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 7344.888431] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc [ 7344.888431] CPU: 0 PID: 6831 Comm: od Tainted: G W 4.5.0-rc6-btrfs-next-28+ #1 [ 7344.888431] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 7344.888431] task: ffff880215818600 ti: ffff880204684000 task.ti: ffff880204684000 [ 7344.888431] RIP: 0010:[] [] btrfs_merge_bio_hook+0x54/0x6b [btrfs] [ 7344.888431] RSP: 0018:ffff8802046878f0 EFLAGS: 00010282 [ 7344.888431] RAX: 00000000ffffffea RBX: 0000000000001000 RCX: 0000000000000001 [ 7344.888431] RDX: ffff88023ec0f950 RSI: ffffffff8183b638 RDI: 00000000ffffffff [ 7344.888431] RBP: ffff880204687908 R08: 0000000000000001 R09: 0000000000000000 [ 7344.888431] R10: ffff880204687770 R11: ffffffff82f2d52d R12: 0000000000001000 [ 7344.888431] R13: ffff88021afbfee8 R14: 0000000000006208 R15: ffff88006cd199b0 [ 7344.888431] FS: 00007f1f9e1d6700(0000) GS:ffff88023ec00000(0000) knlGS:0000000000000000 [ 7344.888431] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7344.888431] CR2: 00007f1f9dc8cb60 CR3: 000000023e3b6000 CR4: 00000000000006f0 [ 7344.888431] Stack: [ 7344.888431] 0000000000001000 0000000000001000 ffff880204687b98 ffff880204687950 [ 7344.888431] ffffffffa0395c8f ffffea0004d64d48 0000000000000000 0000000000001000 [ 7344.888431] ffffea0004d64d48 0000000000001000 0000000000000000 0000000000000000 [ 7344.888431] Call Trace: [ 7344.888431] [] submit_extent_page+0xf5/0x16f [btrfs] [ 7344.888431] [] __do_readpage+0x4a0/0x4f1 [btrfs] [ 7344.888431] [] ? btrfs_create_repair_bio+0xcb/0xcb [btrfs] [ 7344.888431] [] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs] [ 7344.888431] [] ? trace_hardirqs_on+0xd/0xf [ 7344.888431] [] __do_contiguous_readpages.constprop.26+0xc2/0xe4 [btrfs] [ 7344.888431] [] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs] [ 7344.888431] [] __extent_readpages.constprop.25+0xed/0x100 [btrfs] [ 7344.888431] [] ? lru_cache_add+0xe/0x10 [ 7344.888431] [] extent_readpages+0x160/0x1aa [btrfs] [ 7344.888431] [] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs] [ 7344.888431] [] ? alloc_pages_current+0xa9/0xcd [ 7344.888431] [] btrfs_readpages+0x1f/0x21 [btrfs] [ 7344.888431] [] __do_page_cache_readahead+0x168/0x1fc [ 7344.888431] [] ondemand_readahead+0x1f6/0x207 [ 7344.888431] [] ? ondemand_readahead+0x1f6/0x207 [ 7344.888431] [] ? pagecache_get_page+0x2b/0x154 [ 7344.888431] [] page_cache_sync_readahead+0x3d/0x3f [ 7344.888431] [] generic_file_read_iter+0x197/0x4e1 [ 7344.888431] [] __vfs_read+0x79/0x9d [ 7344.888431] [] vfs_read+0x8f/0xd2 [ 7344.888431] [] SyS_read+0x50/0x7e [ 7344.888431] [] entry_SYSCALL_64_fastpath+0x12/0x6b [ 7344.888431] Code: 8d 4d e8 45 31 c9 45 31 c0 48 8b 00 48 c1 e2 09 48 8b 80 80 fc ff ff 4c 89 65 e8 48 8b b8 f0 01 00 00 e8 1d 42 02 00 85 c0 79 02 <0f> 0b 4c 0 [ 7344.888431] RIP [] btrfs_merge_bio_hook+0x54/0x6b [btrfs] [ 7344.888431] RSP [ 7344.970544] ---[ end trace eb7803b24ebab8ae ]--- Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik Reviewed-by: Liu Bo --- fs/btrfs/ctree.h | 14 ++++++++++++ fs/btrfs/extent-tree.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/inode.c | 8 +++++++ fs/btrfs/relocation.c | 6 +----- 4 files changed, 79 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84a6a5b3384a..90e70e21e479 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1408,6 +1408,17 @@ struct btrfs_block_group_cache { struct btrfs_io_ctl io_ctl; + /* + * Incremented when doing extent allocations and holding a read lock + * on the space_info's groups_sem semaphore. + * Decremented when an ordered extent that represents an IO against this + * block group's range is created (after it's added to its inode's + * root's list of ordered extents) or immediately after the allocation + * if it's a metadata extent or fallocate extent (for these cases we + * don't create ordered extents). + */ + atomic_t reservations; + /* Lock for free space tree operations. */ struct mutex free_space_lock; @@ -3499,6 +3510,9 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start); +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 251452a2b72c..09aad7b447f5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6175,6 +6175,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, return 0; } +static void +btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + atomic_inc(&bg->reservations); +} + +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, start); + ASSERT(bg); + if (atomic_dec_and_test(&bg->reservations)) + wake_up_atomic_t(&bg->reservations); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + struct btrfs_space_info *space_info = bg->space_info; + + ASSERT(bg->ro); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) + return; + + /* + * Our block group is read only but before we set it to read only, + * some task might have had allocated an extent from it already, but it + * has not yet created a respective ordered extent (and added it to a + * root's list of ordered extents). + * Therefore wait for any task currently allocating extents, since the + * block group's reservations counter is incremented while a read lock + * on the groups' semaphore is held and decremented after releasing + * the read access on that semaphore and creating the ordered extent. + */ + down_write(&space_info->groups_sem); + up_write(&space_info->groups_sem); + + wait_on_atomic_t(&bg->reservations, + btrfs_wait_bg_reservations_atomic_t, + TASK_UNINTERRUPTIBLE); +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -7434,6 +7485,7 @@ checks: btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } + btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ ins->objectid = search_start; @@ -7615,8 +7667,10 @@ again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, flags, delalloc); - - if (ret == -ENOSPC) { + if (!ret && !is_data) { + btrfs_dec_block_group_reservations(root->fs_info, + ins->objectid); + } else if (ret == -ENOSPC) { if (!final_tried && ins->offset) { num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2aaba58b4856..3991d8a74f61 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -824,6 +824,7 @@ retry: async_extent->ram_size - 1, 0); goto out_free_reserve; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); /* * clear dirty, set writeback and unlock the pages. @@ -861,6 +862,7 @@ retry: } return; out_free_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, @@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode, goto out_drop_extent_cache; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (disk_num_bytes < cur_alloc_size) break; @@ -1066,6 +1070,7 @@ out: out_drop_extent_cache: btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: extent_clear_unlock_delalloc(inode, start, end, locked_page, @@ -7162,6 +7167,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, return ERR_PTR(ret); } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, 0); if (IS_ERR(em)) { @@ -9942,6 +9949,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans, root); break; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 30f77ed60133..e78f8e44bd9a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4254,11 +4254,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_delalloc_roots(fs_info, 0, -1); - if (ret < 0) { - err = ret; - goto out; - } + btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_ordered_roots(fs_info, -1, rc->block_group->key.objectid, rc->block_group->key.offset); -- cgit From f78c436c3931e7df713688028f2b4faf72bf9f2a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 May 2016 13:15:41 +0100 Subject: Btrfs: fix race between block group relocation and nocow writes Relocation of a block group waits for all existing tasks flushing dellaloc, starting direct IO writes and any ordered extents before starting the relocation process. However for direct IO writes that end up doing nocow (inode either has the flag nodatacow set or the write is against a prealloc extent) we have a short time window that allows for a race that makes relocation proceed without waiting for the direct IO write to complete first, resulting in data loss after the relocation finishes. This is illustrated by the following diagram: CPU 1 CPU 2 btrfs_relocate_block_group(bg X) direct IO write starts against an extent in block group X using nocow mode (inode has the nodatacow flag or the write is for a prealloc extent) btrfs_direct_IO() btrfs_get_blocks_direct() --> can_nocow_extent() returns 1 btrfs_inc_block_group_ro(bg X) --> turns block group into RO mode btrfs_wait_ordered_roots() --> returns and does not know about the DIO write happening at CPU 2 (the task there has not created yet an ordered extent) relocate_block_group(bg X) --> rc->stage == MOVE_DATA_EXTENTS find_next_extent() --> returns extent that the DIO write is going to write to relocate_data_extent() relocate_file_extent_cluster() --> reads the extent from disk into pages belonging to the relocation inode and dirties them --> creates DIO ordered extent btrfs_submit_direct() --> submits bio against a location on disk obtained from an extent map before the relocation started btrfs_wait_ordered_range() --> writes all the pages read before to disk (belonging to the relocation inode) relocation finishes bio completes and wrote new data to the old location of the block group So fix this by tracking the number of nocow writers for a block group and make sure relocation waits for that number to go down to 0 before starting to move the extents. The same race can also happen with buffered writes in nocow mode since the patch I recently made titled "Btrfs: don't do unnecessary delalloc flushes when relocating", because we are no longer flushing all delalloc which served as a synchonization mechanism (due to page locking) and ensured the ordered extents for nocow buffered writes were created before we called btrfs_wait_ordered_roots(). The race with direct IO writes in nocow mode existed before that patch (no pages are locked or used during direct IO) and that fixed only races with direct IO writes that do cow. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/ctree.h | 13 +++++++++++++ fs/btrfs/extent-tree.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/inode.c | 15 +++++++++++++- fs/btrfs/relocation.c | 1 + 4 files changed, 81 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 90e70e21e479..7ae758685c7b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1419,6 +1419,16 @@ struct btrfs_block_group_cache { */ atomic_t reservations; + /* + * Incremented while holding the spinlock *lock* by a task checking if + * it can perform a nocow write (incremented if the value for the *ro* + * field is 0). Decremented by such tasks once they create an ordered + * extent or before that if some error happens before reaching that step. + * This is to prevent races between block group relocation and nocow + * writes through direct IO. + */ + atomic_t nocow_writers; + /* Lock for free space tree operations. */ struct mutex free_space_lock; @@ -3513,6 +3523,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 09aad7b447f5..dcf89bfa990d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3824,6 +3824,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + bool ret = true; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return false; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + atomic_inc(&bg->nocow_writers); + spin_unlock(&bg->lock); + + /* no put on block group, done by btrfs_dec_nocow_writers */ + if (!ret) + btrfs_put_block_group(bg); + + return ret; + +} + +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + ASSERT(bg); + if (atomic_dec_and_test(&bg->nocow_writers)) + wake_up_atomic_t(&bg->nocow_writers); + /* + * Once for our lookup and once for the lookup done by a previous call + * to btrfs_inc_nocow_writers() + */ + btrfs_put_block_group(bg); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) +{ + wait_on_atomic_t(&bg->nocow_writers, + btrfs_wait_nocow_writers_atomic_t, + TASK_UNINTERRUPTIBLE); +} + static const char *alloc_name(u64 flags) { switch (flags) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 45d0dafbbf40..ee9be4199e7c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1382,6 +1382,9 @@ next_slot: */ if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out_check; + if (!btrfs_inc_nocow_writers(root->fs_info, + disk_bytenr)) + goto out_check; nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1396,6 +1399,9 @@ out_check: path->slots[0]++; if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto next_slot; } if (!nocow) { @@ -1416,6 +1422,9 @@ out_check: if (ret) { if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto error; } cow_start = (u64)-1; @@ -1458,6 +1467,8 @@ out_check: ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, type); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, disk_bytenr); BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == @@ -7657,7 +7668,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1) { + &orig_block_len, &ram_bytes) == 1 && + btrfs_inc_nocow_writers(root->fs_info, block_start)) { /* * Create the ordered extent before the extent map. This @@ -7672,6 +7684,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, */ ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, len, type); + btrfs_dec_nocow_writers(root->fs_info, block_start); if (ret) { free_extent_map(em); goto unlock_err; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e78f8e44bd9a..054d9a80e77e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4255,6 +4255,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) rc->block_group->key.objectid, rc->block_group->flags); btrfs_wait_block_group_reservations(rc->block_group); + btrfs_wait_nocow_writers(rc->block_group); btrfs_wait_ordered_roots(fs_info, -1, rc->block_group->key.objectid, rc->block_group->key.offset); -- cgit