From c6761a9ed329dd41fee09dc2926126768e5ca34c Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 21 Nov 2019 17:33:32 +0800 Subject: btrfs: sysfs, btrfs_sysfs_add_fsid() drop unused argument parent Commit 24bd69cb ("Btrfs: sysfs: add support to add parent for fsid") added parent argument in preparation to show the seed fsid under the sprout fsid as in the patch [1] in the mailing list. [1] Btrfs: sysfs: support seed devices in the sysfs layout But later this idea was superseded by another idea to rename the fsid as in the commit f93c39970b1d ("btrfs: factor out sysfs code for updating sprout fsid"). So we don't need parent argument anymore. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e0edfdc9c82b..639104326691 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3082,7 +3082,7 @@ int __cold open_ctree(struct super_block *sb, btrfs_free_extra_devids(fs_devices, 1); - ret = btrfs_sysfs_add_fsid(fs_devices, NULL); + ret = btrfs_sysfs_add_fsid(fs_devices); if (ret) { btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", ret); -- cgit From be2cf92e0a2f64efe0709a2cfe5d9f4852d14b61 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 21 Nov 2019 17:33:33 +0800 Subject: btrfs: sysfs, rename btrfs_sysfs_add_device() btrfs_sysfs_add_device() creates the directory /sys/fs/btrfs/UUID/devices but its function name is misleading. Rename it to btrfs_sysfs_add_devices_kobj() instead. No functional changes. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/sysfs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 639104326691..65cfd0fd0f83 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3089,7 +3089,7 @@ int __cold open_ctree(struct super_block *sb, goto fail_block_groups; } - ret = btrfs_sysfs_add_device(fs_devices); + ret = btrfs_sysfs_add_devices_kobj(fs_devices); if (ret) { btrfs_err(fs_info, "failed to init sysfs device interface: %d", ret); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1d8ee57da164..c76c38d74972 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -995,7 +995,7 @@ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, return 0; } -int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) +int btrfs_sysfs_add_devices_kobj(struct btrfs_fs_devices *fs_devs) { if (!fs_devs->devices_kobj) fs_devs->devices_kobj = kobject_create_and_add("devices", -- cgit From bc036bb33524dcb2525d74c336e9694d5d3c0047 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 21 Nov 2019 17:33:34 +0800 Subject: btrfs: sysfs, merge btrfs_sysfs_add devices_kobj and fsid Merge btrfs_sysfs_add_fsid() and btrfs_sysfs_add_devices_kobj() functions as these two are small and they are called one after the other. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 ------- fs/btrfs/sysfs.c | 21 +++++++++------------ fs/btrfs/sysfs.h | 1 - 3 files changed, 9 insertions(+), 20 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65cfd0fd0f83..ab888d89d844 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3089,13 +3089,6 @@ int __cold open_ctree(struct super_block *sb, goto fail_block_groups; } - ret = btrfs_sysfs_add_devices_kobj(fs_devices); - if (ret) { - btrfs_err(fs_info, "failed to init sysfs device interface: %d", - ret); - goto fail_fsdev_sysfs; - } - ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c76c38d74972..16379f491ca1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -995,18 +995,6 @@ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, return 0; } -int btrfs_sysfs_add_devices_kobj(struct btrfs_fs_devices *fs_devs) -{ - if (!fs_devs->devices_kobj) - fs_devs->devices_kobj = kobject_create_and_add("devices", - &fs_devs->fsid_kobj); - - if (!fs_devs->devices_kobj) - return -ENOMEM; - - return 0; -} - int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { @@ -1083,6 +1071,15 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) return error; } + fs_devs->devices_kobj = kobject_create_and_add("devices", + &fs_devs->fsid_kobj); + if (!fs_devs->devices_kobj) { + btrfs_err(fs_devs->fs_info, + "failed to init sysfs device interface"); + kobject_put(&fs_devs->fsid_kobj); + return -ENOMEM; + } + return 0; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index a0454947f41a..3d27b39eaf94 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -19,7 +19,6 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); -int btrfs_sysfs_add_devices_kobj(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, const u8 *fsid); -- cgit From 39b07b5d7072f8e9fd8cc2f840d3749f86699bbb Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 2 Dec 2019 17:34:23 -0800 Subject: btrfs: drop create parameter to btrfs_get_extent() We only pass this as 1 from __extent_writepage_io(). The parameter basically means "pretend I didn't pass in a page". This is silly since we can simply not pass in the page. Get rid of the parameter from btrfs_get_extent(), and since it's used as a get_extent_t callback, remove it from get_extent_t and btree_get_extent(), neither of which need it. While we're here, let's document btrfs_get_extent(). Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/disk-io.h | 4 ++-- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/extent_io.h | 6 ++---- fs/btrfs/file.c | 17 ++++++++--------- fs/btrfs/inode.c | 41 ++++++++++++++++++++++++----------------- fs/btrfs/ioctl.c | 2 +- fs/btrfs/tests/inode-tests.c | 44 +++++++++++++++++++++----------------------- 9 files changed, 64 insertions(+), 62 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f895fb490b75..e416ef6c9415 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2875,7 +2875,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 end, int create); + u64 start, u64 end); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ab888d89d844..881aba162e4e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -202,8 +202,8 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, * that covers the entire device */ struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, size_t pg_offset, + u64 start, u64 len) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 76f123ebb292..8c2d6cf1ce59 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -134,8 +134,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, u64 start, u64 len, - int create); + struct page *page, size_t pg_offset, + u64 start, u64 len); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 00ddefcb54c8..bbfb102d65b8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3043,7 +3043,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); + em = get_extent(BTRFS_I(inode), page, pg_offset, start, len); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -3466,8 +3466,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, 1); break; } - em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, - end - cur + 1, 1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur, + end - cur + 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); ret = PTR_ERR_OR_ZERO(em); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a8551a1f56e2..5d205bbaafdc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -183,10 +183,8 @@ static inline int extent_compress_type(unsigned long bio_flags) struct extent_map_tree; typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, - u64 start, u64 len, - int create); + struct page *page, size_t pg_offset, + u64 start, u64 len); int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 76c68c70d3e2..a16da274c9aa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -477,8 +477,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, u64 em_len; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, search_start, - search_len, 0); + em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); @@ -2390,7 +2389,7 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, round_down(*start, fs_info->sectorsize), - round_up(*len, fs_info->sectorsize), 0); + round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) return PTR_ERR(em); @@ -2957,7 +2956,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -2990,8 +2989,8 @@ static int btrfs_zero_range(struct inode *inode, inode_dio_wait(inode); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, - alloc_start, alloc_end - alloc_start, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3034,8 +3033,8 @@ static int btrfs_zero_range(struct inode *inode, if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, - alloc_start, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3273,7 +3272,7 @@ static long btrfs_fallocate(struct file *file, int mode, INIT_LIST_HEAD(&reserve_list); while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, - alloc_end - cur_offset, 0); + alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); break; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0695d64df05b..bf5d7ce358b5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4504,7 +4504,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) cur_offset = hole_start; while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, - block_end - cur_offset, 0); + block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; @@ -6283,18 +6283,27 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -/* - * a bit scary, this does extent mapping from logical file offset to the disk. - * the ugly parts come from merging extents from the disk with the in-ram - * representation. This gets more complex because of the data=ordered code, - * where the in-ram extents might be locked pending data=ordered completion. +/** + * btrfs_get_extent - Lookup the first extent overlapping a range in a file. + * @inode: file to search in + * @page: page to read extent data into if the extent is inline + * @pg_offset: offset into @page to copy to + * @start: file offset + * @len: length of range starting at @start + * + * This returns the first &struct extent_map which overlaps with the given + * range, reading it from the B-tree and caching it if necessary. Note that + * there may be more extents which overlap the given range after the returned + * extent_map. * - * This also copies inline extents directly into the page. + * If @page is not NULL and the extent is inline, this also reads the extent + * data directly into the page and marks the extent up to date in the io_tree. + * + * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, size_t pg_offset, + u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; @@ -6311,7 +6320,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_io_tree *io_tree = &inode->io_tree; - const bool new_inline = !page || create; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -6434,8 +6442,7 @@ next: goto insert; } - btrfs_extent_item_to_extent_map(inode, path, item, - new_inline, em); + btrfs_extent_item_to_extent_map(inode, path, item, !page, em); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -6447,7 +6454,7 @@ next: size_t extent_offset; size_t copy_size; - if (new_inline) + if (!page) goto out; size = btrfs_file_extent_ram_bytes(leaf, item); @@ -6530,7 +6537,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 delalloc_end; int err = 0; - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + em = btrfs_get_extent(inode, NULL, 0, start, len); if (IS_ERR(em)) return em; /* @@ -7155,7 +7162,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto err; } - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -10166,7 +10173,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_block_group *bg; u64 len = isize - start; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 12ae31e1813e..1b1b6ff855aa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1128,7 +1128,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) /* get the big lock and read metadata off disk */ lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); unlock_extent_cached(io_tree, start, end, &cached); if (IS_ERR(em)) diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 09ecf7dc7b08..24a8c714f56c 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -263,7 +263,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); if (IS_ERR(em)) { em = NULL; test_err("got an error when we shouldn't have"); @@ -283,7 +283,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -333,7 +333,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -356,7 +356,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -384,7 +384,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -413,7 +413,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -435,7 +435,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -469,7 +469,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -498,7 +498,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -528,7 +528,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -561,7 +561,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -596,7 +596,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -630,7 +630,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -665,7 +665,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -727,8 +727,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, - sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -755,7 +754,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -788,7 +787,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -872,7 +871,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -894,8 +893,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, - 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; -- cgit From b0643e59cfa609c4b5f246f2b2c33b078f87e9d9 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 13 Dec 2019 16:22:14 -0800 Subject: btrfs: add the beginning of async discard, discard workqueue When discard is enabled, everytime a pinned extent is released back to the block_group's free space cache, a discard is issued for the extent. This is an overeager approach when it comes to discarding and helping the SSD maintain enough free space to prevent severe garbage collection situations. This adds the beginning of async discard. Instead of issuing a discard prior to returning it to the free space, it is just marked as untrimmed. The block_group is then added to a LRU which then feeds into a workqueue to issue discards at a much slower rate. Full discarding of unused block groups is still done and will be addressed in a future patch of the series. For now, we don't persist the discard state of extents and bitmaps. Therefore, our failure recovery mode will be to consider extents untrimmed. This lets us handle failure and unmounting as one in the same. On a number of Facebook webservers, I collected data every minute accounting the time we spent in btrfs_finish_extent_commit() (col. 1) and in btrfs_commit_transaction() (col. 2). btrfs_finish_extent_commit() is where we discard extents synchronously before returning them to the free space cache. discard=sync: p99 total per minute p99 total per minute Drive | extent_commit() (ms) | commit_trans() (ms) --------------------------------------------------------------- Drive A | 434 | 1170 Drive B | 880 | 2330 Drive C | 2943 | 3920 Drive D | 4763 | 5701 discard=async: p99 total per minute p99 total per minute Drive | extent_commit() (ms) | commit_trans() (ms) -------------------------------------------------------------- Drive A | 134 | 956 Drive B | 64 | 1972 Drive C | 59 | 1032 Drive D | 62 | 1200 While it's not great that the stats are cumulative over 1m, all of these servers are running the same workload and and the delta between the two are substantial. We are spending significantly less time in btrfs_finish_extent_commit() which is responsible for discarding. Reviewed-by: Josef Bacik Signed-off-by: Dennis Zhou Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/block-group.c | 37 +++++- fs/btrfs/block-group.h | 9 ++ fs/btrfs/ctree.h | 21 ++++ fs/btrfs/discard.c | 273 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/discard.h | 23 ++++ fs/btrfs/disk-io.c | 13 ++- fs/btrfs/extent-tree.c | 4 + fs/btrfs/free-space-cache.c | 54 ++++++++- fs/btrfs/free-space-cache.h | 2 + fs/btrfs/super.c | 35 +++++- fs/btrfs/volumes.c | 8 ++ 12 files changed, 468 insertions(+), 13 deletions(-) create mode 100644 fs/btrfs/discard.c create mode 100644 fs/btrfs/discard.h (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 82200dbca5ac..9a0ff3384381 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o + block-rsv.o delalloc-space.o block-group.o discard.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index be1938dc94fd..6ba15c45e779 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -14,6 +14,7 @@ #include "sysfs.h" #include "tree-log.h" #include "delalloc-space.h" +#include "discard.h" /* * Return target flags in extended format or 0 if restripe for this chunk_type @@ -131,6 +132,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); + /* + * A block_group shouldn't be on the discard_list anymore. + * Remove the block_group from the discard_list to prevent us + * from causing a panic due to NULL pointer dereference. + */ + if (WARN_ON(!list_empty(&cache->discard_list))) + btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, + cache); + /* * If not empty, someone is still holding mutex of * full_stripe_lock, which can only be released by caller. @@ -466,8 +476,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end } else if (extent_start > start && extent_start < end) { size = extent_start - start; total_added += size; - ret = btrfs_add_free_space(block_group, start, - size); + ret = btrfs_add_free_space_async_trimmed(block_group, + start, size); BUG_ON(ret); /* -ENOMEM or logic error */ start = extent_end + 1; } else { @@ -478,7 +488,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end if (start < end) { size = end - start; total_added += size; - ret = btrfs_add_free_space(block_group, start, size); + ret = btrfs_add_free_space_async_trimmed(block_group, start, + size); BUG_ON(ret); /* -ENOMEM or logic error */ } @@ -1258,6 +1269,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->unused_bgs_lock); + btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + mutex_lock(&fs_info->delete_unused_bgs_mutex); /* Don't want to race with allocators so take the groups_sem */ @@ -1333,6 +1346,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_info->unused_bg_unpin_mutex); + /* + * At this point, the block_group is read only and should fail + * new allocations. However, btrfs_finish_extent_commit() can + * cause this block_group to be placed back on the discard + * lists because now the block_group isn't fully discarded. + * Bail here and try again later after discarding everything. + */ + spin_lock(&fs_info->discard_ctl.lock); + if (!list_empty(&block_group->discard_list)) { + spin_unlock(&fs_info->discard_ctl.lock); + btrfs_dec_block_group_ro(block_group); + btrfs_discard_queue_work(&fs_info->discard_ctl, + block_group); + goto end_trans; + } + spin_unlock(&fs_info->discard_ctl.lock); + /* Reset pinned so btrfs_put_block_group doesn't complain */ spin_lock(&space_info->lock); spin_lock(&block_group->lock); @@ -1603,6 +1633,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->discard_list); INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 9b409676c4b2..884defd61dcd 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -116,7 +116,11 @@ struct btrfs_block_group { /* For read-only block groups */ struct list_head ro_list; + /* For discard operations */ atomic_t trimming; + struct list_head discard_list; + int discard_index; + u64 discard_eligible_time; /* For dirty block groups */ struct list_head dirty_list; @@ -158,6 +162,11 @@ struct btrfs_block_group { struct btrfs_full_stripe_locks_tree full_stripe_locks_root; }; +static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +{ + return (block_group->start + block_group->length); +} + #ifdef CONFIG_BTRFS_DEBUG static inline int btrfs_should_fragment_free_space( struct btrfs_block_group *block_group) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2f6c21ea84af..f7b429277089 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -440,6 +440,21 @@ struct btrfs_full_stripe_locks_tree { struct mutex lock; }; +/* Discard control. */ +/* + * Async discard uses multiple lists to differentiate the discard filter + * parameters. + */ +#define BTRFS_NR_DISCARD_LISTS 1 + +struct btrfs_discard_ctl { + struct workqueue_struct *discard_workers; + struct delayed_work work; + spinlock_t lock; + struct btrfs_block_group *block_group; + struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; +}; + /* delayed seq elem */ struct seq_list { struct list_head list; @@ -526,6 +541,9 @@ enum { * so we don't need to offload checksums to workqueues. */ BTRFS_FS_CSUM_IMPL_FAST, + + /* Indicate that the discard workqueue can service discards. */ + BTRFS_FS_DISCARD_RUNNING, }; struct btrfs_fs_info { @@ -816,6 +834,8 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_parity_workers; + struct btrfs_discard_ctl discard_ctl; + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; #endif @@ -1189,6 +1209,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) #define BTRFS_MOUNT_REF_VERIFY (1 << 28) +#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c new file mode 100644 index 000000000000..5924e757471b --- /dev/null +++ b/fs/btrfs/discard.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "block-group.h" +#include "discard.h" +#include "free-space-cache.h" + +/* This is an initial delay to give some chance for block reuse */ +#define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC) + +static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + return &discard_ctl->discard_list[block_group->discard_index]; +} + +static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + spin_lock(&discard_ctl->lock); + + if (!btrfs_run_discard_work(discard_ctl)) { + spin_unlock(&discard_ctl->lock); + return; + } + + if (list_empty(&block_group->discard_list)) + block_group->discard_eligible_time = (ktime_get_ns() + + BTRFS_DISCARD_DELAY); + + list_move_tail(&block_group->discard_list, + get_discard_list(discard_ctl, block_group)); + + spin_unlock(&discard_ctl->lock); +} + +static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + bool running = false; + + spin_lock(&discard_ctl->lock); + + if (block_group == discard_ctl->block_group) { + running = true; + discard_ctl->block_group = NULL; + } + + block_group->discard_eligible_time = 0; + list_del_init(&block_group->discard_list); + + spin_unlock(&discard_ctl->lock); + + return running; +} + +/** + * find_next_block_group - find block_group that's up next for discarding + * @discard_ctl: discard control + * @now: current time + * + * Iterate over the discard lists to find the next block_group up for + * discarding checking the discard_eligible_time of block_group. + */ +static struct btrfs_block_group *find_next_block_group( + struct btrfs_discard_ctl *discard_ctl, + u64 now) +{ + struct btrfs_block_group *ret_block_group = NULL, *block_group; + int i; + + for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { + struct list_head *discard_list = &discard_ctl->discard_list[i]; + + if (!list_empty(discard_list)) { + block_group = list_first_entry(discard_list, + struct btrfs_block_group, + discard_list); + + if (!ret_block_group) + ret_block_group = block_group; + + if (ret_block_group->discard_eligible_time < now) + break; + + if (ret_block_group->discard_eligible_time > + block_group->discard_eligible_time) + ret_block_group = block_group; + } + } + + return ret_block_group; +} + +/** + * peek_discard_list - wrap find_next_block_group() + * @discard_ctl: discard control + * + * This wraps find_next_block_group() and sets the block_group to be in use. + */ +static struct btrfs_block_group *peek_discard_list( + struct btrfs_discard_ctl *discard_ctl) +{ + struct btrfs_block_group *block_group; + const u64 now = ktime_get_ns(); + + spin_lock(&discard_ctl->lock); + + block_group = find_next_block_group(discard_ctl, now); + + if (block_group && now < block_group->discard_eligible_time) + block_group = NULL; + + discard_ctl->block_group = block_group; + + spin_unlock(&discard_ctl->lock); + + return block_group; +} + +/** + * btrfs_discard_cancel_work - remove a block_group from the discard lists + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * This removes @block_group from the discard lists. If necessary, it waits on + * the current work and then reschedules the delayed work. + */ +void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (remove_from_discard_list(discard_ctl, block_group)) { + cancel_delayed_work_sync(&discard_ctl->work); + btrfs_discard_schedule_work(discard_ctl, true); + } +} + +/** + * btrfs_discard_queue_work - handles queuing the block_groups + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * This maintains the LRU order of the discard lists. + */ +void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + return; + + add_to_discard_list(discard_ctl, block_group); + + if (!delayed_work_pending(&discard_ctl->work)) + btrfs_discard_schedule_work(discard_ctl, false); +} + +/** + * btrfs_discard_schedule_work - responsible for scheduling the discard work + * @discard_ctl: discard control + * @override: override the current timer + * + * Discards are issued by a delayed workqueue item. @override is used to + * update the current delay as the baseline delay interview is reevaluated + * on transaction commit. This is also maxed with any other rate limit. + */ +void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, + bool override) +{ + struct btrfs_block_group *block_group; + const u64 now = ktime_get_ns(); + + spin_lock(&discard_ctl->lock); + + if (!btrfs_run_discard_work(discard_ctl)) + goto out; + + if (!override && delayed_work_pending(&discard_ctl->work)) + goto out; + + block_group = find_next_block_group(discard_ctl, now); + if (block_group) { + u64 delay = 0; + + if (now < block_group->discard_eligible_time) + delay = nsecs_to_jiffies( + block_group->discard_eligible_time - now); + + mod_delayed_work(discard_ctl->discard_workers, + &discard_ctl->work, delay); + } +out: + spin_unlock(&discard_ctl->lock); +} + +/** + * btrfs_discard_workfn - discard work function + * @work: work + * + * This finds the next block_group to start discarding and then discards it. + */ +static void btrfs_discard_workfn(struct work_struct *work) +{ + struct btrfs_discard_ctl *discard_ctl; + struct btrfs_block_group *block_group; + u64 trimmed = 0; + + discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work); + + block_group = peek_discard_list(discard_ctl); + if (!block_group || !btrfs_run_discard_work(discard_ctl)) + return; + + btrfs_trim_block_group(block_group, &trimmed, block_group->start, + btrfs_block_group_end(block_group), 0); + + remove_from_discard_list(discard_ctl, block_group); + btrfs_discard_schedule_work(discard_ctl, false); +} + +/** + * btrfs_run_discard_work - determines if async discard should be running + * @discard_ctl: discard control + * + * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. + */ +bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +{ + struct btrfs_fs_info *fs_info = container_of(discard_ctl, + struct btrfs_fs_info, + discard_ctl); + + return (!(fs_info->sb->s_flags & SB_RDONLY) && + test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); +} + +void btrfs_discard_resume(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + btrfs_discard_cleanup(fs_info); + return; + } + + set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); +} + +void btrfs_discard_stop(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); +} + +void btrfs_discard_init(struct btrfs_fs_info *fs_info) +{ + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + int i; + + spin_lock_init(&discard_ctl->lock); + INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn); + + for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) + INIT_LIST_HEAD(&discard_ctl->discard_list[i]); +} + +void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info) +{ + btrfs_discard_stop(fs_info); + cancel_delayed_work_sync(&fs_info->discard_ctl.work); +} diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h new file mode 100644 index 000000000000..f3775e84d35a --- /dev/null +++ b/fs/btrfs/discard.h @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef BTRFS_DISCARD_H +#define BTRFS_DISCARD_H + +struct btrfs_fs_info; +struct btrfs_discard_ctl; +struct btrfs_block_group; + +void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group); +void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group); +void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, + bool override); +bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl); + +void btrfs_discard_resume(struct btrfs_fs_info *fs_info); +void btrfs_discard_stop(struct btrfs_fs_info *fs_info); +void btrfs_discard_init(struct btrfs_fs_info *fs_info); +void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 881aba162e4e..5ce2801f8388 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -41,6 +41,7 @@ #include "tree-checker.h" #include "ref-verify.h" #include "block-group.h" +#include "discard.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -1953,6 +1954,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); + if (fs_info->discard_ctl.discard_workers) + destroy_workqueue(fs_info->discard_ctl.discard_workers); /* * Now that all other work queues are destroyed, we can safely destroy * the queues used for metadata I/O, since tasks from those other work @@ -2148,6 +2151,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); + fs_info->discard_ctl.discard_workers = + alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && @@ -2158,7 +2163,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->qgroup_rescan_workers)) { + fs_info->qgroup_rescan_workers && + fs_info->discard_ctl.discard_workers)) { return -ENOMEM; } @@ -2792,6 +2798,7 @@ int __cold open_ctree(struct super_block *sb, btrfs_init_dev_replace_locks(fs_info); btrfs_init_qgroup(fs_info); + btrfs_discard_init(fs_info); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -3255,6 +3262,7 @@ int __cold open_ctree(struct super_block *sb, } btrfs_qgroup_rescan_resume(fs_info); + btrfs_discard_resume(fs_info); if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); @@ -3971,6 +3979,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); + /* Cancel or finish ongoing discard work */ + btrfs_discard_cleanup(fs_info); + if (!sb_rdonly(fs_info->sb)) { /* * The cleaner kthread is stopped, so do one final pass over diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1a8bf943c3e7..2c12366cfde5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -32,6 +32,7 @@ #include "block-rsv.h" #include "delalloc-space.h" #include "block-group.h" +#include "discard.h" #undef SCRAMBLE_DELAYED_REFS @@ -2934,6 +2935,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) cond_resched(); } + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_schedule_work(&fs_info->discard_ctl, true); + /* * Transaction is finished. We don't need the lock anymore. We * do need to clean up the block groups in case of a transaction diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 063f4db44024..fdc5401f3877 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -21,6 +21,7 @@ #include "space-info.h" #include "delalloc-space.h" #include "block-group.h" +#include "discard.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K @@ -755,9 +756,11 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, /* * Sync discard ensures that the free space cache is always * trimmed. So when reading this in, the state should reflect - * that. + * that. We also do this for async as a stop gap for lack of + * persistence. */ - if (btrfs_test_opt(fs_info, DISCARD_SYNC)) + if (btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC)) e->trim_state = BTRFS_TRIM_STATE_TRIMMED; if (!e->bytes) { @@ -2382,6 +2385,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { + struct btrfs_block_group *block_group = ctl->private; struct btrfs_free_space *info; int ret = 0; @@ -2431,6 +2435,9 @@ out: ASSERT(ret != -EEXIST); } + if (trim_state != BTRFS_TRIM_STATE_TRIMMED) + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + return ret; } @@ -2447,6 +2454,25 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group, bytenr, size, trim_state); } +/* + * This is a subtle distinction because when adding free space back in general, + * we want it to be added as untrimmed for async. But in the case where we add + * it on loading of a block group, we want to consider it trimmed. + */ +int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || + btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + trim_state = BTRFS_TRIM_STATE_TRIMMED; + + return __btrfs_add_free_space(block_group->fs_info, + block_group->free_space_ctl, + bytenr, size, trim_state); +} + int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes) { @@ -3208,6 +3234,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) static int do_trimming(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 bytes, u64 reserved_start, u64 reserved_bytes, + enum btrfs_trim_state reserved_trim_state, struct btrfs_trim_range *trim_entry) { struct btrfs_space_info *space_info = block_group->space_info; @@ -3215,6 +3242,9 @@ static int do_trimming(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; int update = 0; + const u64 end = start + bytes; + const u64 reserved_end = reserved_start + reserved_bytes; + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; u64 trimmed = 0; spin_lock(&space_info->lock); @@ -3228,11 +3258,20 @@ static int do_trimming(struct btrfs_block_group *block_group, spin_unlock(&space_info->lock); ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); - if (!ret) + if (!ret) { *total_trimmed += trimmed; + trim_state = BTRFS_TRIM_STATE_TRIMMED; + } mutex_lock(&ctl->cache_writeout_mutex); - btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + if (reserved_start < start) + __btrfs_add_free_space(fs_info, ctl, reserved_start, + start - reserved_start, + reserved_trim_state); + if (start + bytes < reserved_start + reserved_bytes) + __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end, + reserved_trim_state); + __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state); list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3259,6 +3298,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, int ret = 0; u64 extent_start; u64 extent_bytes; + enum btrfs_trim_state extent_trim_state; u64 bytes; while (start < end) { @@ -3300,6 +3340,7 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, extent_start = entry->offset; extent_bytes = entry->bytes; + extent_trim_state = entry->trim_state; start = max(start, extent_start); bytes = min(extent_start + extent_bytes, end) - start; if (bytes < minlen) { @@ -3318,7 +3359,8 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - extent_start, extent_bytes, &trim_entry); + extent_start, extent_bytes, extent_trim_state, + &trim_entry); if (ret) break; next: @@ -3442,7 +3484,7 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - start, bytes, &trim_entry); + start, bytes, 0, &trim_entry); if (ret) { reset_trimming_bitmap(ctl, offset); break; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 5f25d94c9946..6a85a5d16343 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -113,6 +113,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); +int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, + u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 08ac6a7a67f0..a906315efd19 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -46,6 +46,7 @@ #include "sysfs.h" #include "tests/btrfs-tests.h" #include "block-group.h" +#include "discard.h" #include "qgroup.h" #define CREATE_TRACE_POINTS @@ -146,6 +147,8 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function if (sb_rdonly(sb)) return; + btrfs_discard_stop(fs_info); + /* btrfs handle error by forcing the filesystem readonly */ sb->s_flags |= SB_RDONLY; btrfs_info(fs_info, "forced readonly"); @@ -313,6 +316,7 @@ enum { Opt_datasum, Opt_nodatasum, Opt_defrag, Opt_nodefrag, Opt_discard, Opt_nodiscard, + Opt_discard_mode, Opt_nologreplay, Opt_norecovery, Opt_ratio, @@ -375,6 +379,7 @@ static const match_table_t tokens = { {Opt_defrag, "autodefrag"}, {Opt_nodefrag, "noautodefrag"}, {Opt_discard, "discard"}, + {Opt_discard_mode, "discard=%s"}, {Opt_nodiscard, "nodiscard"}, {Opt_nologreplay, "nologreplay"}, {Opt_norecovery, "norecovery"}, @@ -695,12 +700,26 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, info->metadata_ratio); break; case Opt_discard: - btrfs_set_and_info(info, DISCARD_SYNC, - "turning on sync discard"); + case Opt_discard_mode: + if (token == Opt_discard || + strcmp(args[0].from, "sync") == 0) { + btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC); + btrfs_set_and_info(info, DISCARD_SYNC, + "turning on sync discard"); + } else if (strcmp(args[0].from, "async") == 0) { + btrfs_clear_opt(info->mount_opt, DISCARD_SYNC); + btrfs_set_and_info(info, DISCARD_ASYNC, + "turning on async discard"); + } else { + ret = -EINVAL; + goto out; + } break; case Opt_nodiscard: btrfs_clear_and_info(info, DISCARD_SYNC, "turning off discard"); + btrfs_clear_and_info(info, DISCARD_ASYNC, + "turning off async discard"); break; case Opt_space_cache: case Opt_space_cache_version: @@ -1324,6 +1343,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) seq_puts(seq, ",discard"); + if (btrfs_test_opt(info, DISCARD_ASYNC)) + seq_puts(seq, ",discard=async"); if (!(info->sb->s_flags & SB_POSIXACL)) seq_puts(seq, ",noacl"); if (btrfs_test_opt(info, SPACE_CACHE)) @@ -1713,6 +1734,14 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, btrfs_cleanup_defrag_inodes(fs_info); } + /* If we toggled discard async */ + if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && + btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_resume(fs_info); + else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && + !btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_cleanup(fs_info); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); } @@ -1760,6 +1789,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) */ cancel_work_sync(&fs_info->async_reclaim_work); + btrfs_discard_cleanup(fs_info); + /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); /* avoid complains from lockdep et al. */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 86990b7a60ed..d184a994c392 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -30,6 +30,7 @@ #include "tree-checker.h" #include "space-info.h" #include "block-group.h" +#include "discard.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -2870,6 +2871,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; + struct btrfs_block_group *block_group; int ret; /* @@ -2893,6 +2895,12 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (ret) return ret; + block_group = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!block_group) + return -ENOENT; + btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + btrfs_put_block_group(block_group); + trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { -- cgit From 4e19443da1941050b346f8fc4c368aa68413bc88 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jan 2020 09:17:06 -0500 Subject: btrfs: free block groups after free'ing fs trees Sometimes when running generic/475 we would trip the WARN_ON(cache->reserved) check when free'ing the block groups on umount. This is because sometimes we don't commit the transaction because of IO errors and thus do not cleanup the tree logs until at umount time. These blocks are still reserved until they are cleaned up, but they aren't cleaned up until _after_ we do the free block groups work. Fix this by moving the free after free'ing the fs roots, that way all of the tree logs are cleaned up and we have a properly cleaned fs. A bunch of loops of generic/475 confirmed this fixes the problem. CC: stable@vger.kernel.org # 4.9+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5ce2801f8388..aea48d6ddc0c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4030,11 +4030,18 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); - btrfs_free_block_groups(fs_info); - clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); + /* + * We must free the block groups after dropping the fs_roots as we could + * have had an IO error and have left over tree log blocks that aren't + * cleaned up until the fs roots are freed. This makes the block group + * accounting appear to be wrong because there's pending reserved bytes, + * so make sure we do the block group cleanup afterwards. + */ + btrfs_free_block_groups(fs_info); + iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -- cgit From 7227ff4de55d931bbdc156c8ef0ce4f100c78a5b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 22 Jan 2020 12:23:20 +0000 Subject: Btrfs: fix race between adding and putting tree mod seq elements and nodes There is a race between adding and removing elements to the tree mod log list and rbtree that can lead to use-after-free problems. Consider the following example that explains how/why the problems happens: 1) Task A has mod log element with sequence number 200. It currently is the only element in the mod log list; 2) Task A calls btrfs_put_tree_mod_seq() because it no longer needs to access the tree mod log. When it enters the function, it initializes 'min_seq' to (u64)-1. Then it acquires the lock 'tree_mod_seq_lock' before checking if there are other elements in the mod seq list. Since the list it empty, 'min_seq' remains set to (u64)-1. Then it unlocks the lock 'tree_mod_seq_lock'; 3) Before task A acquires the lock 'tree_mod_log_lock', task B adds itself to the mod seq list through btrfs_get_tree_mod_seq() and gets a sequence number of 201; 4) Some other task, name it task C, modifies a btree and because there elements in the mod seq list, it adds a tree mod elem to the tree mod log rbtree. That node added to the mod log rbtree is assigned a sequence number of 202; 5) Task B, which is doing fiemap and resolving indirect back references, calls btrfs get_old_root(), with 'time_seq' == 201, which in turn calls tree_mod_log_search() - the search returns the mod log node from the rbtree with sequence number 202, created by task C; 6) Task A now acquires the lock 'tree_mod_log_lock', starts iterating the mod log rbtree and finds the node with sequence number 202. Since 202 is less than the previously computed 'min_seq', (u64)-1, it removes the node and frees it; 7) Task B still has a pointer to the node with sequence number 202, and it dereferences the pointer itself and through the call to __tree_mod_log_rewind(), resulting in a use-after-free problem. This issue can be triggered sporadically with the test case generic/561 from fstests, and it happens more frequently with a higher number of duperemove processes. When it happens to me, it either freezes the VM or it produces a trace like the following before crashing: [ 1245.321140] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI [ 1245.321200] CPU: 1 PID: 26997 Comm: pool Not tainted 5.5.0-rc6-btrfs-next-52 #1 [ 1245.321235] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014 [ 1245.321287] RIP: 0010:rb_next+0x16/0x50 [ 1245.321307] Code: .... [ 1245.321372] RSP: 0018:ffffa151c4d039b0 EFLAGS: 00010202 [ 1245.321388] RAX: 6b6b6b6b6b6b6b6b RBX: ffff8ae221363c80 RCX: 6b6b6b6b6b6b6b6b [ 1245.321409] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff8ae221363c80 [ 1245.321439] RBP: ffff8ae20fcc4688 R08: 0000000000000002 R09: 0000000000000000 [ 1245.321475] R10: ffff8ae20b120910 R11: 00000000243f8bb1 R12: 0000000000000038 [ 1245.321506] R13: ffff8ae221363c80 R14: 000000000000075f R15: ffff8ae223f762b8 [ 1245.321539] FS: 00007fdee1ec7700(0000) GS:ffff8ae236c80000(0000) knlGS:0000000000000000 [ 1245.321591] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1245.321614] CR2: 00007fded4030c48 CR3: 000000021da16003 CR4: 00000000003606e0 [ 1245.321642] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1245.321668] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1245.321706] Call Trace: [ 1245.321798] __tree_mod_log_rewind+0xbf/0x280 [btrfs] [ 1245.321841] btrfs_search_old_slot+0x105/0xd00 [btrfs] [ 1245.321877] resolve_indirect_refs+0x1eb/0xc60 [btrfs] [ 1245.321912] find_parent_nodes+0x3dc/0x11b0 [btrfs] [ 1245.321947] btrfs_check_shared+0x115/0x1c0 [btrfs] [ 1245.321980] ? extent_fiemap+0x59d/0x6d0 [btrfs] [ 1245.322029] extent_fiemap+0x59d/0x6d0 [btrfs] [ 1245.322066] do_vfs_ioctl+0x45a/0x750 [ 1245.322081] ksys_ioctl+0x70/0x80 [ 1245.322092] ? trace_hardirqs_off_thunk+0x1a/0x1c [ 1245.322113] __x64_sys_ioctl+0x16/0x20 [ 1245.322126] do_syscall_64+0x5c/0x280 [ 1245.322139] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 1245.322155] RIP: 0033:0x7fdee3942dd7 [ 1245.322177] Code: .... [ 1245.322258] RSP: 002b:00007fdee1ec6c88 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 1245.322294] RAX: ffffffffffffffda RBX: 00007fded40210d8 RCX: 00007fdee3942dd7 [ 1245.322314] RDX: 00007fded40210d8 RSI: 00000000c020660b RDI: 0000000000000004 [ 1245.322337] RBP: 0000562aa89e7510 R08: 0000000000000000 R09: 00007fdee1ec6d44 [ 1245.322369] R10: 0000000000000073 R11: 0000000000000246 R12: 00007fdee1ec6d48 [ 1245.322390] R13: 00007fdee1ec6d40 R14: 00007fded40210d0 R15: 00007fdee1ec6d50 [ 1245.322423] Modules linked in: .... [ 1245.323443] ---[ end trace 01de1e9ec5dff3cd ]--- Fix this by ensuring that btrfs_put_tree_mod_seq() computes the minimum sequence number and iterates the rbtree while holding the lock 'tree_mod_log_lock' in write mode. Also get rid of the 'tree_mod_seq_lock' lock, since it is now redundant. Fixes: bd989ba359f2ac ("Btrfs: add tree modification log functions") Fixes: 097b8a7c9e48e2 ("Btrfs: join tree mod log code with the code holding back delayed refs") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 8 ++------ fs/btrfs/ctree.h | 6 ++---- fs/btrfs/delayed-ref.c | 8 ++++---- fs/btrfs/disk-io.c | 1 - fs/btrfs/tests/btrfs-tests.c | 1 - 5 files changed, 8 insertions(+), 16 deletions(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 24658b5a5787..f2ec1a9bae28 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -326,12 +326,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { write_lock(&fs_info->tree_mod_log_lock); - spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - spin_unlock(&fs_info->tree_mod_seq_lock); write_unlock(&fs_info->tree_mod_log_lock); return elem->seq; @@ -351,7 +349,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, if (!seq_putting) return; - spin_lock(&fs_info->tree_mod_seq_lock); + write_lock(&fs_info->tree_mod_log_lock); list_del(&elem->list); elem->seq = 0; @@ -362,19 +360,17 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * blocker with lower sequence number exists, we * cannot remove anything from the log */ - spin_unlock(&fs_info->tree_mod_seq_lock); + write_unlock(&fs_info->tree_mod_log_lock); return; } min_seq = cur_elem->seq; } } - spin_unlock(&fs_info->tree_mod_seq_lock); /* * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. */ - write_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f90b82050d2d..36df977b64d9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -714,14 +714,12 @@ struct btrfs_fs_info { atomic_t nr_delayed_iputs; wait_queue_head_t delayed_iputs_wait; - /* this protects tree_mod_seq_list */ - spinlock_t tree_mod_seq_lock; atomic64_t tree_mod_seq; - struct list_head tree_mod_seq_list; - /* this protects tree_mod_log */ + /* this protects tree_mod_log and tree_mod_seq_list */ rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; atomic_t async_delalloc_pages; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index df3bd880061d..dfdb7d4f8406 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -492,7 +492,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, if (head->is_data) return; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { struct seq_list *elem; @@ -500,7 +500,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct seq_list, list); seq = elem->seq; } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); again: for (node = rb_first_cached(&head->ref_tree); node; @@ -518,7 +518,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) struct seq_list *elem; int ret = 0; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { elem = list_first_entry(&fs_info->tree_mod_seq_list, struct seq_list, list); @@ -531,7 +531,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) } } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index aea48d6ddc0c..7fa9bb79ad08 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2697,7 +2697,6 @@ int __cold open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index c12b91ff5f56..84fb3fa940a6 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -142,7 +142,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) spin_lock_init(&fs_info->qgroup_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); mutex_init(&fs_info->qgroup_ioctl_lock); mutex_init(&fs_info->qgroup_rescan_lock); rwlock_init(&fs_info->tree_mod_log_lock); -- cgit From e8294f2f6aa6208ed0923aa6d70cea3be178309a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 17:12:16 +0100 Subject: btrfs: print message when tree-log replay starts There's no logged information about tree-log replay although this is something that points to previous unclean unmount. Other filesystems report that as well. Suggested-by: Chris Murphy CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7fa9bb79ad08..89422aa8e9d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3164,6 +3164,7 @@ int __cold open_ctree(struct super_block *sb, /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; -- cgit From 81f7eb00ff5bb8326e82503a32809421d14abb8a Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 11 Feb 2020 15:25:37 +0800 Subject: btrfs: destroy qgroup extent records on transaction abort We clean up the delayed references when we abort a transaction but we leave the pending qgroup extent records behind, leaking memory. This patch destroys the extent records when we destroy the delayed refs and makes sure ensure they're gone before releasing the transaction. Fixes: 3368d001ba5d ("btrfs: qgroup: Record possible quota-related extent for qgroup.") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Jeff Mahoney [ Rebased to latest upstream, remove to_qgroup() helper, use rbtree_postorder_for_each_entry_safe() wrapper ] Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + fs/btrfs/qgroup.c | 13 +++++++++++++ fs/btrfs/qgroup.h | 1 + fs/btrfs/transaction.c | 2 ++ 4 files changed, 17 insertions(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 89422aa8e9d1..d7fec89974cb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4276,6 +4276,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, cond_resched(); spin_lock(&delayed_refs->lock); } + btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 98d9a50352d6..ff1870ff3474 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4002,3 +4002,16 @@ out: } return ret; } + +void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) +{ + struct btrfs_qgroup_extent_record *entry; + struct btrfs_qgroup_extent_record *next; + struct rb_root *root; + + root = &trans->delayed_refs.dirty_extent_root; + rbtree_postorder_for_each_entry_safe(entry, next, root, node) { + ulist_free(entry->old_roots); + kfree(entry); + } +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 236f12224d52..1bc654459469 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -414,5 +414,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, u64 last_snapshot); int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); +void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 33dcc88b428a..beb6c69cd1e5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -121,6 +121,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT( &transaction->delayed_refs.href_root.rb_root)); + WARN_ON(!RB_EMPTY_ROOT( + &transaction->delayed_refs.dirty_extent_root)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", -- cgit From 315bf8ef914f31d51d084af950703aa1e09a728c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 13 Feb 2020 10:47:28 -0500 Subject: btrfs: reset fs_root to NULL on error in open_ctree While running my error injection script I hit a panic when we tried to clean up the fs_root when freeing the fs_root. This is because fs_info->fs_root == PTR_ERR(-EIO), which isn't great. Fix this by setting fs_info->fs_root = NULL; if we fail to read the root. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d7fec89974cb..197352f23534 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3200,6 +3200,7 @@ int __cold open_ctree(struct super_block *sb, if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); btrfs_warn(fs_info, "failed to read fs tree: %d", err); + fs_info->fs_root = NULL; goto fail_qgroup; } -- cgit From 1e90315149f3fe148e114a5de86f0196d1c21fa5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 13 Feb 2020 10:47:29 -0500 Subject: btrfs: do not check delayed items are empty for single transaction cleanup btrfs_assert_delayed_root_empty() will check if the delayed root is completely empty, but this is a filesystem-wide check. On cleanup we may have allowed other transactions to begin, for whatever reason, and thus the delayed root is not empty. So remove this check from cleanup_one_transation(). This however can stay in btrfs_cleanup_transaction(), because it checks only after all of the transactions have been properly cleaned up, and thus is valid. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs/disk-io.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 197352f23534..c6c9a6a8e6c8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4503,7 +4503,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, wake_up(&fs_info->transaction_wait); btrfs_destroy_delayed_inodes(fs_info); - btrfs_assert_delayed_root_empty(fs_info); btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); -- cgit