From a5d16333612718569ffd26064270e535cb9c3928 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Dec 2011 20:08:40 -0500 Subject: Btrfs: check if the to-be-added device is writable If we call ioctl(BTRFS_IOC_ADD_DEV) directly, we'll succeed in adding a readonly device to a btrfs filesystem, and btrfs will write to that device, emitting kernel errors: [ 3109.833692] lost page write due to I/O error on loop2 [ 3109.833720] lost page write due to I/O error on loop2 ... Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c37433d3cd82..0a8c8f8304b1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1611,7 +1611,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); -- cgit From 5dbc8fca8ef5d719014f22345d990e957dcfc692 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 9 Dec 2011 11:07:37 -0500 Subject: Btrfs: fix btrfs_end_bio to deal with write errors to a single mirror btrfs_end_bio checks the number of errors on a bio against the max number of errors allowed before sending any EIOs up to the higher levels. If we got enough copies of the bio done for a given raid level, it is supposed to clear the bio error flag and return success. We have pointers to the original bio sent down by the higher layers and pointers to any cloned bios we made for raid purposes. If the original bio happens to be the one that got an io error, but not the last one to finish, it might not have the BIO_UPTODATE bit set. Then, when the last bio does finish, we'll call bio_end_io on the original bio. It won't have the uptodate bit set and we'll end up sending EIO to the higher layers. We already had a check for this, it just was conditional on getting the IO error on the very last bio. Make the check unconditional so we eat the EIOs properly. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0a8c8f8304b1..91ea57a1474a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3258,7 +3258,7 @@ static void btrfs_end_bio(struct bio *bio, int err) */ if (atomic_read(&bbio->error) > bbio->max_errors) { err = -EIO; - } else if (err) { + } else { /* * this bio is actually up to date, we didn't * go over the max number of errors -- cgit From d85c8a6f1bc083279215ff6e79b7c292bf3ec905 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 15 Dec 2011 15:38:41 -0500 Subject: Btrfs: unplug every once and a while The btrfs io submission threads can build up massive plug lists. This keeps things more reasonable so we don't hand over huge dumps of IO at once. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 91ea57a1474a..f4b839fd3c9d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -295,6 +295,12 @@ loop_lock: btrfs_requeue_work(&device->work); goto done; } + /* unplug every 64 requests just for good measure */ + if (batch_run % 64 == 0) { + blk_finish_plug(&plug); + blk_start_plug(&plug); + sync_pending = 0; + } } cond_resched(); -- cgit From 21adbd5cbb5344a3fca6bb7ddb2ab6cb03c44546 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Wed, 9 Nov 2011 13:44:05 +0100 Subject: Btrfs: integrate integrity check module into btrfs This is the last part of the patch series. It modifies the btrfs code to use the integrity check module if configured to do so with the define BTRFS_FS_CHECK_INTEGRITY. If this define is not set, the only effective change is that code is added that handles the mount option to activate the integrity check. If the mount option is set and the define BTRFS_FS_CHECK_INTEGRITY is not set, that code complains in the log and the mount fails with EINVAL. Add the mount option to activate the usage of the integrity check code. Add invocation of btrfs integrity check code init and cleanup function on mount and umount, respectively. Add hook to call btrfs integrity check code version of submit_bh/submit_bio. Signed-off-by: Stefan Behrens --- fs/btrfs/ctree.h | 8 +++++++- fs/btrfs/disk-io.c | 26 ++++++++++++++++++++++++-- fs/btrfs/extent_io.c | 5 +++-- fs/btrfs/scrub.c | 5 +++-- fs/btrfs/super.c | 39 ++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 7 ++++--- 6 files changed, 79 insertions(+), 11 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67385033323d..39f6188688e6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -971,7 +971,7 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt:20; + unsigned long mount_opt:21; unsigned long compress_type:4; u64 max_inline; u64 alloc_start; @@ -1155,6 +1155,10 @@ struct btrfs_fs_info { int scrub_workers_refcnt; struct btrfs_workers scrub_workers; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* filesystem state */ u64 fs_state; @@ -1413,6 +1417,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define BTRFS_MOUNT_RECOVERY (1 << 18) +#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 19) +#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 20) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f9d5551e582..f363c6d9c3de 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -43,6 +43,7 @@ #include "tree-log.h" #include "free-space-cache.h" #include "inode-map.h" +#include "check-integrity.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -2001,6 +2002,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->scrub_pause_wait); init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + fs_info->check_integrity_print_mask = 0; +#endif sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2356,6 +2360,19 @@ retry_root_backup: btrfs_set_opt(fs_info->mount_opt, SSD); } +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + ret = btrfsic_mount(tree_root, fs_devices, + btrfs_test_opt(tree_root, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? + 1 : 0, + fs_info->check_integrity_print_mask); + if (ret) + printk(KERN_WARNING "btrfs: failed to initialize" + " integrity check module %s\n", sb->s_id); + } +#endif + /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0 && !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { @@ -2634,7 +2651,7 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - ret = submit_bh(WRITE_FUA, bh); + ret = btrfsic_submit_bh(WRITE_FUA, bh); if (ret) errors++; } @@ -2711,7 +2728,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) device->flush_bio = bio; bio_get(bio); - submit_bio(WRITE_FLUSH, bio); + btrfsic_submit_bio(WRITE_FLUSH, bio); return 0; } @@ -3057,6 +3074,11 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(root, CHECK_INTEGRITY)) + btrfsic_unmount(root, fs_info->fs_devices); +#endif + btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 49f3c9dc09f4..246669296e02 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -18,6 +18,7 @@ #include "ctree.h" #include "btrfs_inode.h" #include "volumes.h" +#include "check-integrity.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, } bio->bi_bdev = dev->bdev; bio_add_page(bio, page, length, start-page_offset(page)); - submit_bio(WRITE_SYNC, bio); + btrfsic_submit_bio(WRITE_SYNC, bio); wait_for_completion(&compl); if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { @@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ddf2c90d3fc0..567e148caca2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -25,6 +25,7 @@ #include "transaction.h" #include "backref.h" #include "extent_io.h" +#include "check-integrity.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -732,7 +733,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = scrub_fixup_end_io; bio->bi_private = &complete; - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); /* this will also unplug the queue */ wait_for_completion(&complete); @@ -958,7 +959,7 @@ static int scrub_submit(struct scrub_dev *sdev) sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, sbio->bio); + btrfsic_submit_bio(READ, sbio->bio); return 0; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 34a8b6112ea4..22a2015f1d7b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -165,7 +165,10 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_recovery, + Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_check_integrity_print_mask, + Opt_err, }; static match_table_t tokens = { @@ -200,6 +203,9 @@ static match_table_t tokens = { {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, + {Opt_check_integrity, "check_int"}, + {Opt_check_integrity_including_extent_data, "check_int_data"}, + {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, {Opt_err, NULL}, }; @@ -398,6 +404,37 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + case Opt_check_integrity_including_extent_data: + printk(KERN_INFO "btrfs: enabling check integrity" + " including extent data\n"); + btrfs_set_opt(info->mount_opt, + CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity: + printk(KERN_INFO "btrfs: enabling check integrity\n"); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); + break; + case Opt_check_integrity_print_mask: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->check_integrity_print_mask = intarg; + printk(KERN_INFO "btrfs:" + " check_integrity_print_mask 0x%x\n", + info->check_integrity_print_mask); + } + break; +#else + case Opt_check_integrity_including_extent_data: + case Opt_check_integrity: + case Opt_check_integrity_print_mask: + printk(KERN_ERR "btrfs: support for check_integrity*" + " not compiled in!\n"); + ret = -EINVAL; + goto out; +#endif case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9d..821334f6e3a1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -32,6 +32,7 @@ #include "print-tree.h" #include "volumes.h" #include "async-thread.h" +#include "check-integrity.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -246,7 +247,7 @@ loop_lock: sync_pending = 0; } - submit_bio(cur->bi_rw, cur); + btrfsic_submit_bio(cur->bi_rw, cur); num_run++; batch_run++; if (need_resched()) @@ -3304,7 +3305,7 @@ static noinline int schedule_bio(struct btrfs_root *root, /* don't bother with additional async steps for reads, right now */ if (!(rw & REQ_WRITE)) { bio_get(bio); - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); bio_put(bio); return 0; } @@ -3399,7 +3400,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (async_submit) schedule_bio(root, dev, rw, bio); else - submit_bio(rw, bio); + btrfsic_submit_bio(rw, bio); } else { bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; bio->bi_sector = logical >> 9; -- cgit From 1100373f8aa69e377386499350496e3d8565605f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 6 Jan 2012 15:47:38 -0500 Subject: Btrfs: use bigger metadata chunks on bigger filesystems The 256MB chunk is a little small on a huge FS. This scales up the chunk size. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9d..ac00e3aa80a1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2441,7 +2441,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_stripe_size = 256 * 1024 * 1024; + /* for larger filesystems, use larger metadata chunks */ + if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) + max_stripe_size = 1024 * 1024 * 1024; + else + max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = 8 * 1024 * 1024; -- cgit From 10f6327b5d26660e06120ba17cff993cb616b1d2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Nov 2011 15:05:22 -0500 Subject: btrfs: fix a deadlock in btrfs_scan_one_device() pathname resolution under a global mutex, taken on some paths in ->mount() is a Bad Idea(tm) - think what happens if said pathname resolution triggers automount of some btrfs instance and walks into attempt to grab the same mutex. Deadlock - we are waiting for daemon to finish walking the path, daemon is waiting for us to release the mutex... Signed-off-by: Al Viro --- fs/btrfs/volumes.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9d..96bb3c0a79b0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -706,8 +706,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, u64 devid; u64 transid; - mutex_lock(&uuid_mutex); - flags |= FMODE_EXCL; bdev = blkdev_get_by_path(path, flags, holder); @@ -716,6 +714,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error; } + mutex_lock(&uuid_mutex); ret = set_blocksize(bdev, 4096); if (ret) goto error_close; @@ -737,9 +736,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: + mutex_unlock(&uuid_mutex); blkdev_put(bdev, flags); error: - mutex_unlock(&uuid_mutex); return ret; } -- cgit From 125ccb0ae6806dbec31abf4a85448971df3b4e39 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 8 Dec 2011 15:07:24 +0800 Subject: Btrfs: don't pass a trans handle unnecessarily in volumes.c Some functions never use the transaction handle passed to them. Signed-off-by: Li Zefan --- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/volumes.c | 18 +++++++----------- fs/btrfs/volumes.h | 3 +-- 3 files changed, 9 insertions(+), 14 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8603ee4e3dfd..5b53479ce07b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7084,7 +7084,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * space to fit our block group in. */ if (device->total_bytes > device->bytes_used + min_free) { - ret = find_free_dev_extent(NULL, device, min_free, + ret = find_free_dev_extent(device, min_free, &dev_offset, NULL); if (!ret) dev_nr++; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9d..73f673c8d8d8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -829,7 +829,6 @@ out: /* * find_free_dev_extent - find free space in the specified device - * @trans: transaction handler * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @start: store the start of the free space. @@ -848,8 +847,7 @@ out: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *len) { struct btrfs_key key; @@ -893,7 +891,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0) { @@ -1469,8 +1467,7 @@ error_undo: /* * does all the dirty work required for changing file system's UUID. */ -static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int btrfs_prepare_sprout(struct btrfs_root *root) { struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; @@ -1695,7 +1692,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; - ret = btrfs_prepare_sprout(trans, root); + ret = btrfs_prepare_sprout(root); BUG_ON(ret); } @@ -2323,8 +2320,7 @@ done: return ret; } -static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int btrfs_add_system_chunk(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { @@ -2496,7 +2492,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (total_avail == 0) continue; - ret = find_free_dev_extent(trans, device, + ret = find_free_dev_extent(device, max_stripe_size * dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) @@ -2687,7 +2683,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(ret); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + ret = btrfs_add_system_chunk(chunk_root, &key, chunk, item_size); BUG_ON(ret); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 78f2d4d4f37f..c1701ec9d49f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -230,7 +230,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_balance(struct btrfs_root *dev_root); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, +int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); #endif -- cgit From de11cc12df17337979e0929d2831887432f236ca Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Dec 2011 12:55:47 +0800 Subject: Btrfs: don't pre-allocate btrfs bio We pre-allocate a btrfs bio with fixed size, and then may re-allocate memory if we find stripes are bigger than the fixed size. But this pre-allocation is not necessary. Also we don't have to calcuate the stripe number twice. Signed-off-by: Li Zefan --- fs/btrfs/volumes.c | 67 +++++++++++++++++------------------------------------- 1 file changed, 21 insertions(+), 46 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 73f673c8d8d8..540fdd25fb5e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2897,26 +2897,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 stripe_nr; u64 stripe_nr_orig; u64 stripe_nr_end; - int stripes_allocated = 8; - int stripes_required = 1; int stripe_index; int i; + int ret = 0; int num_stripes; int max_errors = 0; struct btrfs_bio *bbio = NULL; - if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) - stripes_allocated = 1; -again: - if (bbio_ret) { - bbio = kzalloc(btrfs_bio_size(stripes_allocated), - GFP_NOFS); - if (!bbio) - return -ENOMEM; - - atomic_set(&bbio->error, 0); - } - read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); @@ -2935,32 +2922,6 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our btrfs_bio struct is too small, back off and try again */ - if (rw & REQ_WRITE) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP)) { - stripes_required = map->num_stripes; - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - stripes_required = map->sub_stripes; - max_errors = 1; - } - } - if (rw & REQ_DISCARD) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID10)) { - stripes_required = map->num_stripes; - } - } - if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && - stripes_allocated < stripes_required) { - stripes_allocated = map->num_stripes; - free_extent_map(em); - kfree(bbio); - goto again; - } stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride @@ -3055,6 +3016,13 @@ again: } BUG_ON(stripe_index >= map->num_stripes); + bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + atomic_set(&bbio->error, 0); + if (rw & REQ_DISCARD) { for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = @@ -3151,15 +3119,22 @@ again: stripe_index++; } } - if (bbio_ret) { - *bbio_ret = bbio; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; + + if (rw & REQ_WRITE) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + max_errors = 1; + } } + + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; out: free_extent_map(em); - return 0; + return ret; } int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, -- cgit From ec9ef7a13be4dcce964c8503e8999087945e5b9e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Dec 2011 14:06:42 +0800 Subject: Btrfs: simplfy calculation of stripe length for discard operation For btrfs raid, while discarding a range of space, we'll need to know the start offset and length to discard for each device, and it's done in btrfs_map_block(). However the calculation is a bit complex for raid0 and raid10, so I reimplement it based on a fact that: dev1 dev2 dev3 (raid0) ----------------------------------- s0 s3 s6 s1 s4 s7 s2 s5 Each device has (total_stripes / nr_dev) stripes, or plus one. Signed-off-by: Li Zefan --- fs/btrfs/volumes.c | 95 ++++++++++++++++++------------------------------------ 1 file changed, 31 insertions(+), 64 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 540fdd25fb5e..563ef650850e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3024,80 +3024,47 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, atomic_set(&bbio->error, 0); if (rw & REQ_DISCARD) { + int factor = 0; + int sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + + if (map->type & + (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + stripes_per_dev = div_u64_rem(stripe_nr_end - + stripe_nr_orig, + factor, + &remaining_stripes); + } + for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; bbio->stripes[i].dev = map->stripes[stripe_index].dev; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - u64 stripes; - u32 last_stripe = 0; - int j; - - div_u64_rem(stripe_nr_end - 1, - map->num_stripes, - &last_stripe); - - for (j = 0; j < map->num_stripes; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - map->num_stripes, &test); - if (test == stripe_index) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, map->num_stripes); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i == 0) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + if (i < sub_stripes) bbio->stripes[i].length -= stripe_offset; - stripe_offset = 0; - } - if (stripe_index == last_stripe) - bbio->stripes[i].length -= - stripe_end_offset; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - u64 stripes; - int j; - int factor = map->num_stripes / - map->sub_stripes; - u32 last_stripe = 0; - - div_u64_rem(stripe_nr_end - 1, - factor, &last_stripe); - last_stripe *= map->sub_stripes; - - for (j = 0; j < factor; j++) { - u32 test; - - div_u64_rem(stripe_nr_end - 1 - j, - factor, &test); - - if (test == - stripe_index / map->sub_stripes) - break; - } - stripes = stripe_nr_end - 1 - j; - do_div(stripes, factor); - bbio->stripes[i].length = map->stripe_len * - (stripes - stripe_nr + 1); - - if (i < map->sub_stripes) { - bbio->stripes[i].length -= - stripe_offset; - if (i == map->sub_stripes - 1) - stripe_offset = 0; - } - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - map->sub_stripes - 1)) { + if ((i / sub_stripes + 1) % + sub_stripes == remaining_stripes) bbio->stripes[i].length -= stripe_end_offset; - } + if (i == sub_stripes - 1) + stripe_offset = 0; } else bbio->stripes[i].length = *length; -- cgit From b367e47fb3a70f5d24ebd6faf7d42436d485fb2d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Dec 2011 11:38:24 +0800 Subject: Btrfs: fix possible deadlock when opening a seed device The correct lock order is uuid_mutex -> volume_mutex -> chunk_mutex, but when we mount a filesystem which has backing seed devices, we have this lock chain: open_ctree() lock(chunk_mutex); read_chunk_tree(); read_one_dev(); open_seed_devices(); lock(uuid_mutex); and then we hit a lockdep splat. Signed-off-by: Li Zefan --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/volumes.c | 9 +++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f9d5551e582..858ab347413e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2270,9 +2270,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_chunk_tree(chunk_root); - mutex_unlock(&fs_info->chunk_mutex); if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 563ef650850e..fbb493b28d5a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3506,7 +3506,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) struct btrfs_fs_devices *fs_devices; int ret; - mutex_lock(&uuid_mutex); + BUG_ON(!mutex_is_locked(&uuid_mutex)); fs_devices = root->fs_info->fs_devices->seed; while (fs_devices) { @@ -3544,7 +3544,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) fs_devices->seed = root->fs_info->fs_devices->seed; root->fs_info->fs_devices->seed = fs_devices; out: - mutex_unlock(&uuid_mutex); return ret; } @@ -3687,6 +3686,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) if (!path) return -ENOMEM; + mutex_lock(&uuid_mutex); + lock_chunks(root); + /* first we search for all of the device items, and then we * read in all of the chunk items. This way we can create chunk * mappings that reference all of the devices that are afound @@ -3737,6 +3739,9 @@ again: } ret = 0; error: + unlock_chunks(root); + mutex_unlock(&uuid_mutex); + btrfs_free_path(path); return ret; } -- cgit From 6fef8df1dcb9b586268caff66df1d71ce8610132 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: get rid of *_alloc_profile fields {data,metadata,system}_alloc_profile fields have been unused for a long time now. Get rid of them. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 3 --- fs/btrfs/disk-io.c | 3 --- fs/btrfs/extent-tree.c | 10 ++++------ fs/btrfs/volumes.c | 6 ++---- 4 files changed, 6 insertions(+), 16 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 67385033323d..f5434ad49b99 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1135,9 +1135,6 @@ struct btrfs_fs_info { u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; - u64 data_alloc_profile; - u64 metadata_alloc_profile; - u64 system_alloc_profile; unsigned data_chunk_allocations; unsigned metadata_ratio; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f9d5551e582..ce9d0fb3627d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2321,9 +2321,6 @@ retry_root_backup: fs_info->generation = generation; fs_info->last_trans_committed = generation; - fs_info->data_alloc_profile = (u64)-1; - fs_info->metadata_alloc_profile = (u64)-1; - fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; ret = btrfs_init_space_info(fs_info); if (ret) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8603ee4e3dfd..f0591fd66249 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3067,14 +3067,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits & - root->fs_info->data_alloc_profile; + flags |= root->fs_info->avail_data_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits & - root->fs_info->system_alloc_profile; + flags |= root->fs_info->avail_system_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits & - root->fs_info->metadata_alloc_profile; + flags |= root->fs_info->avail_metadata_alloc_bits; + return btrfs_reduce_alloc_profile(root, flags); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839fd3c9d..89096f6d6fb4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2752,8 +2752,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, return ret; alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - (fs_info->metadata_alloc_profile & - fs_info->avail_metadata_alloc_bits); + fs_info->avail_metadata_alloc_bits; alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, @@ -2763,8 +2762,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, sys_chunk_offset = chunk_offset + chunk_size; alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - (fs_info->system_alloc_profile & - fs_info->avail_system_alloc_bits); + fs_info->avail_system_alloc_bits; alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, -- cgit From 52ba692972532f8d652080214b6599ece3dd51b9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: introduce masks for chunk type and profile Chunk's type and profile are encoded in u64 flags field. Introduce masks to easily access them. Also fix the type of BTRFS_BLOCK_GROUP_* constants, it should be ULL. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 26 +++++++++++++++++--------- fs/btrfs/extent-tree.c | 12 +++--------- fs/btrfs/volumes.c | 11 ++--------- 3 files changed, 22 insertions(+), 27 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f5434ad49b99..4370a56fe81a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -751,15 +751,23 @@ struct btrfs_csum_item { } __attribute__ ((__packed__)); /* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1 << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) -#define BTRFS_BLOCK_GROUP_DUP (1 << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) -#define BTRFS_NR_RAID_TYPES 5 - +#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) +#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_NR_RAID_TYPES 5 + +#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ + BTRFS_BLOCK_GROUP_SYSTEM | \ + BTRFS_BLOCK_GROUP_METADATA) + +#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID10) struct btrfs_block_group_item { __le64 used; __le64 chunk_objectid; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f0591fd66249..a8d8204188d1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, struct list_head *head = &info->space_info; struct btrfs_space_info *found; - flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | - BTRFS_BLOCK_GROUP_METADATA; + flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { @@ -2993,9 +2992,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); - found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | - BTRFS_BLOCK_GROUP_SYSTEM | - BTRFS_BLOCK_GROUP_METADATA); + found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; found->total_bytes = total_bytes; found->disk_total = total_bytes * factor; found->bytes_used = bytes_used; @@ -3016,10 +3013,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { - u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP); + u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK; if (extra_flags) { if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits |= extra_flags; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 89096f6d6fb4..d5fdee56777f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2949,12 +2949,8 @@ again: } } if (rw & REQ_DISCARD) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) stripes_required = map->num_stripes; - } } if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && stripes_allocated < stripes_required) { @@ -2978,10 +2974,7 @@ again: if (rw & REQ_DISCARD) *length = min_t(u64, em->len - offset, *length); - else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) { + else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { /* we limit the length of each bio to what fits in a stripe */ *length = min_t(u64, em->len - offset, map->stripe_len - stripe_offset); -- cgit From c9e9f97bdfb64d06e9520f8e4f37674ac21cc9bc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: add basic restriper infrastructure Add basic restriper infrastructure: extended balancing ioctl and all related ioctl data structures, add data structure for tracking restriper's state to fs_info, etc. The semantics of the old balancing ioctl are fully preserved. Explicitly disallow any volume operations when balance is in progress. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 6 +++ fs/btrfs/disk-io.c | 4 ++ fs/btrfs/ioctl.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/btrfs/ioctl.h | 43 +++++++++++++++++ fs/btrfs/volumes.c | 120 +++++++++++++++++++++++++++++++++++++---------- fs/btrfs/volumes.h | 14 +++++- 6 files changed, 281 insertions(+), 41 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3f8f11e18b53..c4d98c8df5c5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -934,6 +934,7 @@ struct btrfs_block_group_cache { struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; +struct btrfs_balance_control; struct btrfs_delayed_root; struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -1159,6 +1160,11 @@ struct btrfs_fs_info { u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; + /* restriper state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + struct btrfs_balance_control *balance_ctl; + unsigned data_chunk_allocations; unsigned metadata_ratio; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ce9d0fb3627d..190a1b24c902 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2002,6 +2002,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + fs_info->balance_ctl = NULL; + sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); sb->s_bdi = &fs_info->bdi; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c04f02c7d5bb..d838d2cfb947 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1203,13 +1203,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { @@ -1226,7 +1234,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; - goto out_unlock; + goto out_free; } if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; @@ -1241,7 +1249,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, new_size = memparse(sizestr, NULL); if (new_size == 0) { ret = -EINVAL; - goto out_unlock; + goto out_free; } } @@ -1250,7 +1258,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; - goto out_unlock; + goto out_free; } new_size = old_size - new_size; } else if (mod > 0) { @@ -1259,11 +1267,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (new_size < 256 * 1024 * 1024) { ret = -EINVAL; - goto out_unlock; + goto out_free; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; - goto out_unlock; + goto out_free; } do_div(new_size, root->sectorsize); @@ -1276,7 +1284,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_unlock; + goto out_free; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); @@ -1284,9 +1292,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ret = btrfs_shrink_device(device, new_size); } -out_unlock: - mutex_unlock(&root->fs_info->volume_mutex); +out_free: kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2052,14 +2061,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -2074,14 +2094,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + mutex_lock(&root->fs_info->volume_mutex); + if (root->fs_info->balance_ctl) { + printk(KERN_INFO "btrfs: balance in progress\n"); + ret = -EINVAL; + goto out; + } + vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); kfree(vol_args); +out: + mutex_unlock(&root->fs_info->volume_mutex); return ret; } @@ -3034,6 +3065,76 @@ out: return ret; } +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + bargs->flags = bctl->flags; + + memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); + memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); + memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); +} + +static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + struct btrfs_balance_control *bctl; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (arg) { + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + goto out; + } + } else { + bargs = NULL; + } + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out_bargs; + } + + bctl->fs_info = fs_info; + if (arg) { + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + + bctl->flags = bargs->flags; + } + + ret = btrfs_balance(bctl, bargs); + /* + * bctl is freed in __cancel_balance + */ + if (arg) { + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + } + +out_bargs: + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3078,7 +3179,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_balance(root->fs_info->dev_root); + return btrfs_ioctl_balance(root, NULL); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: @@ -3110,6 +3211,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); + case BTRFS_IOC_BALANCE_V2: + return btrfs_ioctl_balance(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 252ae9915de8..c8b37d2c0d77 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -109,6 +109,47 @@ struct btrfs_ioctl_fs_info_args { __u64 reserved[124]; /* pad to 1k */ }; +/* + * this is packed, because it should be exactly the same as its disk + * byte order counterpart (struct btrfs_disk_balance_args) + */ +struct btrfs_balance_args { + __u64 profiles; + __u64 usage; + __u64 devid; + __u64 pstart; + __u64 pend; + __u64 vstart; + __u64 vend; + + __u64 target; + + __u64 flags; + + __u64 unused[8]; +} __attribute__ ((__packed__)); + +/* report balance progress to userspace */ +struct btrfs_balance_progress { + __u64 expected; /* estimated # of chunks that will be + * relocated to fulfill the request */ + __u64 considered; /* # of chunks we have considered so far */ + __u64 completed; /* # of chunks relocated so far */ +}; + +struct btrfs_ioctl_balance_args { + __u64 flags; /* in/out */ + __u64 state; /* out */ + + struct btrfs_balance_args data; /* in/out */ + struct btrfs_balance_args meta; /* in/out */ + struct btrfs_balance_args sys; /* in/out */ + + struct btrfs_balance_progress stat; /* out */ + + __u64 unused[72]; /* pad to 1k */ +}; + #define BTRFS_INO_LOOKUP_PATH_MAX 4080 struct btrfs_ioctl_ino_lookup_args { __u64 treeid; @@ -272,6 +313,8 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ + struct btrfs_ioctl_balance_args) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d5fdee56777f..9fc06e6bc51a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1282,7 +1282,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) bool clear_super = false; mutex_lock(&uuid_mutex); - mutex_lock(&root->fs_info->volume_mutex); all_avail = root->fs_info->avail_data_alloc_bits | root->fs_info->avail_system_alloc_bits | @@ -1452,7 +1451,6 @@ error_close: if (bdev) blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: - mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; error_undo: @@ -1629,7 +1627,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } filemap_write_and_wait(bdev->bd_inode->i_mapping); - mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; /* @@ -1757,8 +1754,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); BUG_ON(ret); } -out: - mutex_unlock(&root->fs_info->volume_mutex); + return ret; error: blkdev_put(bdev, FMODE_EXCL); @@ -1766,7 +1762,7 @@ error: mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } - goto out; + return ret; } static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, @@ -2077,6 +2073,35 @@ error: return ret; } +/* + * Should be called with both balance and volume mutexes held to + * serialize other volume operations (add_dev/rm_dev/resize) with + * restriper. Same goes for unset_balance_control. + */ +static void set_balance_control(struct btrfs_balance_control *bctl) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + + BUG_ON(fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = bctl; + spin_unlock(&fs_info->balance_lock); +} + +static void unset_balance_control(struct btrfs_fs_info *fs_info) +{ + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + BUG_ON(!fs_info->balance_ctl); + + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl = NULL; + spin_unlock(&fs_info->balance_lock); + + kfree(bctl); +} + static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -2086,29 +2111,23 @@ static u64 div_factor(u64 num, int factor) return num; } -int btrfs_balance(struct btrfs_root *dev_root) +static int __btrfs_balance(struct btrfs_fs_info *fs_info) { - int ret; - struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_root *chunk_root = fs_info->chunk_root; + struct btrfs_root *dev_root = fs_info->dev_root; + struct list_head *devices; struct btrfs_device *device; u64 old_size; u64 size_to_free; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; - struct btrfs_trans_handle *trans; struct btrfs_key found_key; - - if (dev_root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&dev_root->fs_info->volume_mutex); - dev_root = dev_root->fs_info->dev_root; + struct btrfs_trans_handle *trans; + int ret; + int enospc_errors = 0; /* step one make some room on all the devices */ + devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { old_size = device->total_bytes; size_to_free = div_factor(old_size, 1); @@ -2151,12 +2170,14 @@ int btrfs_balance(struct btrfs_root *dev_root) * failed */ if (ret == 0) - break; + BUG(); /* FIXME break ? */ ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) + if (ret) { + ret = 0; break; + } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -2174,12 +2195,63 @@ int btrfs_balance(struct btrfs_root *dev_root) found_key.offset); if (ret && ret != -ENOSPC) goto error; + if (ret == -ENOSPC) + enospc_errors++; key.offset = found_key.offset - 1; } - ret = 0; + error: btrfs_free_path(path); - mutex_unlock(&dev_root->fs_info->volume_mutex); + if (enospc_errors) { + printk(KERN_INFO "btrfs: %d enospc errors during balance\n", + enospc_errors); + if (!ret) + ret = -ENOSPC; + } + + return ret; +} + +static void __cancel_balance(struct btrfs_fs_info *fs_info) +{ + unset_balance_control(fs_info); +} + +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_args *bargs); + +/* + * Should be called with both balance and volume mutexes held + */ +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs) +{ + struct btrfs_fs_info *fs_info = bctl->fs_info; + int ret; + + if (btrfs_fs_closing(fs_info)) { + ret = -EINVAL; + goto out; + } + + set_balance_control(bctl); + + mutex_unlock(&fs_info->balance_mutex); + + ret = __btrfs_balance(fs_info); + + mutex_lock(&fs_info->balance_mutex); + + if (bargs) { + memset(bargs, 0, sizeof(*bargs)); + update_ioctl_balance_args(fs_info, bargs); + } + + __cancel_balance(fs_info); + + return ret; +out: + kfree(bctl); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 78f2d4d4f37f..882582385283 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -186,6 +186,17 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) +struct btrfs_balance_args; +struct btrfs_balance_control { + struct btrfs_fs_info *fs_info; + + struct btrfs_balance_args data; + struct btrfs_balance_args meta; + struct btrfs_balance_args sys; + + u64 flags; +}; + int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); @@ -228,7 +239,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -int btrfs_balance(struct btrfs_root *dev_root); +int btrfs_balance(struct btrfs_balance_control *bctl, + struct btrfs_ioctl_balance_args *bargs); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, -- cgit From f43ffb60fd94e98be02780944e182ade6653b916 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: add basic infrastructure for selective balancing This allows to have a separate set of filters for each chunk type (data,meta,sys). The code however is generic and switch on chunk type is only done once. This commit also adds a type filter: it allows to balance for example meta and system chunks w/o touching data ones. Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 3 +++ fs/btrfs/volumes.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/volumes.h | 11 ++++++++++ 3 files changed, 71 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d838d2cfb947..29b3a94933f0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3116,6 +3116,9 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); bctl->flags = bargs->flags; + } else { + /* balance everything - no filters */ + bctl->flags |= BTRFS_BALANCE_TYPE_MASK; } ret = btrfs_balance(bctl, bargs); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9fc06e6bc51a..91bbf6e774c0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2102,6 +2102,30 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info) kfree(bctl); } +static int should_balance_chunk(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 chunk_offset) +{ + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + struct btrfs_balance_args *bargs = NULL; + u64 chunk_type = btrfs_chunk_type(leaf, chunk); + + /* type filter */ + if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & + (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { + return 0; + } + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + bargs = &bctl->data; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + bargs = &bctl->sys; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + bargs = &bctl->meta; + + return 1; +} + static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -2119,10 +2143,13 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) struct btrfs_device *device; u64 old_size; u64 size_to_free; + struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + int slot; int ret; int enospc_errors = 0; @@ -2179,8 +2206,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) break; } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != key.objectid) break; @@ -2188,7 +2217,14 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) if (found_key.offset == 0) break; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + + ret = should_balance_chunk(chunk_root, leaf, chunk, + found_key.offset); btrfs_release_path(path); + if (!ret) + goto loop; + ret = btrfs_relocate_chunk(chunk_root, chunk_root->root_key.objectid, found_key.objectid, @@ -2197,6 +2233,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) goto error; if (ret == -ENOSPC) enospc_errors++; +loop: key.offset = found_key.offset - 1; } @@ -2227,6 +2264,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 allowed; int ret; if (btrfs_fs_closing(fs_info)) { @@ -2234,6 +2272,23 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } + /* + * In case of mixed groups both data and meta should be picked, + * and identical options should be given for both of them. + */ + allowed = btrfs_super_incompat_flags(fs_info->super_copy); + if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && + (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) { + if (!(bctl->flags & BTRFS_BALANCE_DATA) || + !(bctl->flags & BTRFS_BALANCE_METADATA) || + memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { + printk(KERN_ERR "btrfs: with mixed groups data and " + "metadata balance options must be the same\n"); + ret = -EINVAL; + goto out; + } + } + set_balance_control(bctl); mutex_unlock(&fs_info->balance_mutex); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 882582385283..003e54216069 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -186,6 +186,17 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) +/* + * Restriper's general type filter + */ +#define BTRFS_BALANCE_DATA (1ULL << 0) +#define BTRFS_BALANCE_SYSTEM (1ULL << 1) +#define BTRFS_BALANCE_METADATA (1ULL << 2) + +#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ + BTRFS_BALANCE_SYSTEM | \ + BTRFS_BALANCE_METADATA) + struct btrfs_balance_args; struct btrfs_balance_control { struct btrfs_fs_info *fs_info; -- cgit From ed25e9b26f898d8d63ae4a836489f1923534143b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: profiles filter Select chunks based on a given profile mask. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 24 ++++++++++++++++++++++++ fs/btrfs/volumes.h | 4 ++++ 2 files changed, 28 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 91bbf6e774c0..447bd422d867 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2102,6 +2102,24 @@ static void unset_balance_control(struct btrfs_fs_info *fs_info) kfree(bctl); } +/* + * Balance filters. Return 1 if chunk should be filtered out + * (should not be balanced). + */ +static int chunk_profiles_filter(u64 chunk_profile, + struct btrfs_balance_args *bargs) +{ + chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (chunk_profile == 0) + chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (bargs->profiles & chunk_profile) + return 0; + + return 1; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2123,6 +2141,12 @@ static int should_balance_chunk(struct btrfs_root *root, else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) bargs = &bctl->meta; + /* profiles filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && + chunk_profiles_filter(chunk_type, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 003e54216069..fb20d7740440 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -196,6 +196,10 @@ struct map_lookup { #define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ BTRFS_BALANCE_SYSTEM | \ BTRFS_BALANCE_METADATA) +/* + * Balance filters + */ +#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit From 5ce5b3c0916ba3a2e34cf648b94044adc5ef9e76 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: usage filter Select chunks that are less than X percent full. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 36 ++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 37 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 447bd422d867..b858242374db 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2120,6 +2120,36 @@ static int chunk_profiles_filter(u64 chunk_profile, return 1; } +static u64 div_factor_fine(u64 num, int factor) +{ + if (factor <= 0) + return 0; + if (factor >= 100) + return num; + + num *= factor; + do_div(num, 100); + return num; +} + +static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; + u64 chunk_used, user_thresh; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (chunk_used < user_thresh) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2147,6 +2177,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* usage filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && + chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index fb20d7740440..eee77fcf812a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -200,6 +200,7 @@ struct map_lookup { * Balance filters */ #define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) +#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit From 409d404b461afa9738619f249fd7f62a366b68c2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:47 +0200 Subject: Btrfs: devid filter Relocate chunks which have at least one stripe located on a device with devid X. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 23 +++++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 24 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b858242374db..9ff5cd09a256 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2150,6 +2150,23 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, return ret; } +static int chunk_devid_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + int i; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) + return 0; + } + + return 1; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2183,6 +2200,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && + chunk_devid_filter(leaf, chunk, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index eee77fcf812a..7cfec03c29b8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -201,6 +201,7 @@ struct map_lookup { */ #define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) #define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) +#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit From 94e60d5a5c4b98a32b1077dec88df09ada712376 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: devid subset filter Select chunks which have at least one byte of at least one stripe located on a device with devid X in a given [pstart,pend) physical address range. This filter only works when devid filter is turned on. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 47 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9ff5cd09a256..c60071a448a5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2167,6 +2167,46 @@ static int chunk_devid_filter(struct extent_buffer *leaf, return 1; } +/* [pstart, pend) */ +static int chunk_drange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + struct btrfs_stripe *stripe; + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + u64 stripe_offset; + u64 stripe_length; + int factor; + int i; + + if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) + return 0; + + if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; + factor = num_stripes / factor; + + for (i = 0; i < num_stripes; i++) { + stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) + continue; + + stripe_offset = btrfs_stripe_offset(leaf, stripe); + stripe_length = btrfs_chunk_length(leaf, chunk); + do_div(stripe_length, factor); + + if (stripe_offset < bargs->pend && + stripe_offset + stripe_length > bargs->pstart) + return 0; + } + + return 1; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2206,6 +2246,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* drange filter, makes sense only with devid filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && + chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7cfec03c29b8..844b08e388f2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -202,6 +202,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) #define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) #define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) +#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit From ea67176ae8c024f64d85ec33873e5eadf1af7247 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: virtual address space subset filter Select chunks which have at least one byte located inside a given [vstart, vend) virtual address space range. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 20 ++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 21 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c60071a448a5..e86c9e4fe51e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2207,6 +2207,20 @@ static int chunk_drange_filter(struct extent_buffer *leaf, return 1; } +/* [vstart, vend) */ +static int chunk_vrange_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + u64 chunk_offset, + struct btrfs_balance_args *bargs) +{ + if (chunk_offset < bargs->vend && + chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) + /* at least part of the chunk is inside this vrange */ + return 0; + + return 1; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2252,6 +2266,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* vrange filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && + chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 844b08e388f2..eac26c359312 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -203,6 +203,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) #define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) +#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit From e4d8ec0f65b91bfb4984a4927632ded95f9825ad Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: implement online profile changing Profile changing is done by launching a balance with BTRFS_BALANCE_CONVERT bits set and target fields of respective btrfs_balance_args structs initialized. Profile reducing code in this case will pick restriper's target profile if it's available instead of doing a blind reduce. If target profile is not yet available it goes back to a plain reduce. Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 56 +++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 5 ++++ 3 files changed, 129 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a1a18ea7b6c6..e6a832e3e647 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3030,7 +3030,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) /* * @flags: available profiles in extended format (see ctree.h) * - * Returns reduced profile in chunk format. + * Returns reduced profile in chunk format. If profile changing is in + * progress (either running or paused) picks the target profile (if it's + * already available), otherwise falls back to plain reducing. */ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { @@ -3042,6 +3044,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; + /* pick restriper's target profile if it's available */ + spin_lock(&root->fs_info->balance_lock); + if (root->fs_info->balance_ctl) { + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + u64 tgt = 0; + + if ((flags & BTRFS_BLOCK_GROUP_DATA) && + (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->data.target)) { + tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) && + (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->sys.target)) { + tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) && + (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (flags & bctl->meta.target)) { + tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + if (tgt) { + spin_unlock(&root->fs_info->balance_lock); + flags = tgt; + goto out; + } + } + spin_unlock(&root->fs_info->balance_lock); + if (num_devices == 1) flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); if (num_devices < 4) @@ -3065,6 +3095,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) flags &= ~BTRFS_BLOCK_GROUP_RAID0; } +out: /* extended -> chunk profile */ flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; return flags; @@ -6795,6 +6826,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + if (root->fs_info->balance_ctl) { + struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; + u64 tgt = 0; + + /* pick restriper's target profile and return */ + if (flags & BTRFS_BLOCK_GROUP_DATA && + bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; + } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && + bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; + } else if (flags & BTRFS_BLOCK_GROUP_METADATA && + bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { + tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; + } + + if (tgt) { + /* extended -> chunk profile */ + tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; + return tgt; + } + } + /* * we add in the count of missing devices because we want * to make sure that any RAID levels on a degraded FS diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e86c9e4fe51e..f08210e83339 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2438,6 +2438,75 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } + /* + * Profile changing sanity checks. Skip them if a simple + * balance is requested. + */ + if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) & + BTRFS_BALANCE_ARGS_CONVERT)) + goto do_balance; + + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + if (fs_info->fs_devices->num_devices == 1) + allowed |= BTRFS_BLOCK_GROUP_DUP; + else if (fs_info->fs_devices->num_devices < 4) + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); + else + allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10); + + if (!profile_is_valid(bctl->data.target, 1) || + bctl->data.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "data profile %llu\n", + (unsigned long long)bctl->data.target); + ret = -EINVAL; + goto out; + } + if (!profile_is_valid(bctl->meta.target, 1) || + bctl->meta.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "metadata profile %llu\n", + (unsigned long long)bctl->meta.target); + ret = -EINVAL; + goto out; + } + if (!profile_is_valid(bctl->sys.target, 1) || + bctl->sys.target & ~allowed) { + printk(KERN_ERR "btrfs: unable to start balance with target " + "system profile %llu\n", + (unsigned long long)bctl->sys.target); + ret = -EINVAL; + goto out; + } + + if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) { + printk(KERN_ERR "btrfs: dup for data is not allowed\n"); + ret = -EINVAL; + goto out; + } + + /* allow to reduce meta or sys integrity only if force set */ + allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10; + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + printk(KERN_INFO "btrfs: force reducing metadata " + "integrity\n"); + } else { + printk(KERN_ERR "btrfs: balance will reduce metadata " + "integrity, use force if you want this\n"); + ret = -EINVAL; + goto out; + } + } + +do_balance: set_balance_control(bctl); mutex_unlock(&fs_info->balance_mutex); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index eac26c359312..79ee9c324562 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -196,6 +196,9 @@ struct map_lookup { #define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ BTRFS_BALANCE_SYSTEM | \ BTRFS_BALANCE_METADATA) + +#define BTRFS_BALANCE_FORCE (1ULL << 3) + /* * Balance filters */ @@ -205,6 +208,8 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) + struct btrfs_balance_args; struct btrfs_balance_control { struct btrfs_fs_info *fs_info; -- cgit From cfa4c961cc69ffb7bda450972320a25cbd413e19 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: soft profile changing mode (aka soft convert) When doing convert from one profile to another if soft mode is on restriper won't touch chunks that already have the profile we are converting to. This is useful if e.g. half of the FS was converted earlier. The soft mode switch is (like every other filter) per-type. This means that we can convert for example meta chunks the "hard" way while converting data chunks selectively with soft switch. Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 23 +++++++++++++++++++++++ fs/btrfs/volumes.h | 6 ++++++ 2 files changed, 29 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f08210e83339..98b4067017ff 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2221,6 +2221,23 @@ static int chunk_vrange_filter(struct extent_buffer *leaf, return 1; } +static int chunk_soft_convert_filter(u64 chunk_profile, + struct btrfs_balance_args *bargs) +{ + if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) + return 0; + + chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (chunk_profile == 0) + chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + if (bargs->target & chunk_profile) + return 1; + + return 0; +} + static int should_balance_chunk(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 chunk_offset) @@ -2272,6 +2289,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* soft profile changing mode */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && + chunk_soft_convert_filter(chunk_type, bargs)) { + return 0; + } + return 1; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 79ee9c324562..6c143c98017a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -208,7 +208,13 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +/* + * Profile changing flags. When SOFT is set we won't relocate chunk if + * it already has the target profile (even though it may be + * half-filled). + */ #define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) +#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) struct btrfs_balance_args; struct btrfs_balance_control { -- cgit From 0940ebf6b92ea10a6f30ae5ac3993a3b75745da6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: save balance parameters to disk Introduce a new btree objectid for storing balance item. The reason is to be able to resume restriper after a crash with the same parameters. Balance item has a very high objectid and goes into tree of tree roots. The key for the new item is as follows: [ BTRFS_BALANCE_OBJECTID ; BTRFS_BALANCE_ITEM_KEY ; 0 ] Older kernels simply ignore it so it's safe to mount with an older kernel and then go back to the newer one. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 99 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1e7aea60da2b..9997a59e4f58 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -86,6 +86,9 @@ struct btrfs_ordered_sum; /* holds checksums of all the data extents */ #define BTRFS_CSUM_TREE_OBJECTID 7ULL +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -692,6 +695,54 @@ struct btrfs_root_ref { __le16 name_len; } __attribute__ ((__packed__)); +struct btrfs_disk_balance_args { + /* + * profiles to operate on, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 profiles; + + /* usage filter */ + __le64 usage; + + /* devid filter */ + __le64 devid; + + /* devid subset filter [pstart..pend) */ + __le64 pstart; + __le64 pend; + + /* btrfs virtual address space subset filter [vstart..vend) */ + __le64 vstart; + __le64 vend; + + /* + * profile to convert to, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 target; + + /* BTRFS_BALANCE_ARGS_* */ + __le64 flags; + + __le64 unused[8]; +} __attribute__ ((__packed__)); + +/* + * store balance parameters to disk so that balance can be properly + * resumed after crash or unmount + */ +struct btrfs_balance_item { + /* BTRFS_BALANCE_* */ + __le64 flags; + + struct btrfs_disk_balance_args data; + struct btrfs_disk_balance_args meta; + struct btrfs_disk_balance_args sys; + + __le64 unused[4]; +} __attribute__ ((__packed__)); + #define BTRFS_FILE_EXTENT_INLINE 0 #define BTRFS_FILE_EXTENT_REG 1 #define BTRFS_FILE_EXTENT_PREALLOC 2 @@ -1409,6 +1460,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 +#define BTRFS_BALANCE_ITEM_KEY 248 + /* * string items are for debugging. They just store a short string of * data in the FS @@ -2103,8 +2156,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); -/* struct btrfs_super_block */ +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); + +static inline void btrfs_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} +static inline void +btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); +} + +static inline void +btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, + struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); +} + +/* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 98b4067017ff..4c60ca0ae90b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2073,6 +2073,97 @@ error: return ret; } +static int insert_balance_item(struct btrfs_root *root, + struct btrfs_balance_control *bctl) +{ + struct btrfs_trans_handle *trans; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); + btrfs_set_balance_data(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); + btrfs_set_balance_meta(leaf, item, &disk_bargs); + btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); + btrfs_set_balance_sys(leaf, item, &disk_bargs); + + btrfs_set_balance_flags(leaf, item, bctl->flags); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + +static int del_balance_item(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key key; + int ret, err; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + return ret; +} + /* * Should be called with both balance and volume mutexes held to * serialize other volume operations (add_dev/rm_dev/resize) with @@ -2423,7 +2514,11 @@ error: static void __cancel_balance(struct btrfs_fs_info *fs_info) { + int ret; + unset_balance_control(fs_info); + ret = del_balance_item(fs_info->tree_root); + BUG_ON(ret); } void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, @@ -2530,6 +2625,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } do_balance: + ret = insert_balance_item(fs_info->tree_root, bctl); + if (ret) + goto out; + set_balance_control(bctl); mutex_unlock(&fs_info->balance_mutex); -- cgit From 596410151ed71819b9e8a8018c6c9992796b256d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: recover balance on mount On mount, if balance item is found, resume balance in a separate kernel thread. Try to be smart to continue roughly where previous balance (or convert) was interrupted. For chunk types that were being converted to some profile we turn on soft convert, in case of a simple balance we turn on usage filter and relocate only less-than-90%-full chunks of that type. These are just heuristics but they help quite a bit, and can be improved in future. Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 4 ++ fs/btrfs/volumes.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | 2 + 3 files changed, 139 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 190a1b24c902..eb7a11ac5b73 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2427,6 +2427,10 @@ retry_root_backup: if (!err) err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); + + if (!err) + err = btrfs_recover_balance(fs_info->tree_root); + if (err) { close_ctree(tree_root); return ERR_PTR(err); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c60ca0ae90b..17e565388de0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "compat.h" #include "ctree.h" @@ -2164,6 +2165,46 @@ out: return ret; } +/* + * This is a heuristic used to reduce the number of chunks balanced on + * resume after balance was interrupted. + */ +static void update_balance_args(struct btrfs_balance_control *bctl) +{ + /* + * Turn on soft mode for chunk types that were being converted. + */ + if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; + if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) + bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; + + /* + * Turn on usage filter if is not already used. The idea is + * that chunks that we have already balanced should be + * reasonably full. Don't do it for chunks that are being + * converted - that will keep us from relocating unconverted + * (albeit full) chunks. + */ + if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->data.usage = 90; + } + if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->sys.usage = 90; + } + if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { + bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; + bctl->meta.usage = 90; + } +} + /* * Should be called with both balance and volume mutexes held to * serialize other volume operations (add_dev/rm_dev/resize) with @@ -2626,10 +2667,18 @@ int btrfs_balance(struct btrfs_balance_control *bctl, do_balance: ret = insert_balance_item(fs_info->tree_root, bctl); - if (ret) + if (ret && ret != -EEXIST) goto out; - set_balance_control(bctl); + if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { + BUG_ON(ret == -EEXIST); + set_balance_control(bctl); + } else { + BUG_ON(ret != -EEXIST); + spin_lock(&fs_info->balance_lock); + update_balance_args(bctl); + spin_unlock(&fs_info->balance_lock); + } mutex_unlock(&fs_info->balance_mutex); @@ -2646,7 +2695,89 @@ do_balance: return ret; out: + if (bctl->flags & BTRFS_BALANCE_RESUME) + __cancel_balance(fs_info); + else + kfree(bctl); + return ret; +} + +static int balance_kthread(void *data) +{ + struct btrfs_balance_control *bctl = + (struct btrfs_balance_control *)data; + struct btrfs_fs_info *fs_info = bctl->fs_info; + int ret; + + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + set_balance_control(bctl); + + printk(KERN_INFO "btrfs: continuing balance\n"); + ret = btrfs_balance(bctl, NULL); + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + return ret; +} + +int btrfs_recover_balance(struct btrfs_root *tree_root) +{ + struct task_struct *tsk; + struct btrfs_balance_control *bctl; + struct btrfs_balance_item *item; + struct btrfs_disk_balance_args disk_bargs; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + if (!bctl) { + ret = -ENOMEM; + goto out; + } + + key.objectid = BTRFS_BALANCE_OBJECTID; + key.type = BTRFS_BALANCE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) + goto out_bctl; + if (ret > 0) { /* ret = -ENOENT; */ + ret = 0; + goto out_bctl; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); + + bctl->fs_info = tree_root->fs_info; + bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; + + btrfs_balance_data(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); + btrfs_balance_meta(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); + btrfs_balance_sys(leaf, item, &disk_bargs); + btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + + tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); + if (IS_ERR(tsk)) + ret = PTR_ERR(tsk); + else + goto out; + +out_bctl: kfree(bctl); +out: + btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6c143c98017a..cd25ea58ec35 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -198,6 +198,7 @@ struct map_lookup { BTRFS_BALANCE_METADATA) #define BTRFS_BALANCE_FORCE (1ULL << 3) +#define BTRFS_BALANCE_RESUME (1ULL << 4) /* * Balance filters @@ -271,6 +272,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); +int btrfs_recover_balance(struct btrfs_root *tree_root); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, -- cgit From 9555c6c180600b40f6e86bd4dc53bf47e06ed663 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:48 +0200 Subject: Btrfs: add skip_balance mount option Since restriper kthread starts involuntarily on mount and can suck cpu and memory bandwidth add a mount option to forcefully skip it. The restriper in that case hangs around in paused state and can be resumed from userspace when it's convenient. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 1 + fs/btrfs/super.c | 11 +++++++++-- fs/btrfs/volumes.c | 10 +++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9997a59e4f58..99eb2bcd9aa7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1492,6 +1492,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define BTRFS_MOUNT_RECOVERY (1 << 18) +#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 34a8b6112ea4..063b521e3ded 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -164,8 +164,9 @@ enum { Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, + Opt_no_space_cache, Opt_recovery, Opt_skip_balance, + Opt_err, }; static match_table_t tokens = { @@ -200,6 +201,7 @@ static match_table_t tokens = { {Opt_inode_cache, "inode_cache"}, {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, + {Opt_skip_balance, "skip_balance"}, {Opt_err, NULL}, }; @@ -398,6 +400,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto recovery"); btrfs_set_opt(info->mount_opt, RECOVERY); break; + case Opt_skip_balance: + btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -723,6 +728,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(root, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); + if (btrfs_test_opt(root, SKIP_BALANCE)) + seq_puts(seq, ",skip_balance"); return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 17e565388de0..e0160607e6e2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2707,15 +2707,19 @@ static int balance_kthread(void *data) struct btrfs_balance_control *bctl = (struct btrfs_balance_control *)data; struct btrfs_fs_info *fs_info = bctl->fs_info; - int ret; + int ret = 0; mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); set_balance_control(bctl); - printk(KERN_INFO "btrfs: continuing balance\n"); - ret = btrfs_balance(bctl, NULL); + if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + printk(KERN_INFO "btrfs: force skipping balance\n"); + } else { + printk(KERN_INFO "btrfs: continuing balance\n"); + ret = btrfs_balance(bctl, NULL); + } mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); -- cgit From 837d5b6e46d1a4af5b6cc8f2fe83cb5de79a2961 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:49 +0200 Subject: Btrfs: allow for pausing restriper Implement an ioctl for pausing restriper. This pauses the relocation, but balance is still considered to be "in progress": balance item is not deleted, other volume operations cannot be started, etc. If paused in the middle of profile changing operation we will continue making allocations with the target profile. Add a hook to close_ctree() to pause restriper and free its data structures on unmount. (It's safe to unmount when restriper is in "paused" state, we will resume with the same parameters on the next mount) Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 4 ++++ fs/btrfs/disk-io.c | 6 ++++++ fs/btrfs/ioctl.c | 28 +++++++++++++++++++++++++++- fs/btrfs/ioctl.h | 7 +++++++ fs/btrfs/volumes.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/volumes.h | 1 + 6 files changed, 94 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 99eb2bcd9aa7..1afda75d5414 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1214,7 +1214,10 @@ struct btrfs_fs_info { /* restriper state */ spinlock_t balance_lock; struct mutex balance_mutex; + atomic_t balance_running; + atomic_t balance_pause_req; struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; unsigned data_chunk_allocations; unsigned metadata_ratio; @@ -2658,6 +2661,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) } static inline void free_fs_info(struct btrfs_fs_info *fs_info) { + kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); kfree(fs_info->extent_root); kfree(fs_info->tree_root); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eb7a11ac5b73..8ce837407800 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2004,7 +2004,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->balance_lock); mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_running, 0); + atomic_set(&fs_info->balance_pause_req, 0); fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2980,6 +2983,9 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); + /* pause restriper - we want to resume on mount */ + btrfs_pause_balance(root->fs_info); + btrfs_scrub_cancel(root); /* wait for any defraggers to finish */ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 29b3a94933f0..f572c53dda4f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3072,6 +3072,11 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, bargs->flags = bctl->flags; + if (atomic_read(&fs_info->balance_running)) + bargs->state |= BTRFS_BALANCE_STATE_RUNNING; + if (atomic_read(&fs_info->balance_pause_req)) + bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; + memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); @@ -3103,6 +3108,11 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) bargs = NULL; } + if (fs_info->balance_ctl) { + ret = -EINPROGRESS; + goto out_bargs; + } + bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { ret = -ENOMEM; @@ -3123,7 +3133,8 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) ret = btrfs_balance(bctl, bargs); /* - * bctl is freed in __cancel_balance + * bctl is freed in __cancel_balance or in free_fs_info if + * restriper was paused all the way until unmount */ if (arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) @@ -3138,6 +3149,19 @@ out: return ret; } +static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case BTRFS_BALANCE_CTL_PAUSE: + return btrfs_pause_balance(root->fs_info); + } + + return -EINVAL; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3216,6 +3240,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_progress(root, argp); case BTRFS_IOC_BALANCE_V2: return btrfs_ioctl_balance(root, argp); + case BTRFS_IOC_BALANCE_CTL: + return btrfs_ioctl_balance_ctl(root, arg); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index c8b37d2c0d77..e972e11a8d77 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -109,6 +109,9 @@ struct btrfs_ioctl_fs_info_args { __u64 reserved[124]; /* pad to 1k */ }; +/* balance control ioctl modes */ +#define BTRFS_BALANCE_CTL_PAUSE 1 + /* * this is packed, because it should be exactly the same as its disk * byte order counterpart (struct btrfs_disk_balance_args) @@ -137,6 +140,9 @@ struct btrfs_balance_progress { __u64 completed; /* # of chunks relocated so far */ }; +#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) +#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) + struct btrfs_ioctl_balance_args { __u64 flags; /* in/out */ __u64 state; /* out */ @@ -315,6 +321,7 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_fs_info_args) #define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ struct btrfs_ioctl_balance_args) +#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e0160607e6e2..d32660ce753d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2492,6 +2492,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { + if (atomic_read(&fs_info->balance_pause_req)) { + ret = -ECANCELED; + goto error; + } + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; @@ -2553,6 +2558,11 @@ error: return ret; } +static inline int balance_need_close(struct btrfs_fs_info *fs_info) +{ + return atomic_read(&fs_info->balance_pause_req) == 0; +} + static void __cancel_balance(struct btrfs_fs_info *fs_info) { int ret; @@ -2575,7 +2585,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, u64 allowed; int ret; - if (btrfs_fs_closing(fs_info)) { + if (btrfs_fs_closing(fs_info) || + atomic_read(&fs_info->balance_pause_req)) { ret = -EINVAL; goto out; } @@ -2680,18 +2691,25 @@ do_balance: spin_unlock(&fs_info->balance_lock); } + atomic_inc(&fs_info->balance_running); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); + atomic_dec(&fs_info->balance_running); if (bargs) { memset(bargs, 0, sizeof(*bargs)); update_ioctl_balance_args(fs_info, bargs); } - __cancel_balance(fs_info); + if ((ret && ret != -ECANCELED && ret != -ENOSPC) || + balance_need_close(fs_info)) { + __cancel_balance(fs_info); + } + + wake_up(&fs_info->balance_wait_q); return ret; out: @@ -2785,6 +2803,35 @@ out: return ret; } +int btrfs_pause_balance(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + if (atomic_read(&fs_info->balance_running)) { + atomic_inc(&fs_info->balance_pause_req); + mutex_unlock(&fs_info->balance_mutex); + + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + + mutex_lock(&fs_info->balance_mutex); + /* we are good with balance_ctl ripped off from under us */ + BUG_ON(atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_pause_req); + } else { + ret = -ENOTCONN; + } + + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index cd25ea58ec35..80953afb12b9 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -273,6 +273,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); int btrfs_recover_balance(struct btrfs_root *tree_root); +int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, -- cgit From a7e99c691af553fc15ac46a51f130b7c59a65f76 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:49 +0200 Subject: Btrfs: allow for canceling restriper Implement an ioctl for canceling restriper. Currently we wait until relocation of the current block group is finished, in future this can be done by triggering a commit. Balance item is deleted and no memory about the interrupted balance is kept. Signed-off-by: Ilya Dryomov --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/ioctl.c | 4 ++++ fs/btrfs/ioctl.h | 2 ++ fs/btrfs/volumes.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/volumes.h | 1 + 6 files changed, 53 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1afda75d5414..dfc136cc07d7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1216,6 +1216,7 @@ struct btrfs_fs_info { struct mutex balance_mutex; atomic_t balance_running; atomic_t balance_pause_req; + atomic_t balance_cancel_req; struct btrfs_balance_control *balance_ctl; wait_queue_head_t balance_wait_q; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8ce837407800..c23b82d8ec08 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2006,6 +2006,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->balance_mutex); atomic_set(&fs_info->balance_running, 0); atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f572c53dda4f..60852217ce9a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3076,6 +3076,8 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, bargs->state |= BTRFS_BALANCE_STATE_RUNNING; if (atomic_read(&fs_info->balance_pause_req)) bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; + if (atomic_read(&fs_info->balance_cancel_req)) + bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); @@ -3157,6 +3159,8 @@ static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) switch (cmd) { case BTRFS_BALANCE_CTL_PAUSE: return btrfs_pause_balance(root->fs_info); + case BTRFS_BALANCE_CTL_CANCEL: + return btrfs_cancel_balance(root->fs_info); } return -EINVAL; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index e972e11a8d77..cd19d10794b9 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -111,6 +111,7 @@ struct btrfs_ioctl_fs_info_args { /* balance control ioctl modes */ #define BTRFS_BALANCE_CTL_PAUSE 1 +#define BTRFS_BALANCE_CTL_CANCEL 2 /* * this is packed, because it should be exactly the same as its disk @@ -142,6 +143,7 @@ struct btrfs_balance_progress { #define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) #define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) +#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) struct btrfs_ioctl_balance_args { __u64 flags; /* in/out */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d32660ce753d..c32667318ae4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2492,7 +2492,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { - if (atomic_read(&fs_info->balance_pause_req)) { + if (atomic_read(&fs_info->balance_pause_req) || + atomic_read(&fs_info->balance_cancel_req)) { ret = -ECANCELED; goto error; } @@ -2560,7 +2561,10 @@ error: static inline int balance_need_close(struct btrfs_fs_info *fs_info) { - return atomic_read(&fs_info->balance_pause_req) == 0; + /* cancel requested || normal exit path */ + return atomic_read(&fs_info->balance_cancel_req) || + (atomic_read(&fs_info->balance_pause_req) == 0 && + atomic_read(&fs_info->balance_cancel_req) == 0); } static void __cancel_balance(struct btrfs_fs_info *fs_info) @@ -2586,7 +2590,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, int ret; if (btrfs_fs_closing(fs_info) || - atomic_read(&fs_info->balance_pause_req)) { + atomic_read(&fs_info->balance_pause_req) || + atomic_read(&fs_info->balance_cancel_req)) { ret = -EINVAL; goto out; } @@ -2832,6 +2837,42 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info) return ret; } +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) +{ + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + mutex_unlock(&fs_info->balance_mutex); + return -ENOTCONN; + } + + atomic_inc(&fs_info->balance_cancel_req); + /* + * if we are running just wait and return, balance item is + * deleted in btrfs_balance in this case + */ + if (atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + wait_event(fs_info->balance_wait_q, + atomic_read(&fs_info->balance_running) == 0); + mutex_lock(&fs_info->balance_mutex); + } else { + /* __cancel_balance needs volume_mutex */ + mutex_unlock(&fs_info->balance_mutex); + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl) + __cancel_balance(fs_info); + + mutex_unlock(&fs_info->volume_mutex); + } + + BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); + atomic_dec(&fs_info->balance_cancel_req); + mutex_unlock(&fs_info->balance_mutex); + return 0; +} + /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 80953afb12b9..caa9abd218ef 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -274,6 +274,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); int btrfs_recover_balance(struct btrfs_root *tree_root); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); +int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, -- cgit From 19a39dce3b9bf0244d19a446718ad6f7605ff099 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Jan 2012 22:04:49 +0200 Subject: Btrfs: add balance progress reporting Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/ioctl.h | 2 ++ fs/btrfs/volumes.c | 39 +++++++++++++++++++++++++++++++++++---- fs/btrfs/volumes.h | 3 +++ 4 files changed, 84 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 85e546ffe3c7..1e7a9bac31ab 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3065,7 +3065,7 @@ out: return ret; } -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; @@ -3082,6 +3082,14 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); + + if (lock) { + spin_lock(&fs_info->balance_lock); + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + spin_unlock(&fs_info->balance_lock); + } else { + memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); + } } static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) @@ -3181,6 +3189,39 @@ static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) return -EINVAL; } +static long btrfs_ioctl_balance_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ioctl_balance_args *bargs; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&fs_info->balance_mutex); + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; + goto out; + } + + bargs = kzalloc(sizeof(*bargs), GFP_NOFS); + if (!bargs) { + ret = -ENOMEM; + goto out; + } + + update_ioctl_balance_args(fs_info, 1, bargs); + + if (copy_to_user(arg, bargs, sizeof(*bargs))) + ret = -EFAULT; + + kfree(bargs); +out: + mutex_unlock(&fs_info->balance_mutex); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3261,6 +3302,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_balance(root, argp); case BTRFS_IOC_BALANCE_CTL: return btrfs_ioctl_balance_ctl(root, arg); + case BTRFS_IOC_BALANCE_PROGRESS: + return btrfs_ioctl_balance_progress(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index cd19d10794b9..4f69028a68c4 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -324,6 +324,8 @@ struct btrfs_ioctl_logical_ino_args { #define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ struct btrfs_ioctl_balance_args) #define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) +#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ + struct btrfs_ioctl_balance_args) #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c32667318ae4..d73439b4d7da 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2441,6 +2441,7 @@ static u64 div_factor(u64 num, int factor) static int __btrfs_balance(struct btrfs_fs_info *fs_info) { + struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_root *chunk_root = fs_info->chunk_root; struct btrfs_root *dev_root = fs_info->dev_root; struct list_head *devices; @@ -2456,6 +2457,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) int slot; int ret; int enospc_errors = 0; + bool counting = true; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -2487,12 +2489,18 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) ret = -ENOMEM; goto error; } + + /* zero out stat counters */ + spin_lock(&fs_info->balance_lock); + memset(&bctl->stat, 0, sizeof(bctl->stat)); + spin_unlock(&fs_info->balance_lock); +again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { - if (atomic_read(&fs_info->balance_pause_req) || + if ((!counting && atomic_read(&fs_info->balance_pause_req)) || atomic_read(&fs_info->balance_cancel_req)) { ret = -ECANCELED; goto error; @@ -2529,24 +2537,47 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + if (!counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.considered++; + spin_unlock(&fs_info->balance_lock); + } + ret = should_balance_chunk(chunk_root, leaf, chunk, found_key.offset); btrfs_release_path(path); if (!ret) goto loop; + if (counting) { + spin_lock(&fs_info->balance_lock); + bctl->stat.expected++; + spin_unlock(&fs_info->balance_lock); + goto loop; + } + ret = btrfs_relocate_chunk(chunk_root, chunk_root->root_key.objectid, found_key.objectid, found_key.offset); if (ret && ret != -ENOSPC) goto error; - if (ret == -ENOSPC) + if (ret == -ENOSPC) { enospc_errors++; + } else { + spin_lock(&fs_info->balance_lock); + bctl->stat.completed++; + spin_unlock(&fs_info->balance_lock); + } loop: key.offset = found_key.offset - 1; } + if (counting) { + btrfs_release_path(path); + counting = false; + goto again; + } error: btrfs_free_path(path); if (enospc_errors) { @@ -2576,7 +2607,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) BUG_ON(ret); } -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, struct btrfs_ioctl_balance_args *bargs); /* @@ -2706,7 +2737,7 @@ do_balance: if (bargs) { memset(bargs, 0, sizeof(*bargs)); - update_ioctl_balance_args(fs_info, bargs); + update_ioctl_balance_args(fs_info, 0, bargs); } if ((ret && ret != -ECANCELED && ret != -ENOSPC) || diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index caa9abd218ef..6faec9dd1f93 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -218,6 +218,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) struct btrfs_balance_args; +struct btrfs_balance_progress; struct btrfs_balance_control { struct btrfs_fs_info *fs_info; @@ -226,6 +227,8 @@ struct btrfs_balance_control { struct btrfs_balance_args sys; u64 flags; + + struct btrfs_balance_progress stat; }; int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, -- cgit From 96bdc7dc61fb1b1e8e858dafb13abee8482ba064 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 16 Jan 2012 08:13:11 -0500 Subject: Btrfs: use larger system chunks system chunks by default are very small. This makes them slightly larger and also fixes the conditional checks to make sure we don't allocate a billion of them at once. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 +++ fs/btrfs/volumes.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e0ad5f0f895e..700879ed64cf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3384,6 +3384,9 @@ static int should_alloc_chunk(struct btrfs_root *root, /* 256MB or 2% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); + /* system chunks need a much small threshold */ + if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) + thresh = 32 * 1024 * 1024; if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 59e878f9fdcc..7ffdb154daec 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3166,7 +3166,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 8 * 1024 * 1024; + max_stripe_size = 32 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; } else { printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", -- cgit From 8a3344269465b26761b74493cfbeeaa75d821614 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 7 Oct 2011 18:06:13 +0200 Subject: btrfs: silence warning in raid array setup Raid array setup code creates an extent buffer in an usual way. When the PAGE_CACHE_SIZE is > super block size, the extent pages are not marked up-to-date, which triggers a WARN_ON in the following write_extent_buffer call. Add an explicit up-to-date call to silence the warning. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7ffdb154daec..d8f282b5baa9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4357,6 +4357,20 @@ int btrfs_read_sys_array(struct btrfs_root *root) return -ENOMEM; btrfs_set_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); + /* + * The sb extent buffer is artifical and just used to read the system array. + * btrfs_set_buffer_uptodate() call does not properly mark all it's + * pages up-to-date when the page is larger: extent does not cover the + * whole page and consequently check_page_uptodate does not find all + * the page's extents up-to-date (the hole beyond sb), + * write_extent_buffer then triggers a WARN_ON. + * + * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, + * but sb spans only this function. Add an explicit SetPageUptodate call + * to silence the warning eg. on PowerPC 64. + */ + if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) + SetPageUptodate(sb->first_page); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); -- cgit From 285190d99fef696ec8b0041787387f013cb71d67 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 16 Feb 2012 16:23:58 +0900 Subject: Btrfs: check return value of lookup_extent_mapping() correctly This patch corrects error checking of lookup_extent_mapping(). Signed-off-by: Tsutomu Itoh --- fs/btrfs/compression.c | 2 ++ fs/btrfs/extent_io.c | 2 +- fs/btrfs/volumes.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 14f1c5a0b2d2..d02c27cd14c7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -588,6 +588,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page_offset(bio->bi_io_vec->bv_page), PAGE_CACHE_SIZE); read_unlock(&em_tree->lock); + if (!em) + return -EIO; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b05d35a7c0f1..8d6f55fbd28e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3308,7 +3308,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, len = end - start + 1; write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); - if (IS_ERR_OR_NULL(em)) { + if (!em) { write_unlock(&map->lock); break; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d8f282b5baa9..cd040bf3fd87 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1954,7 +1954,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, em = lookup_extent_mapping(em_tree, chunk_offset, 1); read_unlock(&em_tree->lock); - BUG_ON(em->start > chunk_offset || + BUG_ON(!em || em->start > chunk_offset || em->start + em->len < chunk_offset); map = (struct map_lookup *)em->bdev; -- cgit From a6b0d5c8dbfd428717fc4db4c36757783f391c7b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Feb 2012 20:53:43 -0500 Subject: Btrfs: make sure we update latest_bdev When we are setting up the mount, we close all the devices that were not actually part of the metadata we found. But, we don't make sure that one of those devices wasn't fs_devices->latest_bdev, which means we can do a use after free on the one we closed. This updates latest_bdev as it goes. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 ++++++ fs/btrfs/volumes.c | 17 ++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 58d0678dfcba..b801d29f3f10 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2305,6 +2305,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_devices); + if (!fs_devices->latest_bdev) { + printk(KERN_CRIT "btrfs: failed to read devices on %s\n", + sb->s_id); + goto fail_tree_roots; + } + retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cd040bf3fd87..7eefdef8a14c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -459,12 +459,23 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *next; + struct block_device *latest_bdev = NULL; + u64 latest_devid = 0; + u64 latest_transid = 0; + mutex_lock(&uuid_mutex); again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->in_fs_metadata) + if (device->in_fs_metadata) { + if (!latest_transid || + device->generation > latest_transid) { + latest_devid = device->devid; + latest_transid = device->generation; + latest_bdev = device->bdev; + } continue; + } if (device->bdev) { blkdev_put(device->bdev, device->mode); @@ -487,6 +498,10 @@ again: goto again; } + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + mutex_unlock(&uuid_mutex); return 0; } -- cgit