diff options
Diffstat (limited to 'fs/btrfs/dev-replace.c')
| -rw-r--r-- | fs/btrfs/dev-replace.c | 856 | 
1 files changed, 856 insertions, 0 deletions
| diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c new file mode 100644 index 000000000000..66dbc8dbddf7 --- /dev/null +++ b/fs/btrfs/dev-replace.c @@ -0,0 +1,856 @@ +/* + * Copyright (C) STRATO AG 2012.  All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/iocontext.h> +#include <linux/capability.h> +#include <linux/kthread.h> +#include <linux/math64.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "dev-replace.h" + +static u64 btrfs_get_seconds_since_1970(void); +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, +				       int scrub_ret); +static void btrfs_dev_replace_update_device_in_mapping_tree( +						struct btrfs_fs_info *fs_info, +						struct btrfs_device *srcdev, +						struct btrfs_device *tgtdev); +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, +					 char *srcdev_name, +					 struct btrfs_device **device); +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); +static int btrfs_dev_replace_kthread(void *data); +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); + + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_key key; +	struct btrfs_root *dev_root = fs_info->dev_root; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	struct extent_buffer *eb; +	int slot; +	int ret = 0; +	struct btrfs_path *path = NULL; +	int item_size; +	struct btrfs_dev_replace_item *ptr; +	u64 src_devid; + +	path = btrfs_alloc_path(); +	if (!path) { +		ret = -ENOMEM; +		goto out; +	} + +	key.objectid = 0; +	key.type = BTRFS_DEV_REPLACE_KEY; +	key.offset = 0; +	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); +	if (ret) { +no_valid_dev_replace_entry_found: +		ret = 0; +		dev_replace->replace_state = +			BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; +		dev_replace->cont_reading_from_srcdev_mode = +		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; +		dev_replace->replace_state = 0; +		dev_replace->time_started = 0; +		dev_replace->time_stopped = 0; +		atomic64_set(&dev_replace->num_write_errors, 0); +		atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); +		dev_replace->cursor_left = 0; +		dev_replace->committed_cursor_left = 0; +		dev_replace->cursor_left_last_write_of_item = 0; +		dev_replace->cursor_right = 0; +		dev_replace->srcdev = NULL; +		dev_replace->tgtdev = NULL; +		dev_replace->is_valid = 0; +		dev_replace->item_needs_writeback = 0; +		goto out; +	} +	slot = path->slots[0]; +	eb = path->nodes[0]; +	item_size = btrfs_item_size_nr(eb, slot); +	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); + +	if (item_size != sizeof(struct btrfs_dev_replace_item)) { +		pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); +		goto no_valid_dev_replace_entry_found; +	} + +	src_devid = btrfs_dev_replace_src_devid(eb, ptr); +	dev_replace->cont_reading_from_srcdev_mode = +		btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); +	dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); +	dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); +	dev_replace->time_stopped = +		btrfs_dev_replace_time_stopped(eb, ptr); +	atomic64_set(&dev_replace->num_write_errors, +		     btrfs_dev_replace_num_write_errors(eb, ptr)); +	atomic64_set(&dev_replace->num_uncorrectable_read_errors, +		     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); +	dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); +	dev_replace->committed_cursor_left = dev_replace->cursor_left; +	dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; +	dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); +	dev_replace->is_valid = 1; + +	dev_replace->item_needs_writeback = 0; +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		dev_replace->srcdev = NULL; +		dev_replace->tgtdev = NULL; +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, +							NULL, NULL); +		dev_replace->tgtdev = btrfs_find_device(fs_info, +							BTRFS_DEV_REPLACE_DEVID, +							NULL, NULL); +		/* +		 * allow 'btrfs dev replace_cancel' if src/tgt device is +		 * missing +		 */ +		if (!dev_replace->srcdev && +		    !btrfs_test_opt(dev_root, DEGRADED)) { +			ret = -EIO; +			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", +				(unsigned long long)src_devid); +		} +		if (!dev_replace->tgtdev && +		    !btrfs_test_opt(dev_root, DEGRADED)) { +			ret = -EIO; +			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", +				(unsigned long long)BTRFS_DEV_REPLACE_DEVID); +		} +		if (dev_replace->tgtdev) { +			if (dev_replace->srcdev) { +				dev_replace->tgtdev->total_bytes = +					dev_replace->srcdev->total_bytes; +				dev_replace->tgtdev->disk_total_bytes = +					dev_replace->srcdev->disk_total_bytes; +				dev_replace->tgtdev->bytes_used = +					dev_replace->srcdev->bytes_used; +			} +			dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; +			btrfs_init_dev_replace_tgtdev_for_resume(fs_info, +				dev_replace->tgtdev); +		} +		break; +	} + +out: +	if (path) +		btrfs_free_path(path); +	return ret; +} + +/* + * called from commit_transaction. Writes changed device replace state to + * disk. + */ +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, +			  struct btrfs_fs_info *fs_info) +{ +	int ret; +	struct btrfs_root *dev_root = fs_info->dev_root; +	struct btrfs_path *path; +	struct btrfs_key key; +	struct extent_buffer *eb; +	struct btrfs_dev_replace_item *ptr; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + +	btrfs_dev_replace_lock(dev_replace); +	if (!dev_replace->is_valid || +	    !dev_replace->item_needs_writeback) { +		btrfs_dev_replace_unlock(dev_replace); +		return 0; +	} +	btrfs_dev_replace_unlock(dev_replace); + +	key.objectid = 0; +	key.type = BTRFS_DEV_REPLACE_KEY; +	key.offset = 0; + +	path = btrfs_alloc_path(); +	if (!path) { +		ret = -ENOMEM; +		goto out; +	} +	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); +	if (ret < 0) { +		pr_warn("btrfs: error %d while searching for dev_replace item!\n", +			ret); +		goto out; +	} + +	if (ret == 0 && +	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { +		/* +		 * need to delete old one and insert a new one. +		 * Since no attempt is made to recover any old state, if the +		 * dev_replace state is 'running', the data on the target +		 * drive is lost. +		 * It would be possible to recover the state: just make sure +		 * that the beginning of the item is never changed and always +		 * contains all the essential information. Then read this +		 * minimal set of information and use it as a base for the +		 * new state. +		 */ +		ret = btrfs_del_item(trans, dev_root, path); +		if (ret != 0) { +			pr_warn("btrfs: delete too small dev_replace item failed %d!\n", +				ret); +			goto out; +		} +		ret = 1; +	} + +	if (ret == 1) { +		/* need to insert a new item */ +		btrfs_release_path(path); +		ret = btrfs_insert_empty_item(trans, dev_root, path, +					      &key, sizeof(*ptr)); +		if (ret < 0) { +			pr_warn("btrfs: insert dev_replace item failed %d!\n", +				ret); +			goto out; +		} +	} + +	eb = path->nodes[0]; +	ptr = btrfs_item_ptr(eb, path->slots[0], +			     struct btrfs_dev_replace_item); + +	btrfs_dev_replace_lock(dev_replace); +	if (dev_replace->srcdev) +		btrfs_set_dev_replace_src_devid(eb, ptr, +			dev_replace->srcdev->devid); +	else +		btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); +	btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, +		dev_replace->cont_reading_from_srcdev_mode); +	btrfs_set_dev_replace_replace_state(eb, ptr, +		dev_replace->replace_state); +	btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); +	btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); +	btrfs_set_dev_replace_num_write_errors(eb, ptr, +		atomic64_read(&dev_replace->num_write_errors)); +	btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, +		atomic64_read(&dev_replace->num_uncorrectable_read_errors)); +	dev_replace->cursor_left_last_write_of_item = +		dev_replace->cursor_left; +	btrfs_set_dev_replace_cursor_left(eb, ptr, +		dev_replace->cursor_left_last_write_of_item); +	btrfs_set_dev_replace_cursor_right(eb, ptr, +		dev_replace->cursor_right); +	dev_replace->item_needs_writeback = 0; +	btrfs_dev_replace_unlock(dev_replace); + +	btrfs_mark_buffer_dirty(eb); + +out: +	btrfs_free_path(path); + +	return ret; +} + +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + +	dev_replace->committed_cursor_left = +		dev_replace->cursor_left_last_write_of_item; +} + +static u64 btrfs_get_seconds_since_1970(void) +{ +	struct timespec t = CURRENT_TIME_SEC; + +	return t.tv_sec; +} + +int btrfs_dev_replace_start(struct btrfs_root *root, +			    struct btrfs_ioctl_dev_replace_args *args) +{ +	struct btrfs_trans_handle *trans; +	struct btrfs_fs_info *fs_info = root->fs_info; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	int ret; +	struct btrfs_device *tgt_device = NULL; +	struct btrfs_device *src_device = NULL; + +	switch (args->start.cont_reading_from_srcdev_mode) { +	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: +	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: +		break; +	default: +		return -EINVAL; +	} + +	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || +	    args->start.tgtdev_name[0] == '\0') +		return -EINVAL; + +	mutex_lock(&fs_info->volume_mutex); +	ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, +					    &tgt_device); +	if (ret) { +		pr_err("btrfs: target device %s is invalid!\n", +		       args->start.tgtdev_name); +		mutex_unlock(&fs_info->volume_mutex); +		return -EINVAL; +	} + +	ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, +					    args->start.srcdev_name, +					    &src_device); +	mutex_unlock(&fs_info->volume_mutex); +	if (ret) { +		ret = -EINVAL; +		goto leave_no_lock; +	} + +	if (tgt_device->total_bytes < src_device->total_bytes) { +		pr_err("btrfs: target device is smaller than source device!\n"); +		ret = -EINVAL; +		goto leave_no_lock; +	} + +	btrfs_dev_replace_lock(dev_replace); +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; +		goto leave; +	} + +	dev_replace->cont_reading_from_srcdev_mode = +		args->start.cont_reading_from_srcdev_mode; +	WARN_ON(!src_device); +	dev_replace->srcdev = src_device; +	WARN_ON(!tgt_device); +	dev_replace->tgtdev = tgt_device; + +	printk_in_rcu(KERN_INFO +		      "btrfs: dev_replace from %s (devid %llu) to %s) started\n", +		      src_device->missing ? "<missing disk>" : +		        rcu_str_deref(src_device->name), +		      src_device->devid, +		      rcu_str_deref(tgt_device->name)); + +	tgt_device->total_bytes = src_device->total_bytes; +	tgt_device->disk_total_bytes = src_device->disk_total_bytes; +	tgt_device->bytes_used = src_device->bytes_used; + +	/* +	 * from now on, the writes to the srcdev are all duplicated to +	 * go to the tgtdev as well (refer to btrfs_map_block()). +	 */ +	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; +	dev_replace->time_started = btrfs_get_seconds_since_1970(); +	dev_replace->cursor_left = 0; +	dev_replace->committed_cursor_left = 0; +	dev_replace->cursor_left_last_write_of_item = 0; +	dev_replace->cursor_right = 0; +	dev_replace->is_valid = 1; +	dev_replace->item_needs_writeback = 1; +	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; +	btrfs_dev_replace_unlock(dev_replace); + +	btrfs_wait_ordered_extents(root, 0); + +	/* force writing the updated state information to disk */ +	trans = btrfs_start_transaction(root, 0); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		btrfs_dev_replace_lock(dev_replace); +		goto leave; +	} + +	ret = btrfs_commit_transaction(trans, root); +	WARN_ON(ret); + +	/* the disk copy procedure reuses the scrub code */ +	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, +			      src_device->total_bytes, +			      &dev_replace->scrub_progress, 0, 1); + +	ret = btrfs_dev_replace_finishing(root->fs_info, ret); +	WARN_ON(ret); + +	return 0; + +leave: +	dev_replace->srcdev = NULL; +	dev_replace->tgtdev = NULL; +	btrfs_dev_replace_unlock(dev_replace); +leave_no_lock: +	if (tgt_device) +		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); +	return ret; +} + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, +				       int scrub_ret) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	struct btrfs_device *tgt_device; +	struct btrfs_device *src_device; +	struct btrfs_root *root = fs_info->tree_root; +	u8 uuid_tmp[BTRFS_UUID_SIZE]; +	struct btrfs_trans_handle *trans; +	int ret = 0; + +	/* don't allow cancel or unmount to disturb the finishing procedure */ +	mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + +	btrfs_dev_replace_lock(dev_replace); +	/* was the operation canceled, or is it finished? */ +	if (dev_replace->replace_state != +	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { +		btrfs_dev_replace_unlock(dev_replace); +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +		return 0; +	} + +	tgt_device = dev_replace->tgtdev; +	src_device = dev_replace->srcdev; +	btrfs_dev_replace_unlock(dev_replace); + +	/* replace old device with new one in mapping tree */ +	if (!scrub_ret) +		btrfs_dev_replace_update_device_in_mapping_tree(fs_info, +								src_device, +								tgt_device); + +	/* +	 * flush all outstanding I/O and inode extent mappings before the +	 * copy operation is declared as being finished +	 */ +	btrfs_start_delalloc_inodes(root, 0); +	btrfs_wait_ordered_extents(root, 0); + +	trans = btrfs_start_transaction(root, 0); +	if (IS_ERR(trans)) { +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +		return PTR_ERR(trans); +	} +	ret = btrfs_commit_transaction(trans, root); +	WARN_ON(ret); + +	/* keep away write_all_supers() during the finishing procedure */ +	mutex_lock(&root->fs_info->fs_devices->device_list_mutex); +	btrfs_dev_replace_lock(dev_replace); +	dev_replace->replace_state = +		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED +			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; +	dev_replace->tgtdev = NULL; +	dev_replace->srcdev = NULL; +	dev_replace->time_stopped = btrfs_get_seconds_since_1970(); +	dev_replace->item_needs_writeback = 1; + +	if (scrub_ret) { +		printk_in_rcu(KERN_ERR +			      "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", +			      src_device->missing ? "<missing disk>" : +			        rcu_str_deref(src_device->name), +			      src_device->devid, +			      rcu_str_deref(tgt_device->name), scrub_ret); +		btrfs_dev_replace_unlock(dev_replace); +		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); +		if (tgt_device) +			btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + +		return 0; +	} + +	printk_in_rcu(KERN_INFO +		      "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", +		      src_device->missing ? "<missing disk>" : +		        rcu_str_deref(src_device->name), +		      src_device->devid, +		      rcu_str_deref(tgt_device->name)); +	tgt_device->is_tgtdev_for_dev_replace = 0; +	tgt_device->devid = src_device->devid; +	src_device->devid = BTRFS_DEV_REPLACE_DEVID; +	tgt_device->bytes_used = src_device->bytes_used; +	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); +	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); +	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); +	tgt_device->total_bytes = src_device->total_bytes; +	tgt_device->disk_total_bytes = src_device->disk_total_bytes; +	tgt_device->bytes_used = src_device->bytes_used; +	if (fs_info->sb->s_bdev == src_device->bdev) +		fs_info->sb->s_bdev = tgt_device->bdev; +	if (fs_info->fs_devices->latest_bdev == src_device->bdev) +		fs_info->fs_devices->latest_bdev = tgt_device->bdev; +	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + +	btrfs_rm_dev_replace_srcdev(fs_info, src_device); +	if (src_device->bdev) { +		/* zero out the old super */ +		btrfs_scratch_superblock(src_device); +	} +	/* +	 * this is again a consistent state where no dev_replace procedure +	 * is running, the target device is part of the filesystem, the +	 * source device is not part of the filesystem anymore and its 1st +	 * superblock is scratched out so that it is no longer marked to +	 * belong to this filesystem. +	 */ +	btrfs_dev_replace_unlock(dev_replace); +	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + +	/* write back the superblocks */ +	trans = btrfs_start_transaction(root, 0); +	if (!IS_ERR(trans)) +		btrfs_commit_transaction(trans, root); + +	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + +	return 0; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( +						struct btrfs_fs_info *fs_info, +						struct btrfs_device *srcdev, +						struct btrfs_device *tgtdev) +{ +	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; +	struct extent_map *em; +	struct map_lookup *map; +	u64 start = 0; +	int i; + +	write_lock(&em_tree->lock); +	do { +		em = lookup_extent_mapping(em_tree, start, (u64)-1); +		if (!em) +			break; +		map = (struct map_lookup *)em->bdev; +		for (i = 0; i < map->num_stripes; i++) +			if (srcdev == map->stripes[i].dev) +				map->stripes[i].dev = tgtdev; +		start = em->start + em->len; +		free_extent_map(em); +	} while (start); +	write_unlock(&em_tree->lock); +} + +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, +					 char *srcdev_name, +					 struct btrfs_device **device) +{ +	int ret; + +	if (srcdevid) { +		ret = 0; +		*device = btrfs_find_device(root->fs_info, srcdevid, NULL, +					    NULL); +		if (!*device) +			ret = -ENOENT; +	} else { +		ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, +							   device); +	} +	return ret; +} + +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, +			      struct btrfs_ioctl_dev_replace_args *args) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + +	btrfs_dev_replace_lock(dev_replace); +	/* even if !dev_replace_is_valid, the values are good enough for +	 * the replace_status ioctl */ +	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; +	args->status.replace_state = dev_replace->replace_state; +	args->status.time_started = dev_replace->time_started; +	args->status.time_stopped = dev_replace->time_stopped; +	args->status.num_write_errors = +		atomic64_read(&dev_replace->num_write_errors); +	args->status.num_uncorrectable_read_errors = +		atomic64_read(&dev_replace->num_uncorrectable_read_errors); +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		args->status.progress_1000 = 0; +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +		args->status.progress_1000 = 1000; +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		args->status.progress_1000 = div64_u64(dev_replace->cursor_left, +			div64_u64(dev_replace->srcdev->total_bytes, 1000)); +		break; +	} +	btrfs_dev_replace_unlock(dev_replace); +} + +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, +			     struct btrfs_ioctl_dev_replace_args *args) +{ +	args->result = __btrfs_dev_replace_cancel(fs_info); +	return 0; +} + +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	struct btrfs_device *tgt_device = NULL; +	struct btrfs_trans_handle *trans; +	struct btrfs_root *root = fs_info->tree_root; +	u64 result; +	int ret; + +	mutex_lock(&dev_replace->lock_finishing_cancel_unmount); +	btrfs_dev_replace_lock(dev_replace); +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; +		btrfs_dev_replace_unlock(dev_replace); +		goto leave; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; +		tgt_device = dev_replace->tgtdev; +		dev_replace->tgtdev = NULL; +		dev_replace->srcdev = NULL; +		break; +	} +	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; +	dev_replace->time_stopped = btrfs_get_seconds_since_1970(); +	dev_replace->item_needs_writeback = 1; +	btrfs_dev_replace_unlock(dev_replace); +	btrfs_scrub_cancel(fs_info); + +	trans = btrfs_start_transaction(root, 0); +	if (IS_ERR(trans)) { +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +		return PTR_ERR(trans); +	} +	ret = btrfs_commit_transaction(trans, root); +	WARN_ON(ret); +	if (tgt_device) +		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + +leave: +	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +	return result; +} + +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + +	mutex_lock(&dev_replace->lock_finishing_cancel_unmount); +	btrfs_dev_replace_lock(dev_replace); +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +		dev_replace->replace_state = +			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; +		dev_replace->time_stopped = btrfs_get_seconds_since_1970(); +		dev_replace->item_needs_writeback = 1; +		pr_info("btrfs: suspending dev_replace for unmount\n"); +		break; +	} + +	btrfs_dev_replace_unlock(dev_replace); +	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +} + +/* resume dev_replace procedure that was interrupted by unmount */ +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) +{ +	struct task_struct *task; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + +	btrfs_dev_replace_lock(dev_replace); +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		btrfs_dev_replace_unlock(dev_replace); +		return 0; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		dev_replace->replace_state = +			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; +		break; +	} +	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { +		pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" +			"btrfs: you may cancel the operation after 'mount -o degraded'\n"); +		btrfs_dev_replace_unlock(dev_replace); +		return 0; +	} +	btrfs_dev_replace_unlock(dev_replace); + +	WARN_ON(atomic_xchg( +		&fs_info->mutually_exclusive_operation_running, 1)); +	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); +	return PTR_RET(task); +} + +static int btrfs_dev_replace_kthread(void *data) +{ +	struct btrfs_fs_info *fs_info = data; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	struct btrfs_ioctl_dev_replace_args *status_args; +	u64 progress; + +	status_args = kzalloc(sizeof(*status_args), GFP_NOFS); +	if (status_args) { +		btrfs_dev_replace_status(fs_info, status_args); +		progress = status_args->status.progress_1000; +		kfree(status_args); +		do_div(progress, 10); +		printk_in_rcu(KERN_INFO +			      "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", +			      dev_replace->srcdev->missing ? "<missing disk>" : +				rcu_str_deref(dev_replace->srcdev->name), +			      dev_replace->srcdev->devid, +			      dev_replace->tgtdev ? +				rcu_str_deref(dev_replace->tgtdev->name) : +				"<missing target disk>", +			      (unsigned int)progress); +	} +	btrfs_dev_replace_continue_on_mount(fs_info); +	atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + +	return 0; +} + +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	int ret; + +	ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, +			      dev_replace->committed_cursor_left, +			      dev_replace->srcdev->total_bytes, +			      &dev_replace->scrub_progress, 0, 1); +	ret = btrfs_dev_replace_finishing(fs_info, ret); +	WARN_ON(ret); +	return 0; +} + +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +{ +	if (!dev_replace->is_valid) +		return 0; + +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		return 0; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		/* +		 * return true even if tgtdev is missing (this is +		 * something that can happen if the dev_replace +		 * procedure is suspended by an umount and then +		 * the tgtdev is missing (or "btrfs dev scan") was +		 * not called and the the filesystem is remounted +		 * in degraded state. This does not stop the +		 * dev_replace procedure. It needs to be canceled +		 * manually if the cancelation is wanted. +		 */ +		break; +	} +	return 1; +} + +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) +{ +	/* the beginning is just an optimization for the typical case */ +	if (atomic_read(&dev_replace->nesting_level) == 0) { +acquire_lock: +		/* this is not a nested case where the same thread +		 * is trying to acqurire the same lock twice */ +		mutex_lock(&dev_replace->lock); +		mutex_lock(&dev_replace->lock_management_lock); +		dev_replace->lock_owner = current->pid; +		atomic_inc(&dev_replace->nesting_level); +		mutex_unlock(&dev_replace->lock_management_lock); +		return; +	} + +	mutex_lock(&dev_replace->lock_management_lock); +	if (atomic_read(&dev_replace->nesting_level) > 0 && +	    dev_replace->lock_owner == current->pid) { +		WARN_ON(!mutex_is_locked(&dev_replace->lock)); +		atomic_inc(&dev_replace->nesting_level); +		mutex_unlock(&dev_replace->lock_management_lock); +		return; +	} + +	mutex_unlock(&dev_replace->lock_management_lock); +	goto acquire_lock; +} + +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) +{ +	WARN_ON(!mutex_is_locked(&dev_replace->lock)); +	mutex_lock(&dev_replace->lock_management_lock); +	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); +	WARN_ON(dev_replace->lock_owner != current->pid); +	atomic_dec(&dev_replace->nesting_level); +	if (atomic_read(&dev_replace->nesting_level) == 0) { +		dev_replace->lock_owner = 0; +		mutex_unlock(&dev_replace->lock_management_lock); +		mutex_unlock(&dev_replace->lock); +	} else { +		mutex_unlock(&dev_replace->lock_management_lock); +	} +} |