Diffstat (limited to 'fs/btrfs')
42 files changed, 5271 insertions, 1763 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d7fcdba141a2..7df3e0f0ee51 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \  	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \  	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \  	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ -	   reada.o backref.o ulist.o qgroup.o send.o +	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o  btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o  btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0c16e3dbfd56..e15d2b0d8d3b 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,  			ret = posix_acl_equiv_mode(acl, &inode->i_mode);  			if (ret < 0)  				return ret; +			if (ret == 0) +				acl = NULL;  		}  		ret = 0;  		break; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 208d8aa5b07e..04edf69be875 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)  		     pos2 = n2, n2 = pos2->next) {  			struct __prelim_ref *ref2;  			struct __prelim_ref *xchg; +			struct extent_inode_elem *eie;  			ref2 = list_entry(pos2, struct __prelim_ref, list); @@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)  					ref1 = ref2;  					ref2 = xchg;  				} -				ref1->count += ref2->count;  			} else {  				if (ref1->parent != ref2->parent)  					continue; -				ref1->count += ref2->count;  			} + +			eie = ref1->inode_list; +			while (eie && eie->next) +				eie = eie->next; +			if (eie) +				eie->next = ref2->inode_list; +			else +				ref1->inode_list = ref2->inode_list; +			ref1->count += ref2->count; +  			list_del(&ref2->list);  			kfree(ref2);  		} @@ -890,8 +899,7 @@ again:  	while (!list_empty(&prefs)) {  		ref = list_first_entry(&prefs, struct __prelim_ref, list);  		list_del(&ref->list); -		if (ref->count < 0) -			WARN_ON(1); +		WARN_ON(ref->count < 0);  		if (ref->count && ref->root_id && ref->parent == 0) {  			/* no parent == root of tree */  			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ed8ca7ca5eff..2a8c242bc4f5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -39,6 +39,7 @@  #define BTRFS_INODE_HAS_ORPHAN_ITEM		5  #define BTRFS_INODE_HAS_ASYNC_EXTENT		6  #define BTRFS_INODE_NEEDS_FULL_SYNC		7 +#define BTRFS_INODE_COPY_EVERYTHING		8  /* in memory btrfs inode */  struct btrfs_inode { @@ -90,6 +91,9 @@ struct btrfs_inode {  	unsigned long runtime_flags; +	/* Keep track of who's O_SYNC/fsyncing currently */ +	atomic_t sync_writers; +  	/* full 64 bit generation number, struct vfs_inode doesn't have a big  	 * enough field for this.  	 
*/ diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 5a3e45db642a..11d47bfb62b4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -137,7 +137,7 @@ struct btrfsic_block {  	unsigned int never_written:1;	/* block was added because it was  					 * referenced, not because it was  					 * written */ -	unsigned int mirror_num:2;	/* large enough to hold +	unsigned int mirror_num;	/* large enough to hold  					 * BTRFS_SUPER_MIRROR_MAX */  	struct btrfsic_dev_state *dev_state;  	u64 dev_bytenr;		/* key, physical byte num on disk */ @@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,  		}  		num_copies = -		    btrfs_num_copies(&state->root->fs_info->mapping_tree, +		    btrfs_num_copies(state->root->fs_info,  				     next_bytenr, state->metablock_size);  		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)  			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(  		}  		num_copies = -		    btrfs_num_copies(&state->root->fs_info->mapping_tree, +		    btrfs_num_copies(state->root->fs_info,  				     next_bytenr, state->metablock_size);  		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)  			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(  	*next_blockp = NULL;  	if (0 == *num_copiesp) {  		*num_copiesp = -		    btrfs_num_copies(&state->root->fs_info->mapping_tree, +		    btrfs_num_copies(state->root->fs_info,  				     next_bytenr, state->metablock_size);  		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)  			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data(  			chunk_len = num_bytes;  		num_copies = -		    btrfs_num_copies(&state->root->fs_info->mapping_tree, +		    btrfs_num_copies(state->root->fs_info,  				     next_bytenr, state->datablock_size);  		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)  			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,  	struct btrfs_device *device;  	length = len; -	ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, +	ret = btrfs_map_block(state->root->fs_info, READ,  			      bytenr, &length, &multi, mirror_num); +	if (ret) { +		block_ctx_out->start = 0; +		block_ctx_out->dev_bytenr = 0; +		block_ctx_out->len = 0; +		block_ctx_out->dev = NULL; +		block_ctx_out->datav = NULL; +		block_ctx_out->pagev = NULL; +		block_ctx_out->mem_to_free = NULL; + +		return ret; +	} +  	device = multi->stripes[0].dev;  	block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);  	block_ctx_out->dev_bytenr = multi->stripes[0].physical; @@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,  	block_ctx_out->pagev = NULL;  	block_ctx_out->mem_to_free = NULL; -	if (0 == ret) -		kfree(multi); +	kfree(multi);  	if (NULL == block_ctx_out->dev) {  		ret = -ENXIO;  		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); @@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock(  		}  		num_copies = -		    btrfs_num_copies(&state->root->fs_info->mapping_tree, +		    btrfs_num_copies(state->root->fs_info,  				     next_bytenr, BTRFS_SUPER_INFO_SIZE);  		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)  			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", @@ -2960,7 +2971,7 @@ 
static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,  	struct btrfsic_block_data_ctx block_ctx;  	int match = 0; -	num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, +	num_copies = btrfs_num_copies(state->root->fs_info,  				      bytenr, state->metablock_size);  	for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c6467aa88bee..94ab2f80e7e3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,  			ret = btrfs_map_bio(root, READ, comp_bio,  					    mirror_num, 0); -			BUG_ON(ret); /* -ENOMEM */ +			if (ret) +				bio_endio(comp_bio, ret);  			bio_put(comp_bio); @@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,  	}  	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); -	BUG_ON(ret); /* -ENOMEM */ +	if (ret) +		bio_endio(comp_bio, ret);  	bio_put(comp_bio);  	return 0; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cdfb4c49a806..c7b67cf24bba 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,  			      struct extent_buffer *dst_buf,  			      struct extent_buffer *src_buf);  static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		    struct btrfs_path *path, int level, int slot, -		    int tree_mod_log); +		    struct btrfs_path *path, int level, int slot);  static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,  				 struct extent_buffer *eb);  struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, @@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,  static noinline void  tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, -			  struct extent_buffer *eb, -			  struct btrfs_disk_key *disk_key, int slot, int atomic) +			  struct extent_buffer *eb, int slot, int atomic)  {  	int ret; @@ -1140,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,  		switch (tm->op) {  		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:  			BUG_ON(tm->slot < n); -		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:  		case MOD_LOG_KEY_REMOVE: +			n++; +		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:  			btrfs_set_node_key(eb, &tm->key, tm->slot);  			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);  			btrfs_set_node_ptr_generation(eb, tm->slot,  						      tm->generation); -			n++;  			break;  		case MOD_LOG_KEY_REPLACE:  			BUG_ON(tm->slot >= n); @@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,  	u64 search_start;  	int ret; -	if (trans->transaction != root->fs_info->running_transaction) { -		printk(KERN_CRIT "trans %llu running %llu\n", +	if (trans->transaction != root->fs_info->running_transaction) +		WARN(1, KERN_CRIT "trans %llu running %llu\n",  		       (unsigned long long)trans->transid,  		       (unsigned long long)  		       root->fs_info->running_transaction->transid); -		WARN_ON(1); -	} -	if (trans->transid != root->fs_info->generation) { -		printk(KERN_CRIT "trans %llu running %llu\n", + +	if (trans->transid != root->fs_info->generation) +		WARN(1, KERN_CRIT "trans %llu running %llu\n",  		       (unsigned long long)trans->transid,  		       (unsigned long long)root->fs_info->generation); -		WARN_ON(1); -	}  	if (!should_cow_block(trans, root, buf)) {  		*cow_ret = buf; @@ -1469,10 +1464,8 @@ int 
btrfs_realloc_node(struct btrfs_trans_handle *trans,  	if (cache_only && parent_level != 1)  		return 0; -	if (trans->transaction != root->fs_info->running_transaction) -		WARN_ON(1); -	if (trans->transid != root->fs_info->generation) -		WARN_ON(1); +	WARN_ON(trans->transaction != root->fs_info->running_transaction); +	WARN_ON(trans->transid != root->fs_info->generation);  	parent_nritems = btrfs_header_nritems(parent);  	blocksize = btrfs_level_size(root, parent_level - 1); @@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		if (btrfs_header_nritems(right) == 0) {  			clean_tree_block(trans, root, right);  			btrfs_tree_unlock(right); -			del_ptr(trans, root, path, level + 1, pslot + 1, 1); +			del_ptr(trans, root, path, level + 1, pslot + 1);  			root_sub_used(root, right->len);  			btrfs_free_tree_block(trans, root, right, 0, 1);  			free_extent_buffer_stale(right); @@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  			struct btrfs_disk_key right_key;  			btrfs_node_key(right, &right_key, 0);  			tree_mod_log_set_node_key(root->fs_info, parent, -						  &right_key, pslot + 1, 0); +						  pslot + 1, 0);  			btrfs_set_node_key(parent, &right_key, pslot + 1);  			btrfs_mark_buffer_dirty(parent);  		} @@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  	if (btrfs_header_nritems(mid) == 0) {  		clean_tree_block(trans, root, mid);  		btrfs_tree_unlock(mid); -		del_ptr(trans, root, path, level + 1, pslot, 1); +		del_ptr(trans, root, path, level + 1, pslot);  		root_sub_used(root, mid->len);  		btrfs_free_tree_block(trans, root, mid, 0, 1);  		free_extent_buffer_stale(mid); @@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,  		/* update the parent key to reflect our changes */  		struct btrfs_disk_key mid_key;  		btrfs_node_key(mid, &mid_key, 0); -		tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, +		tree_mod_log_set_node_key(root->fs_info, parent,  					  pslot, 0);  		btrfs_set_node_key(parent, &mid_key, pslot);  		btrfs_mark_buffer_dirty(parent); @@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,  			orig_slot += left_nr;  			btrfs_node_key(mid, &disk_key, 0);  			tree_mod_log_set_node_key(root->fs_info, parent, -						  &disk_key, pslot, 0); +						  pslot, 0);  			btrfs_set_node_key(parent, &disk_key, pslot);  			btrfs_mark_buffer_dirty(parent);  			if (btrfs_header_nritems(left) > orig_slot) { @@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,  			btrfs_node_key(right, &disk_key, 0);  			tree_mod_log_set_node_key(root->fs_info, parent, -						  &disk_key, pslot + 1, 0); +						  pslot + 1, 0);  			btrfs_set_node_key(parent, &disk_key, pslot + 1);  			btrfs_mark_buffer_dirty(parent); @@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,  	int no_skips = 0;  	struct extent_buffer *t; +	if (path->really_keep_locks) +		return; +  	for (i = level; i < BTRFS_MAX_LEVEL; i++) {  		if (!path->nodes[i])  			break; @@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)  {  	int i; -	if (path->keep_locks) +	if (path->keep_locks || path->really_keep_locks)  		return;  	for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root  	if (!cow)  		write_lock_level = -1; -	if (cow && (p->keep_locks 
|| p->lowest_level)) +	if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))  		write_lock_level = BTRFS_MAX_LEVEL;  	min_write_lock_level = write_lock_level; @@ -2568,7 +2564,10 @@ again:  			 * must have write locks on this node and the  			 * parent  			 */ -			if (level + 1 > write_lock_level) { +			if (level > write_lock_level || +			    (level + 1 > write_lock_level && +			    level + 1 < BTRFS_MAX_LEVEL && +			    p->nodes[level + 1])) {  				write_lock_level = level + 1;  				btrfs_release_path(p);  				goto again; @@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,  		if (!path->nodes[i])  			break;  		t = path->nodes[i]; -		tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); +		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);  		btrfs_set_node_key(t, key, tslot);  		btrfs_mark_buffer_dirty(path->nodes[i]);  		if (tslot != 0) @@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,   */  static int leaf_space_used(struct extent_buffer *l, int start, int nr)  { +	struct btrfs_item *start_item; +	struct btrfs_item *end_item; +	struct btrfs_map_token token;  	int data_len;  	int nritems = btrfs_header_nritems(l);  	int end = min(nritems, start + nr) - 1;  	if (!nr)  		return 0; -	data_len = btrfs_item_end_nr(l, start); -	data_len = data_len - btrfs_item_offset_nr(l, end); +	btrfs_init_map_token(&token); +	start_item = btrfs_item_nr(l, start); +	end_item = btrfs_item_nr(l, end); +	data_len = btrfs_token_item_offset(l, start_item, &token) + +		btrfs_token_item_size(l, start_item, &token); +	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);  	data_len += sizeof(struct btrfs_item) * nr;  	WARN_ON(data_len < 0);  	return data_len; @@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,  	if (push_items == 0)  		goto out_unlock; -	if (!empty && push_items == left_nritems) -		WARN_ON(1); +	WARN_ON(!empty && push_items == left_nritems);  	/* push left to right */  	right_nritems = btrfs_header_nritems(right); @@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,  	btrfs_set_header_nritems(left, old_left_nritems + push_items);  	/* fixup right node */ -	if (push_items > right_nritems) { -		printk(KERN_CRIT "push items %d nr %u\n", push_items, +	if (push_items > right_nritems) +		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,  		       right_nritems); -		WARN_ON(1); -	}  	if (push_items < right_nritems) {  		push_space = btrfs_item_offset_nr(right, push_items - 1) - @@ -4602,16 +4605,21 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root   * empty a node.   
*/  static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, -		    struct btrfs_path *path, int level, int slot, -		    int tree_mod_log) +		    struct btrfs_path *path, int level, int slot)  {  	struct extent_buffer *parent = path->nodes[level];  	u32 nritems;  	int ret; +	if (level) { +		ret = tree_mod_log_insert_key(root->fs_info, parent, slot, +					      MOD_LOG_KEY_REMOVE); +		BUG_ON(ret < 0); +	} +  	nritems = btrfs_header_nritems(parent);  	if (slot != nritems - 1) { -		if (tree_mod_log && level) +		if (level)  			tree_mod_log_eb_move(root->fs_info, parent, slot,  					     slot + 1, nritems - slot - 1);  		memmove_extent_buffer(parent, @@ -4619,10 +4627,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,  			      btrfs_node_key_ptr_offset(slot + 1),  			      sizeof(struct btrfs_key_ptr) *  			      (nritems - slot - 1)); -	} else if (tree_mod_log && level) { -		ret = tree_mod_log_insert_key(root->fs_info, parent, slot, -					      MOD_LOG_KEY_REMOVE); -		BUG_ON(ret < 0);  	}  	nritems--; @@ -4656,7 +4660,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,  				    struct extent_buffer *leaf)  {  	WARN_ON(btrfs_header_generation(leaf) != trans->transid); -	del_ptr(trans, root, path, 1, path->slots[1], 1); +	del_ptr(trans, root, path, 1, path->slots[1]);  	/*  	 * btrfs_free_extent is expensive, we want to make sure we @@ -5123,13 +5127,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  	right_path->search_commit_root = 1;  	right_path->skip_locking = 1; -	spin_lock(&left_root->root_times_lock); +	spin_lock(&left_root->root_item_lock);  	left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); -	spin_unlock(&left_root->root_times_lock); +	spin_unlock(&left_root->root_item_lock); -	spin_lock(&right_root->root_times_lock); +	spin_lock(&right_root->root_item_lock);  	right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); -	spin_unlock(&right_root->root_times_lock); +	spin_unlock(&right_root->root_item_lock);  	trans = btrfs_join_transaction(left_root);  	if (IS_ERR(trans)) { @@ -5224,15 +5228,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,  				goto out;  			} -			spin_lock(&left_root->root_times_lock); +			spin_lock(&left_root->root_item_lock);  			ctransid = btrfs_root_ctransid(&left_root->root_item); -			spin_unlock(&left_root->root_times_lock); +			spin_unlock(&left_root->root_item_lock);  			if (ctransid != left_start_ctransid)  				left_start_ctransid = 0; -			spin_lock(&right_root->root_times_lock); +			spin_lock(&right_root->root_item_lock);  			ctransid = btrfs_root_ctransid(&right_root->root_item); -			spin_unlock(&right_root->root_times_lock); +			spin_unlock(&right_root->root_item_lock);  			if (ctransid != right_start_ctransid)  				right_start_ctransid = 0; @@ -5496,6 +5500,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)  	return btrfs_next_old_leaf(root, path, 0);  } +/* Release the path up to but not including the given level */ +static void btrfs_release_level(struct btrfs_path *path, int level) +{ +	int i; + +	for (i = 0; i < level; i++) { +		path->slots[i] = 0; +		if (!path->nodes[i]) +			continue; +		if (path->locks[i]) { +			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); +			path->locks[i] = 0; +		} +		free_extent_buffer(path->nodes[i]); +		path->nodes[i] = NULL; +	} +} + +/* + * This function assumes 2 things + * + * 1) You are using path->keep_locks + * 2) You are not inserting items. 
+ * + * If either of these are not true do not use this function. If you need a next + * leaf with either of these not being true then this function can be easily + * adapted to do that, but at the moment these are the limitations. + */ +int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, +			  struct btrfs_root *root, struct btrfs_path *path, +			  int del) +{ +	struct extent_buffer *b; +	struct btrfs_key key; +	u32 nritems; +	int level = 1; +	int slot; +	int ret = 1; +	int write_lock_level = BTRFS_MAX_LEVEL; +	int ins_len = del ? -1 : 0; + +	WARN_ON(!(path->keep_locks || path->really_keep_locks)); + +	nritems = btrfs_header_nritems(path->nodes[0]); +	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + +	while (path->nodes[level]) { +		nritems = btrfs_header_nritems(path->nodes[level]); +		if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { +search: +			btrfs_release_path(path); +			ret = btrfs_search_slot(trans, root, &key, path, +						ins_len, 1); +			if (ret < 0) +				goto out; +			level = 1; +			continue; +		} + +		if (path->slots[level] >= nritems - 1) { +			level++; +			continue; +		} + +		btrfs_release_level(path, level); +		break; +	} + +	if (!path->nodes[level]) { +		ret = 1; +		goto out; +	} + +	path->slots[level]++; +	b = path->nodes[level]; + +	while (b) { +		level = btrfs_header_level(b); + +		if (!should_cow_block(trans, root, b)) +			goto cow_done; + +		btrfs_set_path_blocking(path); +		ret = btrfs_cow_block(trans, root, b, +				      path->nodes[level + 1], +				      path->slots[level + 1], &b); +		if (ret) +			goto out; +cow_done: +		path->nodes[level] = b; +		btrfs_clear_path_blocking(path, NULL, 0); +		if (level != 0) { +			ret = setup_nodes_for_search(trans, root, path, b, +						     level, ins_len, +						     &write_lock_level); +			if (ret == -EAGAIN) +				goto search; +			if (ret) +				goto out; + +			b = path->nodes[level]; +			slot = path->slots[level]; + +			ret = read_block_for_search(trans, root, path, +						    &b, level, slot, &key, 0); +			if (ret == -EAGAIN) +				goto search; +			if (ret) +				goto out; +			level = btrfs_header_level(b); +			if (!btrfs_try_tree_write_lock(b)) { +				btrfs_set_path_blocking(path); +				btrfs_tree_lock(b); +				btrfs_clear_path_blocking(path, b, +							  BTRFS_WRITE_LOCK); +			} +			path->locks[level] = BTRFS_WRITE_LOCK; +			path->nodes[level] = b; +			path->slots[level] = 0; +		} else { +			path->slots[level] = 0; +			ret = 0; +			break; +		} +	} + +out: +	if (ret) +		btrfs_release_path(path); + +	return ret; +} +  int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,  			u64 time_seq)  { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c72ead869507..547b7b05727f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,7 +48,7 @@ struct btrfs_ordered_sum;  #define BTRFS_MAGIC "_BHRfS_M" -#define BTRFS_MAX_MIRRORS 2 +#define BTRFS_MAX_MIRRORS 3  #define BTRFS_MAX_LEVEL 8 @@ -142,6 +142,8 @@ struct btrfs_ordered_sum;  #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 +#define BTRFS_DEV_REPLACE_DEVID 0 +  /*   * the max metadata block size.  This limit is somewhat artificial,   * but the memmove costs go through the roof for larger blocks. 
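The __merge_refs() change in backref.c above no longer just sums the counts of two mergeable refs: before the counts are combined, it walks ref1->inode_list to its tail and appends ref2->inode_list, so the merged ref keeps both extent_inode_elem chains instead of dropping ref2's. A minimal, runnable userspace sketch of that splice (struct and function names here are illustrative stand-ins, not kernel types):

#include <stdio.h>

/* stand-in for struct extent_inode_elem: a singly linked chain */
struct elem {
	int val;
	struct elem *next;
};

/* append src's chain to dst's, as the __merge_refs() hunk does:
 * walk to the tail if dst is non-empty, otherwise take src wholesale */
static struct elem *splice(struct elem *dst, struct elem *src)
{
	struct elem *eie = dst;

	if (!eie)
		return src;
	while (eie->next)
		eie = eie->next;
	eie->next = src;
	return dst;
}

int main(void)
{
	struct elem b = { 2, NULL }, a = { 1, &b };
	struct elem d = { 4, NULL }, c = { 3, &d };
	struct elem *m;

	for (m = splice(&a, &c); m; m = m->next)
		printf("%d ", m->val);	/* prints: 1 2 3 4 */
	printf("\n");
	return 0;
}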
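btrfs_next_leaf_write(), added to ctree.c above, advances a path to the next leaf and hands back a write-locked, COWed leaf. Per its comment it requires keep_locks (or the new really_keep_locks) and must not be used for insertions. A hedged sketch of the calling pattern, assuming trans, root and key are prepared elsewhere (they are not shown in this hunk):

	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->keep_locks = 1;		/* required by btrfs_next_leaf_write() */

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	while (1) {
		/* ... delete/modify items in the write-locked leaf
		 * path->nodes[0] ... */

		/* del=1 descends with ins_len == -1 so deletions stay legal */
		ret = btrfs_next_leaf_write(trans, root, path, 1);
		if (ret)
			break;	/* >0: no more leaves; <0: error; path released */
	}
out:
	btrfs_free_path(path);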
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };  /* four bytes for CRC32 */  #define BTRFS_EMPTY_DIR_SIZE 0 +/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +#define REQ_GET_READ_MIRRORS	(1 << 30) +  #define BTRFS_FT_UNKNOWN	0  #define BTRFS_FT_REG_FILE	1  #define BTRFS_FT_DIR		2 @@ -413,7 +418,7 @@ struct btrfs_root_backup {  	__le64 bytes_used;  	__le64 num_devices;  	/* future */ -	__le64 unsed_64[4]; +	__le64 unused_64[4];  	u8 tree_root_level;  	u8 chunk_root_level; @@ -571,6 +576,7 @@ struct btrfs_path {  	unsigned int skip_locking:1;  	unsigned int leave_spinning:1;  	unsigned int search_commit_root:1; +	unsigned int really_keep_locks:1;  };  /* @@ -885,6 +891,59 @@ struct btrfs_dev_stats_item {  	__le64 values[BTRFS_DEV_STAT_VALUES_MAX];  } __attribute__ ((__packed__)); +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0 +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID	1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED	0 +#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED		1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED		2 +#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED		3 +#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED		4 + +struct btrfs_dev_replace { +	u64 replace_state;	/* see #define above */ +	u64 time_started;	/* seconds since 1-Jan-1970 */ +	u64 time_stopped;	/* seconds since 1-Jan-1970 */ +	atomic64_t num_write_errors; +	atomic64_t num_uncorrectable_read_errors; + +	u64 cursor_left; +	u64 committed_cursor_left; +	u64 cursor_left_last_write_of_item; +	u64 cursor_right; + +	u64 cont_reading_from_srcdev_mode;	/* see #define above */ + +	int is_valid; +	int item_needs_writeback; +	struct btrfs_device *srcdev; +	struct btrfs_device *tgtdev; + +	pid_t lock_owner; +	atomic_t nesting_level; +	struct mutex lock_finishing_cancel_unmount; +	struct mutex lock_management_lock; +	struct mutex lock; + +	struct btrfs_scrub_progress scrub_progress; +}; + +struct btrfs_dev_replace_item { +	/* +	 * grow this item struct at the end for future enhancements and keep +	 * the existing values unchanged +	 */ +	__le64 src_devid; +	__le64 cursor_left; +	__le64 cursor_right; +	__le64 cont_reading_from_srcdev_mode; + +	__le64 replace_state; +	__le64 time_started; +	__le64 time_stopped; +	__le64 num_write_errors; +	__le64 num_uncorrectable_read_errors; +} __attribute__ ((__packed__)); +  /* different types of block groups (and chunks) */  #define BTRFS_BLOCK_GROUP_DATA		(1ULL << 0)  #define BTRFS_BLOCK_GROUP_SYSTEM	(1ULL << 1) @@ -1333,6 +1392,7 @@ struct btrfs_fs_info {  	struct btrfs_workers generic_worker;  	struct btrfs_workers workers;  	struct btrfs_workers delalloc_workers; +	struct btrfs_workers flush_workers;  	struct btrfs_workers endio_workers;  	struct btrfs_workers endio_meta_workers;  	struct btrfs_workers endio_meta_write_workers; @@ -1429,6 +1489,8 @@ struct btrfs_fs_info {  	struct rw_semaphore scrub_super_lock;  	int scrub_workers_refcnt;  	struct btrfs_workers scrub_workers; +	struct btrfs_workers scrub_wr_completion_workers; +	struct btrfs_workers scrub_nocow_workers;  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY  	u32 check_integrity_print_mask; @@ -1470,6 +1532,11 @@ struct btrfs_fs_info {  	int backup_root_index;  	int num_tolerated_disk_barrier_failures; + +	/* device replace state */ +	struct btrfs_dev_replace dev_replace; + +	atomic_t mutually_exclusive_operation_running;  };  /* @@ -1579,7 +1646,7 @@ struct btrfs_root {  	int force_cow; -	spinlock_t root_times_lock; +	spinlock_t root_item_lock;  };  
struct btrfs_ioctl_defrag_range_args { @@ -1723,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {  #define BTRFS_DEV_STATS_KEY	249  /* + * Persistently stores the device replace state in the device tree. + * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). + */ +#define BTRFS_DEV_REPLACE_KEY	250 + +/*   * string items are for debugging.  They just store a short string of   * data in the FS   */ @@ -1787,7 +1860,7 @@ struct btrfs_map_token {  static inline void btrfs_init_map_token (struct btrfs_map_token *token)  { -	memset(token, 0, sizeof(*token)); +	token->kaddr = NULL;  }  /* some macros to generate set/get funcs for the struct fields.  This @@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,  BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,  		   rsv_excl, 64); +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, +		   struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, +		   struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, +		   64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, +		   replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, +		   time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, +		   time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, +		   num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, +		   struct btrfs_dev_replace_item, num_uncorrectable_read_errors, +		   64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, +		   cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, +		   cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, +			 struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, +			 struct btrfs_dev_replace_item, +			 cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, +			 struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, +			 struct btrfs_dev_replace_item, time_started, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, +			 struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, +			 struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, +			 struct btrfs_dev_replace_item, +			 num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, +			 struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, +			 struct btrfs_dev_replace_item, cursor_right, 64); +  static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)  {  	return sb->s_fs_info; @@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,  u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);  u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);  void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + +enum btrfs_reserve_flush_enum { +	/* If we are in the transaction, we can't flush anything.*/ +	BTRFS_RESERVE_NO_FLUSH, +	/* +	 * Flushing delalloc may cause deadlock
somewhere, in this +	 * case, use FLUSH LIMIT +	 */ +	BTRFS_RESERVE_FLUSH_LIMIT, +	BTRFS_RESERVE_FLUSH_ALL, +}; +  int btrfs_check_data_free_space(struct inode *inode, u64 bytes);  void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,  void btrfs_free_block_rsv(struct btrfs_root *root,  			  struct btrfs_block_rsv *rsv);  int btrfs_block_rsv_add(struct btrfs_root *root, -			struct btrfs_block_rsv *block_rsv, -			u64 num_bytes); -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, -				struct btrfs_block_rsv *block_rsv, -				u64 num_bytes); +			struct btrfs_block_rsv *block_rsv, u64 num_bytes, +			enum btrfs_reserve_flush_enum flush);  int btrfs_block_rsv_check(struct btrfs_root *root,  			  struct btrfs_block_rsv *block_rsv, int min_factor);  int btrfs_block_rsv_refill(struct btrfs_root *root, -			  struct btrfs_block_rsv *block_rsv, -			  u64 min_reserved); -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, -				   struct btrfs_block_rsv *block_rsv, -				   u64 min_reserved); +			   struct btrfs_block_rsv *block_rsv, u64 min_reserved, +			   enum btrfs_reserve_flush_enum flush);  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,  			    struct btrfs_block_rsv *dst_rsv,  			    u64 num_bytes); @@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);  int btrfs_init_space_info(struct btrfs_fs_info *fs_info);  int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,  					 struct btrfs_fs_info *fs_info); +int __get_raid_index(u64 flags);  /* ctree.c */  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,  		     int level, int *slot); @@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,  }  int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, +			  struct btrfs_root *root, struct btrfs_path *path, +			  int del);  int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,  			u64 time_seq);  static inline int btrfs_next_old_item(struct btrfs_root *root, @@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root);  /* dir-item.c */ +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, +			  const char *name, int name_len);  int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,  			  struct btrfs_root *root, const char *name,  			  int name_len, struct inode *dir, @@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root,  			     struct btrfs_path *path, u64 objectid,  			     u64 bytenr, int mod); +u64 btrfs_file_extent_length(struct btrfs_path *path);  int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root,  			   struct btrfs_ordered_sum *sums); @@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,  int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,  			     struct list_head *list, int search_commit);  /* inode.c */ +struct btrfs_delalloc_work { +	struct inode *inode; +	int wait; +	int delay_iput; +	struct completion completion; +	struct list_head list; +	struct btrfs_work work; +}; + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode 
*inode, +						    int wait, int delay_iput); +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); +  struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,  					   size_t pg_offset, u64 start, u64 len,  					   int create); @@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list,  				struct btrfs_ioctl_space_info *space);  /* file.c */ +int btrfs_auto_defrag_init(void); +void btrfs_auto_defrag_exit(void);  int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,  			   struct inode *inode);  int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);  int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);  void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  			     int skip_pinned); @@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,  			      struct btrfs_pending_snapshot *pending);  /* scrub.c */ -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, -		    struct btrfs_scrub_progress *progress, int readonly); +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, +		    u64 end, struct btrfs_scrub_progress *progress, +		    int readonly, int is_dev_replace);  void btrfs_scrub_pause(struct btrfs_root *root);  void btrfs_scrub_pause_super(struct btrfs_root *root);  void btrfs_scrub_continue(struct btrfs_root *root);  void btrfs_scrub_continue_super(struct btrfs_root *root); -int __btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel(struct btrfs_root *root); -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, +			   struct btrfs_device *dev);  int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);  int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,  			 struct btrfs_scrub_progress *progress); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 478f66bdc57b..34836036f01b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata(  	 */  	if (!src_rsv || (!trans->bytes_reserved &&  			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { -		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); +		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, +					  BTRFS_RESERVE_NO_FLUSH);  		/*  		 * Since we're under a transaction reserve_metadata_bytes could  		 * try to commit the transaction which will make it return @@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(  		 * reserve something strictly for us.  If not be a pain and try  		 * to steal from the delalloc block rsv.  		 
*/ -		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); +		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, +					  BTRFS_RESERVE_NO_FLUSH);  		if (!ret)  			goto out; @@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)  	struct btrfs_delayed_node *delayed_node = NULL;  	struct btrfs_root *root;  	struct btrfs_block_rsv *block_rsv; -	unsigned long nr = 0;  	int need_requeue = 0;  	int ret; @@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)  					   delayed_node);  	mutex_unlock(&delayed_node->mutex); -	nr = trans->blocks_used; -  	trans->block_rsv = block_rsv;  	btrfs_end_transaction_dmeta(trans, root); -	__btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty_nodelay(root);  free_path:  	btrfs_free_path(path);  out: diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c new file mode 100644 index 000000000000..66dbc8dbddf7 --- /dev/null +++ b/fs/btrfs/dev-replace.c @@ -0,0 +1,856 @@ +/* + * Copyright (C) STRATO AG 2012.  All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/iocontext.h> +#include <linux/capability.h> +#include <linux/kthread.h> +#include <linux/math64.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" +#include "dev-replace.h" + +static u64 btrfs_get_seconds_since_1970(void); +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, +				       int scrub_ret); +static void btrfs_dev_replace_update_device_in_mapping_tree( +						struct btrfs_fs_info *fs_info, +						struct btrfs_device *srcdev, +						struct btrfs_device *tgtdev); +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, +					 char *srcdev_name, +					 struct btrfs_device **device); +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); +static int btrfs_dev_replace_kthread(void *data); +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); + + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_key key; +	struct btrfs_root *dev_root = fs_info->dev_root; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	struct extent_buffer *eb; +	int slot; +	int ret = 0; +	struct btrfs_path *path = NULL; +	int item_size; +	struct btrfs_dev_replace_item *ptr; +	u64 src_devid; + +	path = btrfs_alloc_path(); +	if (!path) { +		ret = -ENOMEM; +		goto out; +	} + +	key.objectid = 0; +	key.type = BTRFS_DEV_REPLACE_KEY; +	key.offset = 0; +	ret = btrfs_search_slot(NULL, dev_root, &key, 
path, 0, 0); +	if (ret) { +no_valid_dev_replace_entry_found: +		ret = 0; +		dev_replace->replace_state = +			BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; +		dev_replace->cont_reading_from_srcdev_mode = +		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; +		dev_replace->replace_state = 0; +		dev_replace->time_started = 0; +		dev_replace->time_stopped = 0; +		atomic64_set(&dev_replace->num_write_errors, 0); +		atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); +		dev_replace->cursor_left = 0; +		dev_replace->committed_cursor_left = 0; +		dev_replace->cursor_left_last_write_of_item = 0; +		dev_replace->cursor_right = 0; +		dev_replace->srcdev = NULL; +		dev_replace->tgtdev = NULL; +		dev_replace->is_valid = 0; +		dev_replace->item_needs_writeback = 0; +		goto out; +	} +	slot = path->slots[0]; +	eb = path->nodes[0]; +	item_size = btrfs_item_size_nr(eb, slot); +	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); + +	if (item_size != sizeof(struct btrfs_dev_replace_item)) { +		pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); +		goto no_valid_dev_replace_entry_found; +	} + +	src_devid = btrfs_dev_replace_src_devid(eb, ptr); +	dev_replace->cont_reading_from_srcdev_mode = +		btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); +	dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); +	dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); +	dev_replace->time_stopped = +		btrfs_dev_replace_time_stopped(eb, ptr); +	atomic64_set(&dev_replace->num_write_errors, +		     btrfs_dev_replace_num_write_errors(eb, ptr)); +	atomic64_set(&dev_replace->num_uncorrectable_read_errors, +		     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); +	dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); +	dev_replace->committed_cursor_left = dev_replace->cursor_left; +	dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; +	dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); +	dev_replace->is_valid = 1; + +	dev_replace->item_needs_writeback = 0; +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		dev_replace->srcdev = NULL; +		dev_replace->tgtdev = NULL; +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, +							NULL, NULL); +		dev_replace->tgtdev = btrfs_find_device(fs_info, +							BTRFS_DEV_REPLACE_DEVID, +							NULL, NULL); +		/* +		 * allow 'btrfs dev replace_cancel' if src/tgt device is +		 * missing +		 */ +		if (!dev_replace->srcdev && +		    !btrfs_test_opt(dev_root, DEGRADED)) { +			ret = -EIO; +			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", +				(unsigned long long)src_devid); +		} +		if (!dev_replace->tgtdev && +		    !btrfs_test_opt(dev_root, DEGRADED)) { +			ret = -EIO; +			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", +				(unsigned long long)BTRFS_DEV_REPLACE_DEVID); +		} +		if (dev_replace->tgtdev) { +			if (dev_replace->srcdev) { +				dev_replace->tgtdev->total_bytes = +					dev_replace->srcdev->total_bytes; +				dev_replace->tgtdev->disk_total_bytes = +					dev_replace->srcdev->disk_total_bytes; +		
		dev_replace->tgtdev->bytes_used = +					dev_replace->srcdev->bytes_used; +			} +			dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; +			btrfs_init_dev_replace_tgtdev_for_resume(fs_info, +				dev_replace->tgtdev); +		} +		break; +	} + +out: +	if (path) +		btrfs_free_path(path); +	return ret; +} + +/* + * called from commit_transaction. Writes changed device replace state to + * disk. + */ +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, +			  struct btrfs_fs_info *fs_info) +{ +	int ret; +	struct btrfs_root *dev_root = fs_info->dev_root; +	struct btrfs_path *path; +	struct btrfs_key key; +	struct extent_buffer *eb; +	struct btrfs_dev_replace_item *ptr; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + +	btrfs_dev_replace_lock(dev_replace); +	if (!dev_replace->is_valid || +	    !dev_replace->item_needs_writeback) { +		btrfs_dev_replace_unlock(dev_replace); +		return 0; +	} +	btrfs_dev_replace_unlock(dev_replace); + +	key.objectid = 0; +	key.type = BTRFS_DEV_REPLACE_KEY; +	key.offset = 0; + +	path = btrfs_alloc_path(); +	if (!path) { +		ret = -ENOMEM; +		goto out; +	} +	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); +	if (ret < 0) { +		pr_warn("btrfs: error %d while searching for dev_replace item!\n", +			ret); +		goto out; +	} + +	if (ret == 0 && +	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { +		/* +		 * need to delete old one and insert a new one. +		 * Since no attempt is made to recover any old state, if the +		 * dev_replace state is 'running', the data on the target +		 * drive is lost. +		 * It would be possible to recover the state: just make sure +		 * that the beginning of the item is never changed and always +		 * contains all the essential information. Then read this +		 * minimal set of information and use it as a base for the +		 * new state. 
+		 */ +		ret = btrfs_del_item(trans, dev_root, path); +		if (ret != 0) { +			pr_warn("btrfs: delete too small dev_replace item failed %d!\n", +				ret); +			goto out; +		} +		ret = 1; +	} + +	if (ret == 1) { +		/* need to insert a new item */ +		btrfs_release_path(path); +		ret = btrfs_insert_empty_item(trans, dev_root, path, +					      &key, sizeof(*ptr)); +		if (ret < 0) { +			pr_warn("btrfs: insert dev_replace item failed %d!\n", +				ret); +			goto out; +		} +	} + +	eb = path->nodes[0]; +	ptr = btrfs_item_ptr(eb, path->slots[0], +			     struct btrfs_dev_replace_item); + +	btrfs_dev_replace_lock(dev_replace); +	if (dev_replace->srcdev) +		btrfs_set_dev_replace_src_devid(eb, ptr, +			dev_replace->srcdev->devid); +	else +		btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); +	btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, +		dev_replace->cont_reading_from_srcdev_mode); +	btrfs_set_dev_replace_replace_state(eb, ptr, +		dev_replace->replace_state); +	btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); +	btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); +	btrfs_set_dev_replace_num_write_errors(eb, ptr, +		atomic64_read(&dev_replace->num_write_errors)); +	btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, +		atomic64_read(&dev_replace->num_uncorrectable_read_errors)); +	dev_replace->cursor_left_last_write_of_item = +		dev_replace->cursor_left; +	btrfs_set_dev_replace_cursor_left(eb, ptr, +		dev_replace->cursor_left_last_write_of_item); +	btrfs_set_dev_replace_cursor_right(eb, ptr, +		dev_replace->cursor_right); +	dev_replace->item_needs_writeback = 0; +	btrfs_dev_replace_unlock(dev_replace); + +	btrfs_mark_buffer_dirty(eb); + +out: +	btrfs_free_path(path); + +	return ret; +} + +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + +	dev_replace->committed_cursor_left = +		dev_replace->cursor_left_last_write_of_item; +} + +static u64 btrfs_get_seconds_since_1970(void) +{ +	struct timespec t = CURRENT_TIME_SEC; + +	return t.tv_sec; +} + +int btrfs_dev_replace_start(struct btrfs_root *root, +			    struct btrfs_ioctl_dev_replace_args *args) +{ +	struct btrfs_trans_handle *trans; +	struct btrfs_fs_info *fs_info = root->fs_info; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	int ret; +	struct btrfs_device *tgt_device = NULL; +	struct btrfs_device *src_device = NULL; + +	switch (args->start.cont_reading_from_srcdev_mode) { +	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: +	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: +		break; +	default: +		return -EINVAL; +	} + +	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || +	    args->start.tgtdev_name[0] == '\0') +		return -EINVAL; + +	mutex_lock(&fs_info->volume_mutex); +	ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, +					    &tgt_device); +	if (ret) { +		pr_err("btrfs: target device %s is invalid!\n", +		       args->start.tgtdev_name); +		mutex_unlock(&fs_info->volume_mutex); +		return -EINVAL; +	} + +	ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, +					    args->start.srcdev_name, +					    &src_device); +	mutex_unlock(&fs_info->volume_mutex); +	if (ret) { +		ret = -EINVAL; +		goto leave_no_lock; +	} + +	if (tgt_device->total_bytes < src_device->total_bytes) { +		pr_err("btrfs: target device is smaller than source device!\n"); +		ret = -EINVAL; +		goto leave_no_lock; +	} + +	
btrfs_dev_replace_lock(dev_replace); +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; +		goto leave; +	} + +	dev_replace->cont_reading_from_srcdev_mode = +		args->start.cont_reading_from_srcdev_mode; +	WARN_ON(!src_device); +	dev_replace->srcdev = src_device; +	WARN_ON(!tgt_device); +	dev_replace->tgtdev = tgt_device; + +	printk_in_rcu(KERN_INFO +		      "btrfs: dev_replace from %s (devid %llu) to %s started\n", +		      src_device->missing ? "<missing disk>" : +		        rcu_str_deref(src_device->name), +		      src_device->devid, +		      rcu_str_deref(tgt_device->name)); + +	tgt_device->total_bytes = src_device->total_bytes; +	tgt_device->disk_total_bytes = src_device->disk_total_bytes; +	tgt_device->bytes_used = src_device->bytes_used; + +	/* +	 * from now on, the writes to the srcdev are all duplicated to +	 * go to the tgtdev as well (refer to btrfs_map_block()). +	 */ +	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; +	dev_replace->time_started = btrfs_get_seconds_since_1970(); +	dev_replace->cursor_left = 0; +	dev_replace->committed_cursor_left = 0; +	dev_replace->cursor_left_last_write_of_item = 0; +	dev_replace->cursor_right = 0; +	dev_replace->is_valid = 1; +	dev_replace->item_needs_writeback = 1; +	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; +	btrfs_dev_replace_unlock(dev_replace); + +	btrfs_wait_ordered_extents(root, 0); + +	/* force writing the updated state information to disk */ +	trans = btrfs_start_transaction(root, 0); +	if (IS_ERR(trans)) { +		ret = PTR_ERR(trans); +		btrfs_dev_replace_lock(dev_replace); +		goto leave; +	} + +	ret = btrfs_commit_transaction(trans, root); +	WARN_ON(ret); + +	/* the disk copy procedure reuses the scrub code */ +	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, +			      src_device->total_bytes, +			      &dev_replace->scrub_progress, 0, 1); + +	ret = btrfs_dev_replace_finishing(root->fs_info, ret); +	WARN_ON(ret); + +	return 0; + +leave: +	dev_replace->srcdev = NULL; +	dev_replace->tgtdev = NULL; +	btrfs_dev_replace_unlock(dev_replace); +leave_no_lock: +	if (tgt_device) +		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); +	return ret; +} + +static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, +				       int scrub_ret) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	struct btrfs_device *tgt_device; +	struct btrfs_device *src_device; +	struct btrfs_root *root = fs_info->tree_root; +	u8 uuid_tmp[BTRFS_UUID_SIZE]; +	struct btrfs_trans_handle *trans; +	int ret = 0; + +	/* don't allow cancel or unmount to disturb the finishing procedure */ +	mutex_lock(&dev_replace->lock_finishing_cancel_unmount); + +	btrfs_dev_replace_lock(dev_replace); +	/* was the operation canceled, or is it finished?
*/ +	if (dev_replace->replace_state != +	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { +		btrfs_dev_replace_unlock(dev_replace); +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +		return 0; +	} + +	tgt_device = dev_replace->tgtdev; +	src_device = dev_replace->srcdev; +	btrfs_dev_replace_unlock(dev_replace); + +	/* replace old device with new one in mapping tree */ +	if (!scrub_ret) +		btrfs_dev_replace_update_device_in_mapping_tree(fs_info, +								src_device, +								tgt_device); + +	/* +	 * flush all outstanding I/O and inode extent mappings before the +	 * copy operation is declared as being finished +	 */ +	btrfs_start_delalloc_inodes(root, 0); +	btrfs_wait_ordered_extents(root, 0); + +	trans = btrfs_start_transaction(root, 0); +	if (IS_ERR(trans)) { +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +		return PTR_ERR(trans); +	} +	ret = btrfs_commit_transaction(trans, root); +	WARN_ON(ret); + +	/* keep away write_all_supers() during the finishing procedure */ +	mutex_lock(&root->fs_info->fs_devices->device_list_mutex); +	btrfs_dev_replace_lock(dev_replace); +	dev_replace->replace_state = +		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED +			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; +	dev_replace->tgtdev = NULL; +	dev_replace->srcdev = NULL; +	dev_replace->time_stopped = btrfs_get_seconds_since_1970(); +	dev_replace->item_needs_writeback = 1; + +	if (scrub_ret) { +		printk_in_rcu(KERN_ERR +			      "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", +			      src_device->missing ? "<missing disk>" : +			        rcu_str_deref(src_device->name), +			      src_device->devid, +			      rcu_str_deref(tgt_device->name), scrub_ret); +		btrfs_dev_replace_unlock(dev_replace); +		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); +		if (tgt_device) +			btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + +		return 0; +	} + +	printk_in_rcu(KERN_INFO +		      "btrfs: dev_replace from %s (devid %llu) to %s finished\n", +		      src_device->missing ? "<missing disk>" : +		        rcu_str_deref(src_device->name), +		      src_device->devid, +		      rcu_str_deref(tgt_device->name)); +	tgt_device->is_tgtdev_for_dev_replace = 0; +	tgt_device->devid = src_device->devid; +	src_device->devid = BTRFS_DEV_REPLACE_DEVID; +	tgt_device->bytes_used = src_device->bytes_used; +	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); +	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); +	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); +	tgt_device->total_bytes = src_device->total_bytes; +	tgt_device->disk_total_bytes = src_device->disk_total_bytes; +	tgt_device->bytes_used = src_device->bytes_used; +	if (fs_info->sb->s_bdev == src_device->bdev) +		fs_info->sb->s_bdev = tgt_device->bdev; +	if (fs_info->fs_devices->latest_bdev == src_device->bdev) +		fs_info->fs_devices->latest_bdev = tgt_device->bdev; +	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + +	btrfs_rm_dev_replace_srcdev(fs_info, src_device); +	if (src_device->bdev) { +		/* zero out the old super */ +		btrfs_scratch_superblock(src_device); +	} +	/* +	 * this is again a consistent state where no dev_replace procedure +	 * is running, the target device is part of the filesystem, the +	 * source device is not part of the filesystem anymore and its 1st +	 * superblock is scratched out so that it is no longer marked to +	 * belong to this filesystem. 
+	 */ +	btrfs_dev_replace_unlock(dev_replace); +	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + +	/* write back the superblocks */ +	trans = btrfs_start_transaction(root, 0); +	if (!IS_ERR(trans)) +		btrfs_commit_transaction(trans, root); + +	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + +	return 0; +} + +static void btrfs_dev_replace_update_device_in_mapping_tree( +						struct btrfs_fs_info *fs_info, +						struct btrfs_device *srcdev, +						struct btrfs_device *tgtdev) +{ +	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; +	struct extent_map *em; +	struct map_lookup *map; +	u64 start = 0; +	int i; + +	write_lock(&em_tree->lock); +	do { +		em = lookup_extent_mapping(em_tree, start, (u64)-1); +		if (!em) +			break; +		map = (struct map_lookup *)em->bdev; +		for (i = 0; i < map->num_stripes; i++) +			if (srcdev == map->stripes[i].dev) +				map->stripes[i].dev = tgtdev; +		start = em->start + em->len; +		free_extent_map(em); +	} while (start); +	write_unlock(&em_tree->lock); +} + +static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, +					 char *srcdev_name, +					 struct btrfs_device **device) +{ +	int ret; + +	if (srcdevid) { +		ret = 0; +		*device = btrfs_find_device(root->fs_info, srcdevid, NULL, +					    NULL); +		if (!*device) +			ret = -ENOENT; +	} else { +		ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, +							   device); +	} +	return ret; +} + +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, +			      struct btrfs_ioctl_dev_replace_args *args) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + +	btrfs_dev_replace_lock(dev_replace); +	/* even if !dev_replace_is_valid, the values are good enough for +	 * the replace_status ioctl */ +	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; +	args->status.replace_state = dev_replace->replace_state; +	args->status.time_started = dev_replace->time_started; +	args->status.time_stopped = dev_replace->time_stopped; +	args->status.num_write_errors = +		atomic64_read(&dev_replace->num_write_errors); +	args->status.num_uncorrectable_read_errors = +		atomic64_read(&dev_replace->num_uncorrectable_read_errors); +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		args->status.progress_1000 = 0; +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +		args->status.progress_1000 = 1000; +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		args->status.progress_1000 = div64_u64(dev_replace->cursor_left, +			div64_u64(dev_replace->srcdev->total_bytes, 1000)); +		break; +	} +	btrfs_dev_replace_unlock(dev_replace); +} + +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, +			     struct btrfs_ioctl_dev_replace_args *args) +{ +	args->result = __btrfs_dev_replace_cancel(fs_info); +	return 0; +} + +static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	struct btrfs_device *tgt_device = NULL; +	struct btrfs_trans_handle *trans; +	struct btrfs_root *root = fs_info->tree_root; +	u64 result; +	int ret; + +	mutex_lock(&dev_replace->lock_finishing_cancel_unmount); +	btrfs_dev_replace_lock(dev_replace); +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		result = 
BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; +		btrfs_dev_replace_unlock(dev_replace); +		goto leave; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; +		tgt_device = dev_replace->tgtdev; +		dev_replace->tgtdev = NULL; +		dev_replace->srcdev = NULL; +		break; +	} +	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; +	dev_replace->time_stopped = btrfs_get_seconds_since_1970(); +	dev_replace->item_needs_writeback = 1; +	btrfs_dev_replace_unlock(dev_replace); +	btrfs_scrub_cancel(fs_info); + +	trans = btrfs_start_transaction(root, 0); +	if (IS_ERR(trans)) { +		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +		return PTR_ERR(trans); +	} +	ret = btrfs_commit_transaction(trans, root); +	WARN_ON(ret); +	if (tgt_device) +		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + +leave: +	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +	return result; +} + +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + +	mutex_lock(&dev_replace->lock_finishing_cancel_unmount); +	btrfs_dev_replace_lock(dev_replace); +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +		dev_replace->replace_state = +			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; +		dev_replace->time_stopped = btrfs_get_seconds_since_1970(); +		dev_replace->item_needs_writeback = 1; +		pr_info("btrfs: suspending dev_replace for unmount\n"); +		break; +	} + +	btrfs_dev_replace_unlock(dev_replace); +	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); +} + +/* resume dev_replace procedure that was interrupted by unmount */ +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) +{ +	struct task_struct *task; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + +	btrfs_dev_replace_lock(dev_replace); +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		btrfs_dev_replace_unlock(dev_replace); +		return 0; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +		break; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		dev_replace->replace_state = +			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; +		break; +	} +	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { +		pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" +			"btrfs: you may cancel the operation after 'mount -o degraded'\n"); +		btrfs_dev_replace_unlock(dev_replace); +		return 0; +	} +	btrfs_dev_replace_unlock(dev_replace); + +	WARN_ON(atomic_xchg( +		&fs_info->mutually_exclusive_operation_running, 1)); +	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); +	return PTR_RET(task); +} + +static int btrfs_dev_replace_kthread(void *data) +{ +	struct btrfs_fs_info *fs_info = data; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	struct btrfs_ioctl_dev_replace_args *status_args; +	u64 progress; + +	status_args = kzalloc(sizeof(*status_args), GFP_NOFS); +	if (status_args) { +		btrfs_dev_replace_status(fs_info, status_args); +		progress = status_args->status.progress_1000; +		kfree(status_args); +		do_div(progress, 10); +		
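/*
 * [Editorial sketch, not part of the patch.] The "@%u%%" figure printed
 * just below comes from btrfs_dev_replace_status()'s per-mille
 * progress_1000 value, which do_div(progress, 10) above reduces to a
 * percentage. A minimal stand-alone restatement of that arithmetic
 * (illustrative helper name, plain C; assumes total_bytes >= 1000 so
 * the divisor is nonzero):
 */
static inline unsigned int replace_progress_percent(unsigned long long cursor_left,
						    unsigned long long total_bytes)
{
	unsigned long long per_mille = cursor_left / (total_bytes / 1000);

	return (unsigned int)(per_mille / 10);	/* e.g. 375 per-mille -> 37% */
}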
printk_in_rcu(KERN_INFO +			      "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", +			      dev_replace->srcdev->missing ? "<missing disk>" : +				rcu_str_deref(dev_replace->srcdev->name), +			      dev_replace->srcdev->devid, +			      dev_replace->tgtdev ? +				rcu_str_deref(dev_replace->tgtdev->name) : +				"<missing target disk>", +			      (unsigned int)progress); +	} +	btrfs_dev_replace_continue_on_mount(fs_info); +	atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + +	return 0; +} + +static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) +{ +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	int ret; + +	ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, +			      dev_replace->committed_cursor_left, +			      dev_replace->srcdev->total_bytes, +			      &dev_replace->scrub_progress, 0, 1); +	ret = btrfs_dev_replace_finishing(fs_info, ret); +	WARN_ON(ret); +	return 0; +} + +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +{ +	if (!dev_replace->is_valid) +		return 0; + +	switch (dev_replace->replace_state) { +	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: +		return 0; +	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: +	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: +		/* +		 * return true even if tgtdev is missing (this can +		 * happen if the dev_replace procedure is suspended +		 * by an umount and then the tgtdev goes missing, or +		 * "btrfs dev scan" was not called and the filesystem +		 * is remounted in degraded state). This does not +		 * stop the dev_replace procedure. It needs to be +		 * canceled manually if cancellation is wanted.
+		 */ +		break; +	} +	return 1; +} + +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace) +{ +	/* the beginning is just an optimization for the typical case */ +	if (atomic_read(&dev_replace->nesting_level) == 0) { +acquire_lock: +		/* this is not a nested case where the same thread +		 * is trying to acquire the same lock twice */ +		mutex_lock(&dev_replace->lock); +		mutex_lock(&dev_replace->lock_management_lock); +		dev_replace->lock_owner = current->pid; +		atomic_inc(&dev_replace->nesting_level); +		mutex_unlock(&dev_replace->lock_management_lock); +		return; +	} + +	mutex_lock(&dev_replace->lock_management_lock); +	if (atomic_read(&dev_replace->nesting_level) > 0 && +	    dev_replace->lock_owner == current->pid) { +		WARN_ON(!mutex_is_locked(&dev_replace->lock)); +		atomic_inc(&dev_replace->nesting_level); +		mutex_unlock(&dev_replace->lock_management_lock); +		return; +	} + +	mutex_unlock(&dev_replace->lock_management_lock); +	goto acquire_lock; +} + +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) +{ +	WARN_ON(!mutex_is_locked(&dev_replace->lock)); +	mutex_lock(&dev_replace->lock_management_lock); +	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1); +	WARN_ON(dev_replace->lock_owner != current->pid); +	atomic_dec(&dev_replace->nesting_level); +	if (atomic_read(&dev_replace->nesting_level) == 0) { +		dev_replace->lock_owner = 0; +		mutex_unlock(&dev_replace->lock_management_lock); +		mutex_unlock(&dev_replace->lock); +	} else { +		mutex_unlock(&dev_replace->lock_management_lock); +	} +} diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h new file mode 100644 index 000000000000..20035cbbf021 --- /dev/null +++ b/fs/btrfs/dev-replace.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) STRATO AG 2012.  All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA.
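/*
 * [Editorial sketch, not part of the patch.] btrfs_dev_replace_lock()
 * and btrfs_dev_replace_unlock() above build a recursive lock out of a
 * plain mutex plus an owner pid and a nesting counter, guarded by a
 * second "management" mutex. A user-space pthreads analogue of the same
 * scheme (illustrative names; the owner id comes from gettid via
 * syscall, since there is no "current" in user space):
 */
#include <pthread.h>
#include <sys/syscall.h>
#include <unistd.h>

struct nesting_lock {
	pthread_mutex_t lock;	/* the lock callers actually contend on */
	pthread_mutex_t mgmt;	/* protects owner and level */
	pid_t owner;
	int level;
};

static void nesting_lock_acquire(struct nesting_lock *nl)
{
	pid_t self = (pid_t)syscall(SYS_gettid);

	pthread_mutex_lock(&nl->mgmt);
	if (nl->level > 0 && nl->owner == self) {
		nl->level++;		/* nested call from the owning thread */
		pthread_mutex_unlock(&nl->mgmt);
		return;
	}
	pthread_mutex_unlock(&nl->mgmt);

	pthread_mutex_lock(&nl->lock);	/* first (non-nested) acquisition */
	pthread_mutex_lock(&nl->mgmt);
	nl->owner = self;
	nl->level = 1;
	pthread_mutex_unlock(&nl->mgmt);
}

static void nesting_lock_release(struct nesting_lock *nl)
{
	pthread_mutex_lock(&nl->mgmt);
	if (--nl->level == 0) {
		nl->owner = 0;
		pthread_mutex_unlock(&nl->mgmt);
		pthread_mutex_unlock(&nl->lock);	/* last unlock drops the real lock */
	} else {
		pthread_mutex_unlock(&nl->mgmt);
	}
}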
+ */ + +#if !defined(__BTRFS_DEV_REPLACE__) +#define __BTRFS_DEV_REPLACE__ + +struct btrfs_ioctl_dev_replace_args; + +int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, +			  struct btrfs_fs_info *fs_info); +void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_start(struct btrfs_root *root, +			    struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, +			      struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, +			     struct btrfs_ioctl_dev_replace_args *args); +void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); +int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace); + +static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) +{ +	atomic64_inc(stat_value); +} +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index c1a074d0696f..502c2158167c 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,  	return btrfs_match_dir_item_name(root, path, name, name_len);  } +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, +				   const char *name, int name_len) +{ +	int ret; +	struct btrfs_key key; +	struct btrfs_dir_item *di; +	int data_size; +	struct extent_buffer *leaf; +	int slot; +	struct btrfs_path *path; + + +	path = btrfs_alloc_path(); +	if (!path) +		return -ENOMEM; + +	key.objectid = dir; +	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); +	key.offset = btrfs_name_hash(name, name_len); + +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + +	/* return back any errors */ +	if (ret < 0) +		goto out; + +	/* nothing found, we're safe */ +	if (ret > 0) { +		ret = 0; +		goto out; +	} + +	/* we found an item, look for our name in the item */ +	di = btrfs_match_dir_item_name(root, path, name, name_len); +	if (di) { +		/* our exact name was found */ +		ret = -EEXIST; +		goto out; +	} + +	/* +	 * see if there is room in the item to insert this +	 * name +	 */ +	data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); +	leaf = path->nodes[0]; +	slot = path->slots[0]; +	if (data_size + btrfs_item_size_nr(leaf, slot) + +	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) { +		ret = -EOVERFLOW; +	} else { +		/* plenty of insertion room */ +		ret = 0; +	} +out: +	btrfs_free_path(path); +	return ret; +} +  /*   * lookup a directory item based on index.  
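/*
 * [Editorial sketch, not part of the patch.] btrfs_check_dir_item_collision()
 * above distinguishes three outcomes: -EEXIST for an exact name match under
 * the same hash, -EOVERFLOW when the leaf item cannot grow to hold another
 * name, and 0 otherwise. A compact user-space restatement of the size check;
 * the constants below are rough illustrative stand-ins, not the real on-disk
 * values:
 */
#include <stdio.h>

#define ITEM_HDR_SIZE   25U	/* stand-in for sizeof(struct btrfs_item) */
#define DIR_ITEM_SIZE   30U	/* stand-in for sizeof(struct btrfs_dir_item) */
#define LEAF_DATA_SIZE  16283U	/* stand-in for BTRFS_LEAF_DATA_SIZE(root) */

static int dir_item_fits(unsigned int existing_item_size, unsigned int name_len)
{
	unsigned int data_size = DIR_ITEM_SIZE + name_len + ITEM_HDR_SIZE;

	/* mirrors the inverted -EOVERFLOW condition above */
	return data_size + existing_item_size + ITEM_HDR_SIZE <= LEAF_DATA_SIZE;
}

int main(void)
{
	printf("fits: %d\n", dir_item_fits(15000, 255));	/* prints "fits: 1" */
	return 0;
}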
'dir' is the objectid   * we're searching in, and 'mod' tells us if you plan on deleting the diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7cda51995c1e..a8f652dc940b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -45,6 +45,7 @@  #include "inode-map.h"  #include "check-integrity.h"  #include "rcu-string.h" +#include "dev-replace.h"  #ifdef CONFIG_X86  #include <asm/cpufeature.h> @@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,  		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))  			break; -		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, +		num_copies = btrfs_num_copies(root->fs_info,  					      eb->start, eb->len);  		if (num_copies == 1)  			break; @@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,  				 int mirror_num, unsigned long bio_flags,  				 u64 bio_offset)  { +	int ret; +  	/*  	 * when we're called for a write, we're already in the async  	 * submission context.  Just jump into btrfs_map_bio  	 */ -	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); +	ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); +	if (ret) +		bio_endio(bio, ret); +	return ret;  }  static int check_async_write(struct inode *inode, unsigned long bio_flags) @@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  	int ret;  	if (!(rw & REQ_WRITE)) { -  		/*  		 * called for a read, do the setup so that checksum validation  		 * can happen in the async kernel threads @@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,  					  bio, 1);  		if (ret) -			return ret; -		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, -				     mirror_num, 0); +			goto out_w_error; +		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, +				    mirror_num, 0);  	} else if (!async) {  		ret = btree_csum_one_bio(bio);  		if (ret) -			return ret; -		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, -				     mirror_num, 0); +			goto out_w_error; +		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, +				    mirror_num, 0); +	} else { +		/* +		 * kthread helpers are used to submit writes so that +		 * checksumming can happen in parallel across all CPUs +		 */ +		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, +					  inode, rw, bio, mirror_num, 0, +					  bio_offset, +					  __btree_submit_bio_start, +					  __btree_submit_bio_done);  	} -	/* -	 * kthread helpers are used to submit writes so that checksumming -	 * can happen in parallel across all CPUs -	 */ -	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, -				   inode, rw, bio, mirror_num, 0, -				   bio_offset, -				   __btree_submit_bio_start, -				   __btree_submit_bio_done); +	if (ret) { +out_w_error: +		bio_endio(bio, ret); +	} +	return ret;  }  #ifdef CONFIG_MIGRATION @@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)  static int btree_set_page_dirty(struct page *page)  { +#ifdef DEBUG  	struct extent_buffer *eb;  	BUG_ON(!PagePrivate(page)); @@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)  	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));  	BUG_ON(!atomic_read(&eb->refs));  	btrfs_assert_tree_locked(eb); +#endif  	return __set_page_dirty_nobuffers(page);  } @@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,  					  
root->fs_info->dirty_metadata_bytes);  			}  			spin_unlock(&root->fs_info->delalloc_lock); -		} -		/* ugh, clear_extent_buffer_dirty needs to lock the page */ -		btrfs_set_lock_blocking(buf); -		clear_extent_buffer_dirty(buf); +			/* ugh, clear_extent_buffer_dirty needs to lock the page */ +			btrfs_set_lock_blocking(buf); +			clear_extent_buffer_dirty(buf); +		}  	}  } @@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,  	root->root_key.objectid = objectid;  	root->anon_dev = 0; -	spin_lock_init(&root->root_times_lock); +	spin_lock_init(&root->root_item_lock);  }  static int __must_check find_and_setup_root(struct btrfs_root *tree_root, @@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb,  	init_rwsem(&fs_info->extent_commit_sem);  	init_rwsem(&fs_info->cleanup_work_sem);  	init_rwsem(&fs_info->subvol_sem); +	fs_info->dev_replace.lock_owner = 0; +	atomic_set(&fs_info->dev_replace.nesting_level, 0); +	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); +	mutex_init(&fs_info->dev_replace.lock_management_lock); +	mutex_init(&fs_info->dev_replace.lock);  	spin_lock_init(&fs_info->qgroup_lock);  	fs_info->qgroup_tree = RB_ROOT; @@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb,  			   fs_info->thread_pool_size,  			   &fs_info->generic_worker); +	btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", +			   fs_info->thread_pool_size, +			   &fs_info->generic_worker); +  	btrfs_init_workers(&fs_info->submit_workers, "submit",  			   min_t(u64, fs_devices->num_devices,  			   fs_info->thread_pool_size), @@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb,  	ret |= btrfs_start_workers(&fs_info->delayed_workers);  	ret |= btrfs_start_workers(&fs_info->caching_workers);  	ret |= btrfs_start_workers(&fs_info->readahead_workers); +	ret |= btrfs_start_workers(&fs_info->flush_workers);  	if (ret) {  		err = -ENOMEM;  		goto fail_sb_buffer; @@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb,  		goto fail_tree_roots;  	} -	btrfs_close_extra_devices(fs_devices); +	/* +	 * keep the device that is marked to be the target device for the +	 * dev_replace procedure +	 */ +	btrfs_close_extra_devices(fs_info, fs_devices, 0);  	if (!fs_devices->latest_bdev) {  		printk(KERN_CRIT "btrfs: failed to read devices on %s\n", @@ -2490,6 +2517,14 @@ retry_root_backup:  		goto fail_block_groups;  	} +	ret = btrfs_init_dev_replace(fs_info); +	if (ret) { +		pr_err("btrfs: failed to init dev_replace: %d\n", ret); +		goto fail_block_groups; +	} + +	btrfs_close_extra_devices(fs_info, fs_devices, 1); +  	ret = btrfs_init_space_info(fs_info);  	if (ret) {  		printk(KERN_ERR "Failed to initial space info: %d\n", ret); @@ -2503,6 +2538,13 @@ retry_root_backup:  	}  	fs_info->num_tolerated_disk_barrier_failures =  		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); +	if (fs_info->fs_devices->missing_devices > +	     fs_info->num_tolerated_disk_barrier_failures && +	    !(sb->s_flags & MS_RDONLY)) { +		printk(KERN_WARNING +		       "Btrfs: too many missing devices, writeable mount is not allowed\n"); +		goto fail_block_groups; +	}  	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,  					       "btrfs-cleaner"); @@ -2631,6 +2673,13 @@ retry_root_backup:  		return ret;  	} +	ret = btrfs_resume_dev_replace_async(fs_info); +	if (ret) { +		pr_warn("btrfs: failed to resume dev_replace\n"); +		close_ctree(tree_root); +		return ret; +	} +  	return 0;  fail_qgroup: @@ -2667,6 +2716,7 @@ fail_sb_buffer:  	
btrfs_stop_workers(&fs_info->submit_workers);  	btrfs_stop_workers(&fs_info->delayed_workers);  	btrfs_stop_workers(&fs_info->caching_workers); +	btrfs_stop_workers(&fs_info->flush_workers);  fail_alloc:  fail_iput:  	btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)  	smp_mb();  	/* pause restriper - we want to resume on mount */ -	btrfs_pause_balance(root->fs_info); +	btrfs_pause_balance(fs_info); + +	btrfs_dev_replace_suspend_for_unmount(fs_info); -	btrfs_scrub_cancel(root); +	btrfs_scrub_cancel(fs_info);  	/* wait for any defraggers to finish */  	wait_event(fs_info->transaction_wait,  		   (atomic_read(&fs_info->defrag_running) == 0));  	/* clear out the rbtree of defraggable inodes */ -	btrfs_run_defrag_inodes(fs_info); +	btrfs_cleanup_defrag_inodes(fs_info);  	if (!(fs_info->sb->s_flags & MS_RDONLY)) {  		ret = btrfs_commit_super(root); @@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)  	btrfs_stop_workers(&fs_info->delayed_workers);  	btrfs_stop_workers(&fs_info->caching_workers);  	btrfs_stop_workers(&fs_info->readahead_workers); +	btrfs_stop_workers(&fs_info->flush_workers);  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY  	if (btrfs_test_opt(root, CHECK_INTEGRITY)) @@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)  	int was_dirty;  	btrfs_assert_tree_locked(buf); -	if (transid != root->fs_info->generation) { -		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " +	if (transid != root->fs_info->generation) +		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "  		       "found %llu running %llu\n",  			(unsigned long long)buf->start,  			(unsigned long long)transid,  			(unsigned long long)root->fs_info->generation); -		WARN_ON(1); -	}  	was_dirty = set_extent_buffer_dirty(buf);  	if (!was_dirty) {  		spin_lock(&root->fs_info->delalloc_lock); @@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)  	}  } -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +static void __btrfs_btree_balance_dirty(struct btrfs_root *root, +					int flush_delayed)  {  	/*  	 * looks as though older kernels can get into trouble with @@ -3411,36 +3463,26 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)  	if (current->flags & PF_MEMALLOC)  		return; -	btrfs_balance_delayed_items(root); +	if (flush_delayed) +		btrfs_balance_delayed_items(root);  	num_dirty = root->fs_info->dirty_metadata_bytes;  	if (num_dirty > thresh) { -		balance_dirty_pages_ratelimited_nr( -				   root->fs_info->btree_inode->i_mapping, 1); +		balance_dirty_pages_ratelimited( +				   root->fs_info->btree_inode->i_mapping);  	}  	return;  } -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +void btrfs_btree_balance_dirty(struct btrfs_root *root)  { -	/* -	 * looks as though older kernels can get into trouble with -	 * this code, they end up stuck in balance_dirty_pages forever -	 */ -	u64 num_dirty; -	unsigned long thresh = 32 * 1024 * 1024; - -	if (current->flags & PF_MEMALLOC) -		return; - -	num_dirty = root->fs_info->dirty_metadata_bytes; +	__btrfs_btree_balance_dirty(root, 1); +} -	if (num_dirty > thresh) { -		balance_dirty_pages_ratelimited_nr( -				   root->fs_info->btree_inode->i_mapping, 1); -	} -	return; +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root) +{ +	__btrfs_btree_balance_dirty(root, 0);  }  int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) diff --git a/fs/btrfs/disk-io.h 
b/fs/btrfs/disk-io.h index 2025a9132c16..305c33efb0e3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,  struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,  					      struct btrfs_key *location);  int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +void btrfs_btree_balance_dirty(struct btrfs_root *root); +void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);  void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);  void btrfs_mark_buffer_dirty(struct extent_buffer *buf);  int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d3e2c17d8d1..521e9d4424f6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,7 @@  #include "volumes.h"  #include "locking.h"  #include "free-space-cache.h" +#include "math.h"  #undef SCRAMBLE_DELAYED_REFS @@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)  	rcu_read_unlock();  } -static u64 div_factor(u64 num, int factor) -{ -	if (factor == 10) -		return num; -	num *= factor; -	do_div(num, 10); -	return num; -} - -static u64 div_factor_fine(u64 num, int factor) -{ -	if (factor == 100) -		return num; -	num *= factor; -	do_div(num, 100); -	return num; -} -  u64 btrfs_find_block_group(struct btrfs_root *root,  			   u64 search_start, u64 search_hint, int owner)  { @@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,  	/* Tell the block device(s) that the sectors can be discarded */ -	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, +	ret = btrfs_map_block(root->fs_info, REQ_DISCARD,  			      bytenr, &num_bytes, &bbio, 0);  	/* Error condition is -ENOMEM */  	if (!ret) { @@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  				kfree(extent_op);  				if (ret) { +					list_del_init(&locked_ref->cluster); +					mutex_unlock(&locked_ref->mutex); +  					printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);  					spin_lock(&delayed_refs->lock);  					return ret; @@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,  		count++;  		if (ret) { +			if (locked_ref) { +				list_del_init(&locked_ref->cluster); +				mutex_unlock(&locked_ref->mutex); +			}  			printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);  			spin_lock(&delayed_refs->lock);  			return ret; @@ -3661,7 +3651,7 @@ out:  static int can_overcommit(struct btrfs_root *root,  			  struct btrfs_space_info *space_info, u64 bytes, -			  int flush) +			  enum btrfs_reserve_flush_enum flush)  {  	u64 profile = btrfs_get_alloc_profile(root, 0);  	u64 avail; @@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root,  		avail >>= 1;  	/* -	 * If we aren't flushing don't let us overcommit too much, say -	 * 1/8th of the space.  If we can flush, let it overcommit up to -	 * 1/2 of the space. +	 * If we aren't flushing all things, let us overcommit up to +	 * 1/2 of the space. If we can flush, don't let us overcommit +	 * too much, let it overcommit up to 1/8 of the space.
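/*
 * [Editorial sketch, not part of the patch.] The factor choice described
 * just above shrinks the overcommit allowance to 1/8 of the remaining raw
 * space when a full flush is allowed, and 1/2 otherwise. A stand-alone
 * restatement of that decision (simplified enum and names; the real values
 * live in ctree.h/extent-tree.c):
 */
#include <stdio.h>

enum reserve_flush { NO_FLUSH, FLUSH_LIMIT, FLUSH_ALL };

static unsigned long long overcommit_allowance(unsigned long long avail,
					       enum reserve_flush flush)
{
	/* mirrors: if (flush == BTRFS_RESERVE_FLUSH_ALL) avail >>= 3; else avail >>= 1; */
	return flush == FLUSH_ALL ? avail >> 3 : avail >> 1;
}

int main(void)
{
	printf("%llu %llu\n",
	       overcommit_allowance(1ULL << 30, FLUSH_ALL),	/* 134217728 (128 MiB) */
	       overcommit_allowance(1ULL << 30, NO_FLUSH));	/* 536870912 (512 MiB) */
	return 0;
}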
*/ -	if (flush) +	if (flush == BTRFS_RESERVE_FLUSH_ALL)  		avail >>= 3;  	else  		avail >>= 1; @@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root,  	return 0;  } +static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, +					       unsigned long nr_pages, +					       enum wb_reason reason) +{ +	if (!writeback_in_progress(sb->s_bdi) && +	    down_read_trylock(&sb->s_umount)) { +		writeback_inodes_sb_nr(sb, nr_pages, reason); +		up_read(&sb->s_umount); +		return 1; +	} + +	return 0; +} +  /*   * shrink metadata reservation for delalloc   */ @@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,  	long time_left;  	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;  	int loops = 0; +	enum btrfs_reserve_flush_enum flush;  	trans = (struct btrfs_trans_handle *)current->journal_info;  	block_rsv = &root->fs_info->delalloc_block_rsv; @@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,  	while (delalloc_bytes && loops < 3) {  		max_reclaim = min(delalloc_bytes, to_reclaim);  		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; -		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, -					       WB_REASON_FS_FREE_SPACE); +		writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, +						    nr_pages, +						    WB_REASON_FS_FREE_SPACE);  		/*  		 * We need to wait for the async pages to actually start before @@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,  		wait_event(root->fs_info->async_submit_wait,  			   !atomic_read(&root->fs_info->async_delalloc_pages)); +		if (!trans) +			flush = BTRFS_RESERVE_FLUSH_ALL; +		else +			flush = BTRFS_RESERVE_NO_FLUSH;  		spin_lock(&space_info->lock); -		if (can_overcommit(root, space_info, orig, !trans)) { +		if (can_overcommit(root, space_info, orig, flush)) {  			spin_unlock(&space_info->lock);  			break;  		} @@ -3888,7 +3898,7 @@ static int flush_space(struct btrfs_root *root,   * @root - the root we're allocating for   * @block_rsv - the block_rsv we're allocating for   * @orig_bytes - the number of bytes we want - * @flush - wether or not we can flush to make our reservation + * @flush - whether or not we can flush to make our reservation   *   * This will reserve orgi_bytes number of bytes from the space info associated   * with the block_rsv.  If there is not enough space it will make an attempt to @@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root,   */  static int reserve_metadata_bytes(struct btrfs_root *root,  				  struct btrfs_block_rsv *block_rsv, -				  u64 orig_bytes, int flush) +				  u64 orig_bytes, +				  enum btrfs_reserve_flush_enum flush)  {  	struct btrfs_space_info *space_info = block_rsv->space_info;  	u64 used; @@ -3912,10 +3923,11 @@ again:  	ret = 0;  	spin_lock(&space_info->lock);  	/* -	 * We only want to wait if somebody other than us is flushing and we are -	 * actually alloed to flush. +	 * We only want to wait if somebody other than us is flushing and we +	 * are actually allowed to flush all things.  	 
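/*
 * [Editorial sketch, not part of the patch.] reserve_metadata_bytes(),
 * shown just below, walks an ordered list of flush states and, for
 * BTRFS_RESERVE_FLUSH_LIMIT callers, must skip the delalloc states so a
 * reservation taken with locks held cannot deadlock against delalloc
 * flushing. A minimal restatement of that state-advance rule (enum values
 * are illustrative; the real ones are defined in extent-tree.c):
 */
enum flush_state {
	FLUSH_DELAYED_ITEMS_NR = 1,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

enum reserve_flush_kind { RESERVE_NO_FLUSH, RESERVE_FLUSH_LIMIT, RESERVE_FLUSH_ALL };

static enum flush_state next_flush_state(enum flush_state cur,
					 enum reserve_flush_kind flush)
{
	enum flush_state next = cur + 1;

	/* FLUSH_LIMIT callers may hold locks that delalloc flushing needs */
	if (flush == RESERVE_FLUSH_LIMIT &&
	    (next == FLUSH_DELALLOC || next == FLUSH_DELALLOC_WAIT))
		next = ALLOC_CHUNK;
	return next;
}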
*/ -	while (flush && !flushing && space_info->flush) { +	while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && +	       space_info->flush) {  		spin_unlock(&space_info->lock);  		/*  		 * If we have a trans handle we can't wait because the flusher @@ -3981,23 +3993,40 @@ again:  	 * Couldn't make our reservation, save our place so while we're trying  	 * to reclaim space we can actually use it instead of somebody else  	 * stealing it from us. +	 * +	 * We make the other tasks wait for the flush only when we can flush +	 * all things.  	 */ -	if (ret && flush) { +	if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {  		flushing = true;  		space_info->flush = 1;  	}  	spin_unlock(&space_info->lock); -	if (!ret || !flush) +	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)  		goto out;  	ret = flush_space(root, space_info, num_bytes, orig_bytes,  			  flush_state);  	flush_state++; + +	/* +	 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock +	 * would happen. So skip delalloc flush. +	 */ +	if (flush == BTRFS_RESERVE_FLUSH_LIMIT && +	    (flush_state == FLUSH_DELALLOC || +	     flush_state == FLUSH_DELALLOC_WAIT)) +		flush_state = ALLOC_CHUNK; +  	if (!ret)  		goto again; -	else if (flush_state <= COMMIT_TRANS) +	else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && +		 flush_state < COMMIT_TRANS) +		goto again; +	else if (flush == BTRFS_RESERVE_FLUSH_ALL && +		 flush_state <= COMMIT_TRANS)  		goto again;  out: @@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,  	kfree(rsv);  } -static inline int __block_rsv_add(struct btrfs_root *root, -				  struct btrfs_block_rsv *block_rsv, -				  u64 num_bytes, int flush) +int btrfs_block_rsv_add(struct btrfs_root *root, +			struct btrfs_block_rsv *block_rsv, u64 num_bytes, +			enum btrfs_reserve_flush_enum flush)  {  	int ret; @@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,  	return ret;  } -int btrfs_block_rsv_add(struct btrfs_root *root, -			struct btrfs_block_rsv *block_rsv, -			u64 num_bytes) -{ -	return __block_rsv_add(root, block_rsv, num_bytes, 1); -} - -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, -				struct btrfs_block_rsv *block_rsv, -				u64 num_bytes) -{ -	return __block_rsv_add(root, block_rsv, num_bytes, 0); -} -  int btrfs_block_rsv_check(struct btrfs_root *root,  			  struct btrfs_block_rsv *block_rsv, int min_factor)  { @@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,  	return ret;  } -static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, -					   struct btrfs_block_rsv *block_rsv, -					   u64 min_reserved, int flush) +int btrfs_block_rsv_refill(struct btrfs_root *root, +			   struct btrfs_block_rsv *block_rsv, u64 min_reserved, +			   enum btrfs_reserve_flush_enum flush)  {  	u64 num_bytes = 0;  	int ret = -ENOSPC; @@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,  	return ret;  } -int btrfs_block_rsv_refill(struct btrfs_root *root, -			   struct btrfs_block_rsv *block_rsv, -			   u64 min_reserved) -{ -	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); -} - -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, -				   struct btrfs_block_rsv *block_rsv, -				   u64 min_reserved) -{ -	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); -} -  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,  			    struct btrfs_block_rsv *dst_rsv,  			    u64 num_bytes) @@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 
num_bytes)  	u64 csum_bytes;  	unsigned nr_extents = 0;  	int extra_reserve = 0; -	int flush = 1; +	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;  	int ret; +	bool delalloc_lock = true; -	/* Need to be holding the i_mutex here if we aren't free space cache */ -	if (btrfs_is_free_space_inode(inode)) -		flush = 0; +	/* If we are a free space inode we need to not flush since we will be in +	 * the middle of a transaction commit.  We also don't need the delalloc +	 * mutex since we won't race with anybody.  We need this mostly to make +	 * lockdep shut its filthy mouth. +	 */ +	if (btrfs_is_free_space_inode(inode)) { +		flush = BTRFS_RESERVE_NO_FLUSH; +		delalloc_lock = false; +	} -	if (flush && btrfs_transaction_in_commit(root->fs_info)) +	if (flush != BTRFS_RESERVE_NO_FLUSH && +	    btrfs_transaction_in_commit(root->fs_info))  		schedule_timeout(1); -	mutex_lock(&BTRFS_I(inode)->delalloc_mutex); +	if (delalloc_lock) +		mutex_lock(&BTRFS_I(inode)->delalloc_mutex); +  	num_bytes = ALIGN(num_bytes, root->sectorsize);  	spin_lock(&BTRFS_I(inode)->lock); @@ -4572,7 +4583,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  		ret = btrfs_qgroup_reserve(root, num_bytes +  					   nr_extents * root->leafsize);  		if (ret) { -			mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); +			spin_lock(&BTRFS_I(inode)->lock); +			calc_csum_metadata_size(inode, num_bytes, 0); +			spin_unlock(&BTRFS_I(inode)->lock); +			if (delalloc_lock) +				mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);  			return ret;  		}  	} @@ -4607,7 +4622,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  						      btrfs_ino(inode),  						      to_free, 0);  		} -		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); +		if (root->fs_info->quota_enabled) { +			btrfs_qgroup_free(root, num_bytes + +						nr_extents * root->leafsize); +		} +		if (delalloc_lock) +			mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);  		return ret;  	} @@ -4619,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)  	}  	BTRFS_I(inode)->reserved_extents += nr_extents;  	spin_unlock(&BTRFS_I(inode)->lock); -	mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + +	if (delalloc_lock) +		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);  	if (to_reserve)  		trace_btrfs_space_reservation(root->fs_info,"delalloc", @@ -4969,9 +4991,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)  {  	struct btrfs_fs_info *fs_info = root->fs_info;  	struct btrfs_block_group_cache *cache = NULL; +	struct btrfs_space_info *space_info; +	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;  	u64 len; +	bool readonly;  	while (start <= end) { +		readonly = false;  		if (!cache ||  		    start >= cache->key.objectid + cache->key.offset) {  			if (cache) @@ -4989,15 +5015,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)  		}  		start += len; +		space_info = cache->space_info; -		spin_lock(&cache->space_info->lock); +		spin_lock(&space_info->lock);  		spin_lock(&cache->lock);  		cache->pinned -= len; -		cache->space_info->bytes_pinned -= len; -		if (cache->ro) -			cache->space_info->bytes_readonly += len; +		space_info->bytes_pinned -= len; +		if (cache->ro) { +			space_info->bytes_readonly += len; +			readonly = true; +		}  		spin_unlock(&cache->lock); -		spin_unlock(&cache->space_info->lock); +		if (!readonly && global_rsv->space_info == space_info) { +			spin_lock(&global_rsv->lock); +			if (!global_rsv->full) { +				len = min(len, 
global_rsv->size - +					  global_rsv->reserved); +				global_rsv->reserved += len; +				space_info->bytes_may_use += len; +				if (global_rsv->reserved >= global_rsv->size) +					global_rsv->full = 1; +			} +			spin_unlock(&global_rsv->lock); +		} +		spin_unlock(&space_info->lock);  	}  	if (cache) @@ -5466,7 +5507,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)  	return 0;  } -static int __get_block_group_index(u64 flags) +int __get_raid_index(u64 flags)  {  	int index; @@ -5486,7 +5527,7 @@ static int __get_block_group_index(u64 flags)  static int get_block_group_index(struct btrfs_block_group_cache *cache)  { -	return __get_block_group_index(cache->flags); +	return __get_raid_index(cache->flags);  }  enum btrfs_loop_type { @@ -6269,7 +6310,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,  	block_rsv = get_block_rsv(trans, root);  	if (block_rsv->size == 0) { -		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); +		ret = reserve_metadata_bytes(root, block_rsv, blocksize, +					     BTRFS_RESERVE_NO_FLUSH);  		/*  		 * If we couldn't reserve metadata bytes try and use some from  		 * the global reserve. @@ -6292,11 +6334,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,  		static DEFINE_RATELIMIT_STATE(_rs,  				DEFAULT_RATELIMIT_INTERVAL,  				/*DEFAULT_RATELIMIT_BURST*/ 2); -		if (__ratelimit(&_rs)) { -			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); -			WARN_ON(1); -		} -		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); +		if (__ratelimit(&_rs)) +			WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", +			     ret); +		ret = reserve_metadata_bytes(root, block_rsv, blocksize, +					     BTRFS_RESERVE_NO_FLUSH);  		if (!ret) {  			return block_rsv;  		} else if (ret && block_rsv != global_rsv) { @@ -7427,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)  	 */  	target = get_restripe_target(root->fs_info, block_group->flags);  	if (target) { -		index = __get_block_group_index(extended_to_chunk(target)); +		index = __get_raid_index(extended_to_chunk(target));  	} else {  		/*  		 * this is just a balance, so if we were marked as full @@ -7461,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)  		 * check to make sure we can actually find a chunk with enough  		 * space to fit our block group in.  		 */ -		if (device->total_bytes > device->bytes_used + min_free) { +		if (device->total_bytes > device->bytes_used + min_free && +		    !device->is_tgtdev_for_dev_replace) {  			ret = find_free_dev_extent(device, min_free,  						   &dev_offset, NULL);  			if (!ret) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 472873a94d96..1b319df29eee 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,  {  	struct rb_node *node; -	if (end < start) { -		printk(KERN_ERR "btrfs end < start %llu %llu\n", +	if (end < start) +		WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",  		       (unsigned long long)end,  		       (unsigned long long)start); -		WARN_ON(1); -	}  	state->start = start;  	state->end = end; @@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)   * the standard behavior is to write all copies in a raid setup. here we only   * want to write the one bad copy. so we do the mapping for ourselves and issue   * submit_bio directly. 
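/*
 * [Editorial sketch, not part of the patch.] The unpin path above tops the
 * global block reserve back up from freed pinned space before returning the
 * remainder to the space_info. A minimal restatement of the refill
 * arithmetic (simplified fields, no locking; the kernel also adds the
 * captured amount to space_info->bytes_may_use):
 */
struct sketch_block_rsv { unsigned long long size, reserved; int full; };

/* returns how many of 'len' freed bytes the reserve captured */
static unsigned long long refill_global_rsv(struct sketch_block_rsv *rsv,
					    unsigned long long len)
{
	if (rsv->full)
		return 0;
	if (len > rsv->size - rsv->reserved)
		len = rsv->size - rsv->reserved;	/* min(len, shortfall) */
	rsv->reserved += len;
	if (rsv->reserved >= rsv->size)
		rsv->full = 1;
	return len;
}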
- * to avoid any synchonization issues, wait for the data after writing, which + * to avoid any synchronization issues, wait for the data after writing, which   * actually prevents the read that triggered the error from finishing.   * currently, there can be no more than two copies of every data bit. thus,   * exactly one rewrite is required.   */ -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,  			u64 length, u64 logical, struct page *page,  			int mirror_num)  { @@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,  	bio->bi_size = 0;  	map_length = length; -	ret = btrfs_map_block(map_tree, WRITE, logical, +	ret = btrfs_map_block(fs_info, WRITE, logical,  			      &map_length, &bbio, mirror_num);  	if (ret) {  		bio_put(bio); @@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,  int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,  			 int mirror_num)  { -	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;  	u64 start = eb->start;  	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);  	int ret = 0;  	for (i = 0; i < num_pages; i++) {  		struct page *p = extent_buffer_page(eb, i); -		ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, +		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,  					start, p, mirror_num);  		if (ret)  			break; @@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)  	u64 private;  	u64 private_failure;  	struct io_failure_record *failrec; -	struct btrfs_mapping_tree *map_tree; +	struct btrfs_fs_info *fs_info;  	struct extent_state *state;  	int num_copies;  	int did_repair = 0; @@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)  	spin_unlock(&BTRFS_I(inode)->io_tree.lock);  	if (state && state->start == failrec->start) { -		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; -		num_copies = btrfs_num_copies(map_tree, failrec->logical, -						failrec->len); +		fs_info = BTRFS_I(inode)->root->fs_info; +		num_copies = btrfs_num_copies(fs_info, failrec->logical, +					      failrec->len);  		if (num_copies > 1)  { -			ret = repair_io_failure(map_tree, start, failrec->len, +			ret = repair_io_failure(fs_info, start, failrec->len,  						failrec->logical, page,  						failrec->failed_mirror);  			did_repair = !ret; @@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,  		 * clean_io_failure() clean all those errors at once.  		 */  	} -	num_copies = btrfs_num_copies( -			      &BTRFS_I(inode)->root->fs_info->mapping_tree, -			      failrec->logical, failrec->len); +	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, +				      failrec->logical, failrec->len);  	if (num_copies == 1) {  		/*  		 * we only have a single copy of the data, so don't bother with @@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,  	return bio;  } -/* - * Since writes are async, they will only return -ENOMEM. - * Reads can return the full range of I/O error conditions. 
- */  static int __must_check submit_one_bio(int rw, struct bio *bio,  				       int mirror_num, unsigned long bio_flags)  { @@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,  	}  	if (start + min_len > eb->len) { -		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " +		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "  		       "wanted %lu %lu\n", (unsigned long long)eb->start,  		       eb->len, start, min_len); -		WARN_ON(1);  		return -EINVAL;  	} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 711d12b80028..2eacfabd3263 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -337,9 +337,9 @@ struct bio *  btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,  		gfp_t gfp_flags); -struct btrfs_mapping_tree; +struct btrfs_fs_info; -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, +int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,  			u64 length, u64 logical, struct page *page,  			int mirror_num);  int end_extent_writepage(struct page *page, int err, u64 start, u64 end); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b8cbc8d5c7f7..f169d6b11d7f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)  struct extent_map *alloc_extent_map(void)  {  	struct extent_map *em; -	em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); +	em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);  	if (!em)  		return NULL;  	em->in_tree = 0; @@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)  			merge = rb_entry(rb, struct extent_map, rb_node);  		if (rb && mergable_maps(merge, em)) {  			em->start = merge->start; +			em->orig_start = merge->orig_start;  			em->len += merge->len;  			em->block_len += merge->block_len;  			em->block_start = merge->block_start;  			merge->in_tree = 0; -			if (merge->generation > em->generation) { -				em->mod_start = em->start; -				em->mod_len = em->len; -				em->generation = merge->generation; -				list_move(&em->list, &tree->modified_extents); -			} +			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; +			em->mod_start = merge->mod_start; +			em->generation = max(em->generation, merge->generation); +			list_move(&em->list, &tree->modified_extents);  			list_del_init(&merge->list);  			rb_erase(&merge->rb_node, &tree->map); @@ -223,23 +222,19 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)  		em->block_len += merge->len;  		rb_erase(&merge->rb_node, &tree->map);  		merge->in_tree = 0; -		if (merge->generation > em->generation) { -			em->mod_len = em->len; -			em->generation = merge->generation; -			list_move(&em->list, &tree->modified_extents); -		} +		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; +		em->generation = max(em->generation, merge->generation);  		list_del_init(&merge->list);  		free_extent_map(merge);  	}  }  /** - * unpint_extent_cache - unpin an extent from the cache + * unpin_extent_cache - unpin an extent from the cache   * @tree:	tree to unpin the extent in   * @start:	logical offset in the file   * @len:	length of the extent   * @gen:	generation that this extent has been modified in - * @prealloc:	if this is set we need to clear the prealloc flag   *   * Called after an extent has been written to disk properly.  
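/*
 * [Editorial sketch, not part of the patch.] The try_merge_map() changes
 * above recompute the merged extent's modified range as the union of the
 * two ranges, instead of inheriting only the newer extent's range. A
 * stand-alone restatement of that interval arithmetic (illustrative
 * struct; 'prev' is the lower-offset extent being absorbed):
 */
struct mod_range { unsigned long long start, len; };

static struct mod_range merge_mod_ranges(struct mod_range prev,
					 struct mod_range cur)
{
	struct mod_range out;

	/* mirrors: em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start */
	out.len = (cur.len + cur.start) - prev.start;	/* end(cur) - start(prev) */
	out.start = prev.start;				/* mirrors: em->mod_start = merge->mod_start */
	return out;
}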
Set the generation   * to the generation that actually added the file item to the inode so we know @@ -266,9 +261,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,  	em->mod_start = em->start;  	em->mod_len = em->len; -	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { +	if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {  		prealloc = true; -		clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); +		clear_bit(EXTENT_FLAG_FILLING, &em->flags);  	}  	try_merge_map(tree, em); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 679225555f7b..922943ce29e8 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -14,6 +14,7 @@  #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */  #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */  #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ +#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */  struct extent_map {  	struct rb_node rb_node; @@ -24,6 +25,7 @@ struct extent_map {  	u64 mod_start;  	u64 mod_len;  	u64 orig_start; +	u64 orig_block_len;  	u64 block_start;  	u64 block_len;  	u64 generation; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 1ad08e4e4a15..bd38cef42358 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -133,7 +133,6 @@ fail:  	return ERR_PTR(ret);  } -  int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,  			     struct btrfs_root *root,  			     struct btrfs_path *path, u64 objectid, @@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,  	return ret;  } +u64 btrfs_file_extent_length(struct btrfs_path *path) +{ +	int extent_type; +	struct btrfs_file_extent_item *fi; +	u64 len; + +	fi = btrfs_item_ptr(path->nodes[0], path->slots[0], +			    struct btrfs_file_extent_item); +	extent_type = btrfs_file_extent_type(path->nodes[0], fi); + +	if (extent_type == BTRFS_FILE_EXTENT_REG || +	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) +		len = btrfs_file_extent_num_bytes(path->nodes[0], fi); +	else if (extent_type == BTRFS_FILE_EXTENT_INLINE) +		len = btrfs_file_extent_inline_len(path->nodes[0], fi); +	else +		BUG(); + +	return len; +}  static int __btrfs_lookup_bio_sums(struct btrfs_root *root,  				   struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9ab1bed88116..77061bf43edb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -41,6 +41,7 @@  #include "compat.h"  #include "volumes.h" +static struct kmem_cache *btrfs_inode_defrag_cachep;  /*   * when auto defrag is enabled we   * queue up these defrag structs to remember which @@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,   * If an existing record is found the defrag item you   * pass in is freed   */ -static void __btrfs_add_inode_defrag(struct inode *inode, +static int __btrfs_add_inode_defrag(struct inode *inode,  				    struct inode_defrag *defrag)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; @@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,  				entry->transid = defrag->transid;  			if (defrag->last_offset > entry->last_offset)  				entry->last_offset = defrag->last_offset; -			goto exists; +			return -EEXIST;  		}  	}  	set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);  	rb_link_node(&defrag->rb_node, parent, p);  	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); -	return; +	return 0; +} -exists: -	kfree(defrag); -	return; +static inline int __need_auto_defrag(struct btrfs_root *root) +{ +	if 
(!btrfs_test_opt(root, AUTO_DEFRAG)) +		return 0; + +	if (btrfs_fs_closing(root->fs_info)) +		return 0; +	return 1;  }  /* @@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct inode_defrag *defrag;  	u64 transid; +	int ret; -	if (!btrfs_test_opt(root, AUTO_DEFRAG)) -		return 0; - -	if (btrfs_fs_closing(root->fs_info)) +	if (!__need_auto_defrag(root))  		return 0;  	if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) @@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,  	else  		transid = BTRFS_I(inode)->root->last_trans; -	defrag = kzalloc(sizeof(*defrag), GFP_NOFS); +	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);  	if (!defrag)  		return -ENOMEM; @@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,  	defrag->root = root->root_key.objectid;  	spin_lock(&root->fs_info->defrag_inodes_lock); -	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) -		__btrfs_add_inode_defrag(inode, defrag); -	else -		kfree(defrag); +	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { +		/* +		 * If we set the IN_DEFRAG flag and evict the inode from memory, +		 * and then re-read this inode, the new inode doesn't have the +		 * IN_DEFRAG flag set. In that case, we may find an existing +		 * defrag record. +		 */ +		ret = __btrfs_add_inode_defrag(inode, defrag); +		if (ret) +			kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +	} else { +		kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +	}  	spin_unlock(&root->fs_info->defrag_inodes_lock);  	return 0;  }  /* - * must be called with the defrag_inodes lock held + * Requeue the defrag object. If there is a defrag object that points to + * the same inode in the tree, we will merge them together (by + * __btrfs_add_inode_defrag()) and free the one that we want to requeue.   */ -struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, -					     u64 root, u64 ino, -					     struct rb_node **next) +void btrfs_requeue_inode_defrag(struct inode *inode, +				struct inode_defrag *defrag) +{ +	struct btrfs_root *root = BTRFS_I(inode)->root; +	int ret; + +	if (!__need_auto_defrag(root)) +		goto out; + +	/* +	 * Here we don't check the IN_DEFRAG flag, because we need to merge +	 * them together. +	 */ +	spin_lock(&root->fs_info->defrag_inodes_lock); +	ret = __btrfs_add_inode_defrag(inode, defrag); +	spin_unlock(&root->fs_info->defrag_inodes_lock); +	if (ret) +		goto out; +	return; +out: +	kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +} + +/* + * pick the defraggable inode that we want; if it doesn't exist, we will get + * the next one. 
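/*
 * [Editorial sketch, not part of the patch.] btrfs_pick_defrag_inode(),
 * defined just below, walks the rb-tree for an exact (root, ino) match and
 * otherwise takes the in-order successor. The same "exact match or next"
 * semantics over a sorted array, as a runnable analogue (illustrative
 * helper, standard lower-bound binary search):
 */
#include <stddef.h>

/* index of the first value >= key in sorted v[0..n), or -1 if none */
static ptrdiff_t pick_eq_or_next(const unsigned long long *v, size_t n,
				 unsigned long long key)
{
	size_t lo = 0, hi = n;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;
		if (v[mid] < key)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo == n ? -1 : (ptrdiff_t)lo;
}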
+ */ +static struct inode_defrag * +btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)  {  	struct inode_defrag *entry = NULL;  	struct inode_defrag tmp; @@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,  	tmp.ino = ino;  	tmp.root = root; -	p = info->defrag_inodes.rb_node; +	spin_lock(&fs_info->defrag_inodes_lock); +	p = fs_info->defrag_inodes.rb_node;  	while (p) {  		parent = p;  		entry = rb_entry(parent, struct inode_defrag, rb_node); @@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,  		else if (ret > 0)  			p = parent->rb_right;  		else -			return entry; +			goto out;  	} -	if (next) { -		while (parent && __compare_inode_defrag(&tmp, entry) > 0) { -			parent = rb_next(parent); +	if (parent && __compare_inode_defrag(&tmp, entry) > 0) { +		parent = rb_next(parent); +		if (parent)  			entry = rb_entry(parent, struct inode_defrag, rb_node); -		} -		*next = parent; +		else +			entry = NULL;  	} -	return NULL; +out: +	if (entry) +		rb_erase(parent, &fs_info->defrag_inodes); +	spin_unlock(&fs_info->defrag_inodes_lock); +	return entry;  } -/* - * run through the list of inodes in the FS that need - * defragging - */ -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)  {  	struct inode_defrag *defrag; +	struct rb_node *node; + +	spin_lock(&fs_info->defrag_inodes_lock); +	node = rb_first(&fs_info->defrag_inodes); +	while (node) { +		rb_erase(node, &fs_info->defrag_inodes); +		defrag = rb_entry(node, struct inode_defrag, rb_node); +		kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + +		if (need_resched()) { +			spin_unlock(&fs_info->defrag_inodes_lock); +			cond_resched(); +			spin_lock(&fs_info->defrag_inodes_lock); +		} + +		node = rb_first(&fs_info->defrag_inodes); +	} +	spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH	1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, +				    struct inode_defrag *defrag) +{  	struct btrfs_root *inode_root;  	struct inode *inode; -	struct rb_node *n;  	struct btrfs_key key;  	struct btrfs_ioctl_defrag_range_args range; -	u64 first_ino = 0; -	u64 root_objectid = 0;  	int num_defrag; -	int defrag_batch = 1024; +	/* get the inode */ +	key.objectid = defrag->root; +	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); +	key.offset = (u64)-1; +	inode_root = btrfs_read_fs_root_no_name(fs_info, &key); +	if (IS_ERR(inode_root)) { +		kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +		return PTR_ERR(inode_root); +	} + +	key.objectid = defrag->ino; +	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); +	key.offset = 0; +	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); +	if (IS_ERR(inode)) { +		kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +		return PTR_ERR(inode); +	} + +	/* do a chunk of defrag */ +	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);  	memset(&range, 0, sizeof(range));  	range.len = (u64)-1; +	range.start = defrag->last_offset; + +	sb_start_write(fs_info->sb); +	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, +				       BTRFS_DEFRAG_BATCH); +	sb_end_write(fs_info->sb); +	/* +	 * if we filled the whole defrag batch, there +	 * must be more work to do.  
Queue this defrag +	 * again +	 */ +	if (num_defrag == BTRFS_DEFRAG_BATCH) { +		defrag->last_offset = range.start; +		btrfs_requeue_inode_defrag(inode, defrag); +	} else if (defrag->last_offset && !defrag->cycled) { +		/* +		 * we didn't fill our defrag batch, but +		 * we didn't start at zero.  Make sure we loop +		 * around to the start of the file. +		 */ +		defrag->last_offset = 0; +		defrag->cycled = 1; +		btrfs_requeue_inode_defrag(inode, defrag); +	} else { +		kmem_cache_free(btrfs_inode_defrag_cachep, defrag); +	} + +	iput(inode); +	return 0; +} + +/* + * run through the list of inodes in the FS that need + * defragging + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ +	struct inode_defrag *defrag; +	u64 first_ino = 0; +	u64 root_objectid = 0;  	atomic_inc(&fs_info->defrag_running); -	spin_lock(&fs_info->defrag_inodes_lock);  	while(1) { -		n = NULL; +		if (!__need_auto_defrag(fs_info->tree_root)) +			break;  		/* find an inode to defrag */ -		defrag = btrfs_find_defrag_inode(fs_info, root_objectid, -						 first_ino, &n); +		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, +						 first_ino);  		if (!defrag) { -			if (n) { -				defrag = rb_entry(n, struct inode_defrag, -						  rb_node); -			} else if (root_objectid || first_ino) { +			if (root_objectid || first_ino) {  				root_objectid = 0;  				first_ino = 0;  				continue; @@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)  			}  		} -		/* remove it from the rbtree */  		first_ino = defrag->ino + 1;  		root_objectid = defrag->root; -		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); - -		if (btrfs_fs_closing(fs_info)) -			goto next_free; - -		spin_unlock(&fs_info->defrag_inodes_lock); - -		/* get the inode */ -		key.objectid = defrag->root; -		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); -		key.offset = (u64)-1; -		inode_root = btrfs_read_fs_root_no_name(fs_info, &key); -		if (IS_ERR(inode_root)) -			goto next; - -		key.objectid = defrag->ino; -		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); -		key.offset = 0; - -		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); -		if (IS_ERR(inode)) -			goto next; -		/* do a chunk of defrag */ -		clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); -		range.start = defrag->last_offset; -		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, -					       defrag_batch); -		/* -		 * if we filled the whole defrag batch, there -		 * must be more work to do.  Queue this defrag -		 * again -		 */ -		if (num_defrag == defrag_batch) { -			defrag->last_offset = range.start; -			__btrfs_add_inode_defrag(inode, defrag); -			/* -			 * we don't want to kfree defrag, we added it back to -			 * the rbtree -			 */ -			defrag = NULL; -		} else if (defrag->last_offset && !defrag->cycled) { -			/* -			 * we didn't fill our defrag batch, but -			 * we didn't start at zero.  Make sure we loop -			 * around to the start of the file. 
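/*
 * [Editorial sketch, not part of the patch.] The requeue policy above has
 * three outcomes per pass: a full batch means there is more work (requeue
 * at the new offset), a partial batch that didn't start at offset zero
 * wraps around to the file start exactly once (cycled), and anything else
 * retires the defrag record. A compact restatement (illustrative types):
 */
enum defrag_next { DEFRAG_REQUEUE, DEFRAG_CYCLE, DEFRAG_DONE };

static enum defrag_next defrag_next_step(int num_defrag, int batch,
					 unsigned long long last_offset,
					 int cycled)
{
	if (num_defrag == batch)
		return DEFRAG_REQUEUE;	/* filled the batch: keep going */
	if (last_offset && !cycled)
		return DEFRAG_CYCLE;	/* wrap to file start exactly once */
	return DEFRAG_DONE;		/* free the record */
}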
-			 */ -			defrag->last_offset = 0; -			defrag->cycled = 1; -			__btrfs_add_inode_defrag(inode, defrag); -			defrag = NULL; -		} - -		iput(inode); -next: -		spin_lock(&fs_info->defrag_inodes_lock); -next_free: -		kfree(defrag); +		__btrfs_run_defrag_inode(fs_info, defrag);  	} -	spin_unlock(&fs_info->defrag_inodes_lock); -  	atomic_dec(&fs_info->defrag_running);  	/* @@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  				split->block_len = em->block_len;  			else  				split->block_len = split->len; +			split->orig_block_len = max(split->block_len, +						    em->orig_block_len);  			split->generation = gen;  			split->bdev = em->bdev;  			split->flags = flags; @@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  			split->flags = flags;  			split->compress_type = em->compress_type;  			split->generation = gen; +			split->orig_block_len = max(em->block_len, +						    em->orig_block_len);  			if (compressed) {  				split->block_len = em->block_len; @@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,  			} else {  				split->block_len = split->len;  				split->block_start = em->block_start + diff; -				split->orig_start = split->start; +				split->orig_start = em->orig_start;  			}  			ret = add_extent_mapping(em_tree, split); @@ -1346,10 +1412,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,  		cond_resched(); -		balance_dirty_pages_ratelimited_nr(inode->i_mapping, -						   dirty_pages); +		balance_dirty_pages_ratelimited(inode->i_mapping);  		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) -			btrfs_btree_balance_dirty(root, 1); +			btrfs_btree_balance_dirty(root);  		pos += copied;  		num_written += copied; @@ -1398,6 +1463,24 @@ out:  	return written ? written : err;  } +static void update_time_for_write(struct inode *inode) +{ +	struct timespec now; + +	if (IS_NOCMTIME(inode)) +		return; + +	now = current_fs_time(inode->i_sb); +	if (!timespec_equal(&inode->i_mtime, &now)) +		inode->i_mtime = now; + +	if (!timespec_equal(&inode->i_ctime, &now)) +		inode->i_ctime = now; + +	if (IS_I_VERSION(inode)) +		inode_inc_iversion(inode); +} +  static ssize_t btrfs_file_aio_write(struct kiocb *iocb,  				    const struct iovec *iov,  				    unsigned long nr_segs, loff_t pos) @@ -1410,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,  	ssize_t num_written = 0;  	ssize_t err = 0;  	size_t count, ocount; +	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);  	sb_start_write(inode->i_sb); @@ -1452,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,  		goto out;  	} -	err = file_update_time(file); -	if (err) { -		mutex_unlock(&inode->i_mutex); -		goto out; -	} +	/* +	 * We reserve space for updating the inode when we reserve space for the +	 * extent we are going to write, so we will enospc out there.  We don't +	 * need to start yet another transaction to update the inode as we will +	 * update the inode when we finish writing whatever data we write. 
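update_time_for_write(), introduced above, dirties the timestamps only when they actually changed and bumps i_version so readers can observe the write. The same comparison pattern in plain C (mini_inode and ts_equal() are local models, not kernel types):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct mini_inode {
	struct timespec mtime;
	struct timespec ctime;
	unsigned long long i_version;
};

static bool ts_equal(const struct timespec *a, const struct timespec *b)
{
	return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

/* mirror of the helper: touch mtime/ctime only when they differ from
 * "now", then bump the change counter so readers notice the write */
static void mini_update_time_for_write(struct mini_inode *inode)
{
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	if (!ts_equal(&inode->mtime, &now))
		inode->mtime = now;
	if (!ts_equal(&inode->ctime, &now))
		inode->ctime = now;
	inode->i_version++;
}

int main(void)
{
	struct mini_inode ino = { {0, 0}, {0, 0}, 0 };

	mini_update_time_for_write(&ino);
	printf("i_version now %llu\n", ino.i_version);
	return 0;
}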
+	 */ +	update_time_for_write(inode);  	start_pos = round_down(pos, root->sectorsize);  	if (start_pos > i_size_read(inode)) { @@ -1467,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,  		}  	} +	if (sync) +		atomic_inc(&BTRFS_I(inode)->sync_writers); +  	if (unlikely(file->f_flags & O_DIRECT)) {  		num_written = __btrfs_direct_write(iocb, iov, nr_segs,  						   pos, ppos, count, ocount); @@ -1493,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,  	 * this will either be one more than the running transaction  	 * or the generation used for the next transaction if there isn't  	 * one running right now. +	 * +	 * We also have to set last_sub_trans to the current log transid, +	 * otherwise subsequent syncs to a file that's been synced in this +	 * transaction will appear to have already occurred.  	 */  	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; +	BTRFS_I(inode)->last_sub_trans = root->log_transid;  	if (num_written > 0 || num_written == -EIOCBQUEUED) {  		err = generic_write_sync(file, pos, num_written);  		if (err < 0 && num_written > 0)  			num_written = err;  	}  out: +	if (sync) +		atomic_dec(&BTRFS_I(inode)->sync_writers);  	sb_end_write(inode->i_sb);  	current->backing_dev_info = NULL;  	return num_written ? num_written : err; @@ -1551,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	 * out of the ->i_mutex. If so, we can flush the dirty pages by  	 * multiple tasks, and improve the performance.  	 */ +	atomic_inc(&BTRFS_I(inode)->sync_writers);  	ret = filemap_write_and_wait_range(inode->i_mapping, start, end); +	atomic_dec(&BTRFS_I(inode)->sync_writers);  	if (ret)  		return ret; @@ -1562,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	 * range being left.  	 */  	atomic_inc(&root->log_batch); -	btrfs_wait_ordered_range(inode, start, end); +	btrfs_wait_ordered_range(inode, start, end - start + 1);  	atomic_inc(&root->log_batch);  	/* @@ -1768,6 +1866,7 @@ out:  		hole_em->block_start = EXTENT_MAP_HOLE;  		hole_em->block_len = 0; +		hole_em->orig_block_len = 0;  		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;  		hole_em->compress_type = BTRFS_COMPRESS_NONE;  		hole_em->generation = trans->transid; @@ -1797,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)  	struct btrfs_path *path;  	struct btrfs_block_rsv *rsv;  	struct btrfs_trans_handle *trans; -	u64 mask = BTRFS_I(inode)->root->sectorsize - 1; -	u64 lockstart = (offset + mask) & ~mask; -	u64 lockend = ((offset + len) & ~mask) - 1; +	u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); +	u64 lockend = round_down(offset + len, +				 BTRFS_I(inode)->root->sectorsize) - 1;  	u64 cur_offset = lockstart;  	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);  	u64 drop_end; -	unsigned long nr;  	int ret = 0;  	int err = 0; -	bool same_page = (offset >> PAGE_CACHE_SHIFT) == -		((offset + len) >> PAGE_CACHE_SHIFT); +	bool same_page = ((offset >> PAGE_CACHE_SHIFT) == +			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));  	btrfs_wait_ordered_range(inode, offset, len);  	mutex_lock(&inode->i_mutex); -	if (offset >= inode->i_size) { -		mutex_unlock(&inode->i_mutex); -		return 0; -	} - +	/* +	 * We needn't truncate any page which is beyond the end of the file +	 * because we are sure there is no data there. +	 */ +	/*  	 * Only do this if we are in the same page and we aren't doing the  	 * entire page.  	 */
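The sync_writers counter bracketing the write and fsync paths above lets the bio submission code decide whether checksumming can be pushed to a worker thread. A C11 model of that decision; a single global counter stands in for the per-inode field:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int sync_writers;	/* lives in struct btrfs_inode in the patch */

static void sync_write_begin(void) { atomic_fetch_add(&sync_writers, 1); }
static void sync_write_end(void)   { atomic_fetch_sub(&sync_writers, 1); }

/* submit path: checksum inline when a sync writer is waiting, since
 * bouncing the work to a worker thread only adds latency for it */
static bool offload_checksum(void)
{
	return atomic_load(&sync_writers) == 0;
}

int main(void)
{
	printf("offload: %d\n", offload_checksum());	/* 1: no sync writers */
	sync_write_begin();
	printf("offload: %d\n", offload_checksum());	/* 0: checksum inline */
	sync_write_end();
	return 0;
}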
if (same_page && len < PAGE_CACHE_SIZE) { -		ret = btrfs_truncate_page(inode, offset, len, 0); +		if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) +			ret = btrfs_truncate_page(inode, offset, len, 0);  		mutex_unlock(&inode->i_mutex);  		return ret;  	}  	/* zero back part of the first page */ -	ret = btrfs_truncate_page(inode, offset, 0, 0); -	if (ret) { -		mutex_unlock(&inode->i_mutex); -		return ret; +	if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { +		ret = btrfs_truncate_page(inode, offset, 0, 0); +		if (ret) { +			mutex_unlock(&inode->i_mutex); +			return ret; +		}  	}  	/* zero the front end of the last page */ -	ret = btrfs_truncate_page(inode, offset + len, 0, 1); -	if (ret) { -		mutex_unlock(&inode->i_mutex); -		return ret; +	if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { +		ret = btrfs_truncate_page(inode, offset + len, 0, 1); +		if (ret) { +			mutex_unlock(&inode->i_mutex); +			return ret; +		}  	}  	if (lockend < lockstart) { @@ -1931,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)  			break;  		} -		nr = trans->blocks_used;  		btrfs_end_transaction(trans, root); -		btrfs_btree_balance_dirty(root, nr); +		btrfs_btree_balance_dirty(root);  		trans = btrfs_start_transaction(root, 3);  		if (IS_ERR(trans)) { @@ -1964,11 +2065,13 @@ out_trans:  	if (!trans)  		goto out_free; +	inode_inc_iversion(inode); +	inode->i_mtime = inode->i_ctime = CURRENT_TIME; +  	trans->block_rsv = &root->fs_info->trans_block_rsv;  	ret = btrfs_update_inode(trans, root, inode); -	nr = trans->blocks_used;  	btrfs_end_transaction(trans, root); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  out_free:  	btrfs_free_path(path);  	btrfs_free_block_rsv(root, rsv); @@ -1992,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode,  	u64 alloc_end;  	u64 alloc_hint = 0;  	u64 locked_end; -	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;  	struct extent_map *em; +	int blocksize = BTRFS_I(inode)->root->sectorsize;  	int ret; -	alloc_start = offset & ~mask; -	alloc_end =  (offset + len + mask) & ~mask; +	alloc_start = round_down(offset, blocksize); +	alloc_end = round_up(offset + len, blocksize);  	/* Make sure we aren't being given some crap mode */  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2010,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode,  	 * Make sure we have enough space before we do the  	 * allocation.  	 */ -	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1); +	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);  	if (ret)  		return ret; @@ -2078,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,  		}  		last_byte = min(extent_map_end(em), alloc_end);  		actual_end = min_t(u64, extent_map_end(em), offset + len); -		last_byte = (last_byte + mask) & ~mask; +		last_byte = ALIGN(last_byte, blocksize);  		if (em->block_start == EXTENT_MAP_HOLE ||  		    (cur_offset >= inode->i_size && @@ -2117,11 +2220,11 @@ static long btrfs_fallocate(struct file *file, int mode,  out:  	mutex_unlock(&inode->i_mutex);  	/* Let go of our reservation. 
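Both the hole-punching and fallocate paths now express their alignment with round_up()/round_down() instead of hand-rolled mask arithmetic. The effect on a sample range, assuming a 4096-byte sector (illustrative numbers):

#include <stdint.h>
#include <stdio.h>

/* power-of-two rounding, equivalent to the kernel's round_up/round_down */
#define ROUND_UP(x, a)   (((x) + (a) - 1) & ~((uint64_t)(a) - 1))
#define ROUND_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t sectorsize = 4096;
	uint64_t offset = 5000, len = 20000;

	/* only whole sectors inside [offset, offset + len) can become a hole;
	 * the partial sectors at either edge are zeroed in their pages instead */
	uint64_t lockstart = ROUND_UP(offset, sectorsize);           /* 8192 */
	uint64_t lockend = ROUND_DOWN(offset + len, sectorsize) - 1; /* 24575 */

	printf("hole range: %llu..%llu\n",
	       (unsigned long long)lockstart, (unsigned long long)lockend);
	return 0;
}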
*/ -	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1); +	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);  	return ret;  } -static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) +static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct extent_map *em; @@ -2155,7 +2258,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)  	 * before the position we want in case there is outstanding delalloc  	 * going on here.  	 */ -	if (origin == SEEK_HOLE && start != 0) { +	if (whence == SEEK_HOLE && start != 0) {  		if (start <= root->sectorsize)  			em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,  						     root->sectorsize, 0); @@ -2189,13 +2292,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)  				}  			} -			if (origin == SEEK_HOLE) { +			if (whence == SEEK_HOLE) {  				*offset = start;  				free_extent_map(em);  				break;  			}  		} else { -			if (origin == SEEK_DATA) { +			if (whence == SEEK_DATA) {  				if (em->block_start == EXTENT_MAP_DELALLOC) {  					if (start >= inode->i_size) {  						free_extent_map(em); @@ -2232,16 +2335,16 @@ out:  	return ret;  } -static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) +static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)  {  	struct inode *inode = file->f_mapping->host;  	int ret;  	mutex_lock(&inode->i_mutex); -	switch (origin) { +	switch (whence) {  	case SEEK_END:  	case SEEK_CUR: -		offset = generic_file_llseek(file, offset, origin); +		offset = generic_file_llseek(file, offset, whence);  		goto out;  	case SEEK_DATA:  	case SEEK_HOLE: @@ -2250,7 +2353,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)  			return -ENXIO;  		} -		ret = find_desired_extent(inode, &offset, origin); +		ret = find_desired_extent(inode, &offset, whence);  		if (ret) {  			mutex_unlock(&inode->i_mutex);  			return ret; @@ -2293,3 +2396,21 @@ const struct file_operations btrfs_file_operations = {  	.compat_ioctl	= btrfs_ioctl,  #endif  }; + +void btrfs_auto_defrag_exit(void) +{ +	if (btrfs_inode_defrag_cachep) +		kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int btrfs_auto_defrag_init(void) +{ +	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", +					sizeof(struct inode_defrag), 0, +					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, +					NULL); +	if (!btrfs_inode_defrag_cachep) +		return -ENOMEM; + +	return 0; +} diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1027b854b90c..59ea2e4349c9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)  static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)  { -	WARN_ON(io_ctl->cur);  	BUG_ON(io_ctl->index >= io_ctl->num_pages);  	io_ctl->page = io_ctl->pages[io_ctl->index++];  	io_ctl->cur = kmap(io_ctl->page); @@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,  			 * if previous extent entry covers the offset,  			 * we should return it instead of the bitmap entry  			 */ -			n = &entry->offset_index; -			while (1) { -				n = rb_prev(n); -				if (!n) -					break; +			n = rb_prev(&entry->offset_index); +			if (n) {  				prev = rb_entry(n, struct btrfs_free_space,  						offset_index); -				if (!prev->bitmap) { -					if (prev->offset + prev->bytes > offset) -						entry = 
prev; -					break; -				} +				if (!prev->bitmap && +				    prev->offset + prev->bytes > offset) +					entry = prev;  			}  		}  		return entry; @@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,  	}  	if (entry->bitmap) { -		n = &entry->offset_index; -		while (1) { -			n = rb_prev(n); -			if (!n) -				break; +		n = rb_prev(&entry->offset_index); +		if (n) {  			prev = rb_entry(n, struct btrfs_free_space,  					offset_index); -			if (!prev->bitmap) { -				if (prev->offset + prev->bytes > offset) -					return prev; -				break; -			} +			if (!prev->bitmap && +			    prev->offset + prev->bytes > offset) +				return prev;  		}  		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)  			return entry; @@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)  	u64 bitmap_bytes;  	u64 extent_bytes;  	u64 size = block_group->key.offset; -	u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; +	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;  	int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);  	BUG_ON(ctl->total_bitmaps > max_bitmaps); @@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,  	 * some block groups are so tiny they can't be enveloped by a bitmap, so  	 * don't even bother to create a bitmap for this  	 */ -	if (BITS_PER_BITMAP * block_group->sectorsize > -	    block_group->key.offset) +	if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)  		return false;  	return true; @@ -2298,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,  	unsigned long total_found = 0;  	int ret; -	i = offset_to_bit(entry->offset, block_group->sectorsize, +	i = offset_to_bit(entry->offset, ctl->unit,  			  max_t(u64, offset, entry->offset)); -	want_bits = bytes_to_bits(bytes, block_group->sectorsize); -	min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); +	want_bits = bytes_to_bits(bytes, ctl->unit); +	min_bits = bytes_to_bits(min_bytes, ctl->unit);  again:  	found_bits = 0; @@ -2325,23 +2313,22 @@ again:  	total_found += found_bits; -	if (cluster->max_size < found_bits * block_group->sectorsize) -		cluster->max_size = found_bits * block_group->sectorsize; +	if (cluster->max_size < found_bits * ctl->unit) +		cluster->max_size = found_bits * ctl->unit;  	if (total_found < want_bits || cluster->max_size < cont1_bytes) {  		i = next_zero + 1;  		goto again;  	} -	cluster->window_start = start * block_group->sectorsize + -		entry->offset; +	cluster->window_start = start * ctl->unit + entry->offset;  	rb_erase(&entry->offset_index, &ctl->free_space_offset);  	ret = tree_insert_offset(&cluster->root, entry->offset,  				 &entry->offset_index, 1);  	BUG_ON(ret); /* -EEXIST; Logic error */  	trace_btrfs_setup_cluster(block_group, cluster, -				  total_found * block_group->sectorsize, 1); +				  total_found * ctl->unit, 1);  	return 0;  } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b1a1c929ba80..d26f67a59e36 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,  	 * 3 items for pre-allocation  	 */  	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); -	ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, -					  trans->bytes_reserved); +	ret = btrfs_block_rsv_add(root, trans->block_rsv, +				  trans->bytes_reserved, +				  BTRFS_RESERVE_NO_FLUSH);  	if (ret)  		goto out;  	trace_btrfs_space_reservation(root->fs_info, "ino_cache", diff 
--git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 95542a1b3dfc..67ed24ae86bb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;  static struct extent_io_ops btrfs_extent_io_ops;  static struct kmem_cache *btrfs_inode_cachep; +static struct kmem_cache *btrfs_delalloc_work_cachep;  struct kmem_cache *btrfs_trans_handle_cachep;  struct kmem_cache *btrfs_transaction_cachep;  struct kmem_cache *btrfs_path_cachep; @@ -94,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode,  				   struct page *locked_page,  				   u64 start, u64 end, int *page_started,  				   unsigned long *nr_written, int unlock); +static struct extent_map *create_pinned_em(struct inode *inode, u64 start, +					   u64 len, u64 orig_start, +					   u64 block_start, u64 block_len, +					   u64 orig_block_len, int type);  static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,  				     struct inode *inode,  struct inode *dir, @@ -698,14 +703,19 @@ retry:  		em->block_start = ins.objectid;  		em->block_len = ins.offset; +		em->orig_block_len = ins.offset;  		em->bdev = root->fs_info->fs_devices->latest_bdev;  		em->compress_type = async_extent->compress_type;  		set_bit(EXTENT_FLAG_PINNED, &em->flags);  		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); +		em->generation = -1;  		while (1) {  			write_lock(&em_tree->lock);  			ret = add_extent_mapping(em_tree, em); +			if (!ret) +				list_move(&em->list, +					  &em_tree->modified_extents);  			write_unlock(&em_tree->lock);  			if (ret != -EEXIST) {  				free_extent_map(em); @@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,   * required to start IO on it.  It may be clean and already done with   * IO when we return.   
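Each successful add_extent_mapping() above is now paired with a list_move() onto modified_extents while still holding the tree's write lock, which is what lets the tree-log find extents changed in the current transaction. The locking shape, modeled with a pthread rwlock and a plain list (a sketch, not the kernel's extent_map_tree):

#include <pthread.h>
#include <stdio.h>

struct node {
	struct node *next;	/* modified-list linkage */
	int key;
};

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct node *modified_head;

/* stand-in for add_extent_mapping(); pretend insertion always succeeds */
static int tree_insert(struct node *n)
{
	(void)n;
	return 0;
}

static int insert_and_track(struct node *n)
{
	int ret;

	pthread_rwlock_wrlock(&tree_lock);
	ret = tree_insert(n);
	if (ret == 0) {
		/* still under the write lock: queue it for the next commit */
		n->next = modified_head;
		modified_head = n;
	}
	pthread_rwlock_unlock(&tree_lock);
	return ret;
}

int main(void)
{
	struct node a = { .next = NULL, .key = 1 };

	insert_and_track(&a);
	printf("tracked key %d\n", modified_head->key);
	return 0;
}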
*/ -static noinline int cow_file_range(struct inode *inode, -				   struct page *locked_page, -				   u64 start, u64 end, int *page_started, -				   unsigned long *nr_written, -				   int unlock) +static noinline int __cow_file_range(struct btrfs_trans_handle *trans, +				     struct inode *inode, +				     struct btrfs_root *root, +				     struct page *locked_page, +				     u64 start, u64 end, int *page_started, +				     unsigned long *nr_written, +				     int unlock)  { -	struct btrfs_root *root = BTRFS_I(inode)->root; -	struct btrfs_trans_handle *trans;  	u64 alloc_hint = 0;  	u64 num_bytes;  	unsigned long ram_size; @@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,  	int ret = 0;  	BUG_ON(btrfs_is_free_space_inode(inode)); -	trans = btrfs_join_transaction(root); -	if (IS_ERR(trans)) { -		extent_clear_unlock_delalloc(inode, -			     &BTRFS_I(inode)->io_tree, -			     start, end, locked_page, -			     EXTENT_CLEAR_UNLOCK_PAGE | -			     EXTENT_CLEAR_UNLOCK | -			     EXTENT_CLEAR_DELALLOC | -			     EXTENT_CLEAR_DIRTY | -			     EXTENT_SET_WRITEBACK | -			     EXTENT_END_WRITEBACK); -		return PTR_ERR(trans); -	} -	trans->block_rsv = &root->fs_info->delalloc_block_rsv;  	num_bytes = (end - start + blocksize) & ~(blocksize - 1);  	num_bytes = max(blocksize,  num_bytes);  	disk_num_bytes = num_bytes; -	ret = 0;  	/* if this is a small write inside eof, kick off defrag */  	if (num_bytes < 64 * 1024 && @@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,  		em->block_start = ins.objectid;  		em->block_len = ins.offset; +		em->orig_block_len = ins.offset;  		em->bdev = root->fs_info->fs_devices->latest_bdev;  		set_bit(EXTENT_FLAG_PINNED, &em->flags); +		em->generation = -1;  		while (1) {  			write_lock(&em_tree->lock);  			ret = add_extent_mapping(em_tree, em); +			if (!ret) +				list_move(&em->list, +					  &em_tree->modified_extents);  			write_unlock(&em_tree->lock);  			if (ret != -EEXIST) {  				free_extent_map(em); @@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,  		alloc_hint = ins.objectid + ins.offset;  		start += cur_alloc_size;  	} -	ret = 0;  out: -	btrfs_end_transaction(trans, root); -  	return ret; +  out_unlock:  	extent_clear_unlock_delalloc(inode,  		     &BTRFS_I(inode)->io_tree, @@ -971,6 +969,39 @@ out_unlock:  	goto out;  } +static noinline int cow_file_range(struct inode *inode, +				   struct page *locked_page, +				   u64 start, u64 end, int *page_started, +				   unsigned long *nr_written, +				   int unlock) +{ +	struct btrfs_trans_handle *trans; +	struct btrfs_root *root = BTRFS_I(inode)->root; +	int ret; + +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) { +		extent_clear_unlock_delalloc(inode, +			     &BTRFS_I(inode)->io_tree, +			     start, end, locked_page, +			     EXTENT_CLEAR_UNLOCK_PAGE | +			     EXTENT_CLEAR_UNLOCK | +			     EXTENT_CLEAR_DELALLOC | +			     EXTENT_CLEAR_DIRTY | +			     EXTENT_SET_WRITEBACK | +			     EXTENT_END_WRITEBACK); +		return PTR_ERR(trans); +	} +	trans->block_rsv = &root->fs_info->delalloc_block_rsv; + +	ret = __cow_file_range(trans, inode, root, locked_page, start, end, +			       page_started, nr_written, unlock); + +	btrfs_end_transaction(trans, root); + +	return ret; +} +  /*   * work queue call back to started compression on a file and pages   */ @@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,  	u64 extent_offset;  	u64 disk_bytenr;  	u64 num_bytes; +	u64 disk_num_bytes;  	int extent_type;  	
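cow_file_range() is split into __cow_file_range(), which expects a live transaction handle, plus a thin wrapper that owns the handle, so run_delalloc_nocow() can call the worker under its own transaction. The ownership split in miniature, with generic names in place of btrfs API:

#include <stdio.h>

struct handle { int id; };

static struct handle *handle_begin(void)
{
	static struct handle h = { 42 };
	return &h;	/* a real implementation could fail; the wrapper checks */
}

static void handle_end(struct handle *h)
{
	(void)h;
}

/* worker: uses a handle the caller already owns, never begins or ends one */
static int do_range(struct handle *h, int start, int end)
{
	printf("handle %d covers [%d, %d)\n", h->id, start, end);
	return 0;
}

/* wrapper: for callers that do not hold a handle of their own */
static int do_range_standalone(int start, int end)
{
	struct handle *h = handle_begin();
	int ret;

	if (!h)
		return -1;
	ret = do_range(h, start, end);
	handle_end(h);
	return ret;
}

int main(void)
{
	return do_range_standalone(0, 4096);
}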
int ret, err;  	int type; @@ -1228,6 +1260,8 @@ next_slot:  			extent_offset = btrfs_file_extent_offset(leaf, fi);  			extent_end = found_key.offset +  				btrfs_file_extent_num_bytes(leaf, fi); +			disk_num_bytes = +				btrfs_file_extent_disk_num_bytes(leaf, fi);  			if (extent_end <= start) {  				path->slots[0]++;  				goto next_slot; @@ -1281,9 +1315,9 @@ out_check:  		btrfs_release_path(path);  		if (cow_start != (u64)-1) { -			ret = cow_file_range(inode, locked_page, cow_start, -					found_key.offset - 1, page_started, -					nr_written, 1); +			ret = __cow_file_range(trans, inode, root, locked_page, +					       cow_start, found_key.offset - 1, +					       page_started, nr_written, 1);  			if (ret) {  				btrfs_abort_transaction(trans, root, ret);  				goto error; @@ -1298,16 +1332,21 @@ out_check:  			em = alloc_extent_map();  			BUG_ON(!em); /* -ENOMEM */  			em->start = cur_offset; -			em->orig_start = em->start; +			em->orig_start = found_key.offset - extent_offset;  			em->len = num_bytes;  			em->block_len = num_bytes;  			em->block_start = disk_bytenr; +			em->orig_block_len = disk_num_bytes;  			em->bdev = root->fs_info->fs_devices->latest_bdev;  			set_bit(EXTENT_FLAG_PINNED, &em->flags); -			set_bit(EXTENT_FLAG_PREALLOC, &em->flags); +			set_bit(EXTENT_FLAG_FILLING, &em->flags); +			em->generation = -1;  			while (1) {  				write_lock(&em_tree->lock);  				ret = add_extent_mapping(em_tree, em); +				if (!ret) +					list_move(&em->list, +						  &em_tree->modified_extents);  				write_unlock(&em_tree->lock);  				if (ret != -EEXIST) {  					free_extent_map(em); @@ -1352,8 +1391,9 @@ out_check:  	}  	if (cow_start != (u64)-1) { -		ret = cow_file_range(inode, locked_page, cow_start, end, -				     page_started, nr_written, 1); +		ret = __cow_file_range(trans, inode, root, locked_page, +				       cow_start, end, +				       page_started, nr_written, 1);  		if (ret) {  			btrfs_abort_transaction(trans, root, ret);  			goto error; @@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,  			 unsigned long bio_flags)  {  	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; -	struct btrfs_mapping_tree *map_tree;  	u64 logical = (u64)bio->bi_sector << 9;  	u64 length = 0;  	u64 map_length; @@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,  		return 0;  	length = bio->bi_size; -	map_tree = &root->fs_info->mapping_tree;  	map_length = length; -	ret = btrfs_map_block(map_tree, READ, logical, +	ret = btrfs_map_block(root->fs_info, READ, logical,  			      &map_length, NULL, 0); -	/* Will always return 0 or 1 with map_multi == NULL */ +	/* Will always return 0 with map_multi == NULL */  	BUG_ON(ret < 0);  	if (map_length < length + size)  		return 1; @@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,  			  u64 bio_offset)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; -	return btrfs_map_bio(root, rw, bio, mirror_num, 1); +	int ret; + +	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); +	if (ret) +		bio_endio(bio, ret); +	return ret;  }  /* @@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  	int ret = 0;  	int skip_sum;  	int metadata = 0; +	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);  	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  	if (!(rw & REQ_WRITE)) {  		ret = 
btrfs_bio_wq_end_io(root->fs_info, bio, metadata);  		if (ret) -			return ret; +			goto out;  		if (bio_flags & EXTENT_BIO_COMPRESSED) { -			return btrfs_submit_compressed_read(inode, bio, -						    mirror_num, bio_flags); +			ret = btrfs_submit_compressed_read(inode, bio, +							   mirror_num, +							   bio_flags); +			goto out;  		} else if (!skip_sum) {  			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);  			if (ret) -				return ret; +				goto out;  		}  		goto mapit; -	} else if (!skip_sum) { +	} else if (async && !skip_sum) {  		/* csum items have already been cloned */  		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)  			goto mapit;  		/* we're doing a write, do the async checksumming */ -		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, +		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,  				   inode, rw, bio, mirror_num,  				   bio_flags, bio_offset,  				   __btrfs_submit_bio_start,  				   __btrfs_submit_bio_done); +		goto out; +	} else if (!skip_sum) { +		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); +		if (ret) +			goto out;  	}  mapit: -	return btrfs_map_bio(root, rw, bio, mirror_num, 0); +	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + +out: +	if (ret < 0) +		bio_endio(bio, ret); +	return ret;  }  /* @@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,  			      struct extent_state **cached_state)  { -	if ((end & (PAGE_CACHE_SIZE - 1)) == 0) -		WARN_ON(1); +	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);  	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,  				   cached_state, GFP_NOFS);  } @@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {  		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ -		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); -		if (!ret) { -			if (nolock) -				trans = btrfs_join_transaction_nolock(root); -			else -				trans = btrfs_join_transaction(root); -			if (IS_ERR(trans)) { -				ret = PTR_ERR(trans); -				trans = NULL; -				goto out; -			} -			trans->block_rsv = &root->fs_info->delalloc_block_rsv; -			ret = btrfs_update_inode_fallback(trans, root, inode); -			if (ret) /* -ENOMEM or corruption */ -				btrfs_abort_transaction(trans, root, ret); +		btrfs_ordered_update_i_size(inode, 0, ordered_extent); +		if (nolock) +			trans = btrfs_join_transaction_nolock(root); +		else +			trans = btrfs_join_transaction(root); +		if (IS_ERR(trans)) { +			ret = PTR_ERR(trans); +			trans = NULL; +			goto out;  		} +		trans->block_rsv = &root->fs_info->delalloc_block_rsv; +		ret = btrfs_update_inode_fallback(trans, root, inode); +		if (ret) /* -ENOMEM or corruption */ +			btrfs_abort_transaction(trans, root, ret);  		goto out;  	} @@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  	add_pending_csums(trans, inode, ordered_extent->file_offset,  			  &ordered_extent->list); -	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); -	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { -		ret = btrfs_update_inode_fallback(trans, root, inode); -		if (ret) { /* -ENOMEM or corruption */ -			btrfs_abort_transaction(trans, root, ret); -			goto out_unlock; -		} -	} else { -		btrfs_set_inode_last_trans(trans, inode); +	btrfs_ordered_update_i_size(inode, 0, ordered_extent); +	ret = 
btrfs_update_inode_fallback(trans, root, inode); +	if (ret) { /* -ENOMEM or corruption */ +		btrfs_abort_transaction(trans, root, ret); +		goto out_unlock;  	}  	ret = 0;  out_unlock: @@ -3074,7 +3123,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)  	struct btrfs_trans_handle *trans;  	struct inode *inode = dentry->d_inode;  	int ret; -	unsigned long nr = 0;  	trans = __unlink_start_trans(dir, dentry);  	if (IS_ERR(trans)) @@ -3094,9 +3142,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)  	}  out: -	nr = trans->blocks_used;  	__unlink_end_trans(trans, root); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  	return ret;  } @@ -3186,7 +3233,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)  	int err = 0;  	struct btrfs_root *root = BTRFS_I(dir)->root;  	struct btrfs_trans_handle *trans; -	unsigned long nr = 0;  	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)  		return -ENOTEMPTY; @@ -3215,9 +3261,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)  	if (!err)  		btrfs_i_size_write(inode, 0);  out: -	nr = trans->blocks_used;  	__unlink_end_trans(trans, root); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  	return err;  } @@ -3497,11 +3542,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,  	if (ret)  		goto out; -	ret = -ENOMEM;  again:  	page = find_or_create_page(mapping, index, mask);  	if (!page) {  		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +		ret = -ENOMEM;  		goto out;  	} @@ -3550,7 +3595,6 @@ again:  		goto out_unlock;  	} -	ret = 0;  	if (offset != PAGE_CACHE_SIZE) {  		if (!len)  			len = PAGE_CACHE_SIZE - offset; @@ -3668,6 +3712,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)  			hole_em->block_start = EXTENT_MAP_HOLE;  			hole_em->block_len = 0; +			hole_em->orig_block_len = 0;  			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;  			hole_em->compress_type = BTRFS_COMPRESS_NONE;  			hole_em->generation = trans->transid; @@ -3783,7 +3828,6 @@ void btrfs_evict_inode(struct inode *inode)  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_block_rsv *rsv, *global_rsv;  	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); -	unsigned long nr;  	int ret;  	trace_btrfs_inode_evict(inode); @@ -3829,7 +3873,8 @@ void btrfs_evict_inode(struct inode *inode)  	 * inode item when doing the truncate.  	 
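The eviction loop just below retries a reservation with bounded flushing and, failing that, steals from the global reserve rather than blocking in the reclaim path. A toy model of that fallback order; every function here is a stand-in:

#include <stdio.h>

static long long global_rsv = 4096;	/* bytes available to steal */

/* stand-in for btrfs_block_rsv_refill(..., BTRFS_RESERVE_FLUSH_LIMIT):
 * bounded flushing finds nothing in this example */
static int refill_limited(long long *rsv, long long need)
{
	(void)rsv; (void)need;
	return -1;
}

static int steal_from_global(long long *rsv, long long need)
{
	if (global_rsv < need)
		return -1;
	global_rsv -= need;
	*rsv += need;
	return 0;
}

int main(void)
{
	long long rsv = 0, min_size = 4096;

	if (refill_limited(&rsv, min_size) != 0 &&
	    steal_from_global(&rsv, min_size) != 0) {
		fprintf(stderr, "no space: delete deferred to orphan cleanup\n");
		return 1;
	}
	printf("reserved %lld bytes for eviction\n", rsv);
	return 0;
}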
*/  	while (1) { -		ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); +		ret = btrfs_block_rsv_refill(root, rsv, min_size, +					     BTRFS_RESERVE_FLUSH_LIMIT);  		/*  		 * Try and steal from the global reserve since we will @@ -3847,7 +3892,7 @@ void btrfs_evict_inode(struct inode *inode)  			goto no_delete;  		} -		trans = btrfs_start_transaction_noflush(root, 1); +		trans = btrfs_start_transaction_lflush(root, 1);  		if (IS_ERR(trans)) {  			btrfs_orphan_del(NULL, inode);  			btrfs_free_block_rsv(root, rsv); @@ -3864,10 +3909,9 @@ void btrfs_evict_inode(struct inode *inode)  		ret = btrfs_update_inode(trans, root, inode);  		BUG_ON(ret); -		nr = trans->blocks_used;  		btrfs_end_transaction(trans, root);  		trans = NULL; -		btrfs_btree_balance_dirty(root, nr); +		btrfs_btree_balance_dirty(root);  	}  	btrfs_free_block_rsv(root, rsv); @@ -3883,9 +3927,8 @@ void btrfs_evict_inode(struct inode *inode)  	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))  		btrfs_return_ino(root, btrfs_ino(inode)); -	nr = trans->blocks_used;  	btrfs_end_transaction(trans, root); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  no_delete:  	clear_inode(inode);  	return; @@ -4775,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,  	if (S_ISREG(mode)) {  		if (btrfs_test_opt(root, NODATASUM))  			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; -		if (btrfs_test_opt(root, NODATACOW) || -		    (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) +		if (btrfs_test_opt(root, NODATACOW))  			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;  	} @@ -4842,7 +4884,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,  	ret = btrfs_insert_dir_item(trans, root, name, name_len,  				    parent_inode, &key,  				    btrfs_inode_type(inode), index); -	if (ret == -EEXIST) +	if (ret == -EEXIST || ret == -EOVERFLOW)  		goto fail_dir_item;  	else if (ret) {  		btrfs_abort_transaction(trans, root, ret); @@ -4897,7 +4939,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,  	int err;  	int drop_inode = 0;  	u64 objectid; -	unsigned long nr = 0;  	u64 index = 0;  	if (!new_valid_dev(rdev)) @@ -4930,6 +4971,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,  		goto out_unlock;  	} +	err = btrfs_update_inode(trans, root, inode); +	if (err) { +		drop_inode = 1; +		goto out_unlock; +	} +  	/*  	* If the active LSM wants to access the inode during  	* d_instantiate it needs these. 
Smack checks to see @@ -4947,9 +4994,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,  		d_instantiate(dentry, inode);  	}  out_unlock: -	nr = trans->blocks_used;  	btrfs_end_transaction(trans, root); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  	if (drop_inode) {  		inode_dec_link_count(inode);  		iput(inode); @@ -4963,9 +5009,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,  	struct btrfs_trans_handle *trans;  	struct btrfs_root *root = BTRFS_I(dir)->root;  	struct inode *inode = NULL; -	int drop_inode = 0; +	int drop_inode_on_err = 0;  	int err; -	unsigned long nr = 0;  	u64 objectid;  	u64 index = 0; @@ -4989,12 +5034,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,  		err = PTR_ERR(inode);  		goto out_unlock;  	} +	drop_inode_on_err = 1;  	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); -	if (err) { -		drop_inode = 1; +	if (err) +		goto out_unlock; + +	err = btrfs_update_inode(trans, root, inode); +	if (err)  		goto out_unlock; -	}  	/*  	* If the active LSM wants to access the inode during @@ -5007,21 +5055,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,  	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);  	if (err) -		drop_inode = 1; -	else { -		inode->i_mapping->a_ops = &btrfs_aops; -		inode->i_mapping->backing_dev_info = &root->fs_info->bdi; -		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -		d_instantiate(dentry, inode); -	} +		goto out_unlock; + +	inode->i_mapping->a_ops = &btrfs_aops; +	inode->i_mapping->backing_dev_info = &root->fs_info->bdi; +	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; +	d_instantiate(dentry, inode); +  out_unlock: -	nr = trans->blocks_used;  	btrfs_end_transaction(trans, root); -	if (drop_inode) { +	if (err && drop_inode_on_err) {  		inode_dec_link_count(inode);  		iput(inode);  	} -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  	return err;  } @@ -5032,7 +5079,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,  	struct btrfs_root *root = BTRFS_I(dir)->root;  	struct inode *inode = old_dentry->d_inode;  	u64 index; -	unsigned long nr = 0;  	int err;  	int drop_inode = 0; @@ -5062,6 +5108,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,  	inode_inc_iversion(inode);  	inode->i_ctime = CURRENT_TIME;  	ihold(inode); +	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);  	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); @@ -5076,14 +5123,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,  		btrfs_log_new_name(trans, inode, NULL, parent);  	} -	nr = trans->blocks_used;  	btrfs_end_transaction(trans, root);  fail:  	if (drop_inode) {  		inode_dec_link_count(inode);  		iput(inode);  	} -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  	return err;  } @@ -5096,7 +5142,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  	int drop_on_err = 0;  	u64 objectid = 0;  	u64 index = 0; -	unsigned long nr = 1;  	/*  	 * 2 items for inode and ref @@ -5142,11 +5187,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  	drop_on_err = 0;  out_fail: -	nr = trans->blocks_used;  	btrfs_end_transaction(trans, root);  	if (drop_on_err)  		iput(inode); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  	return err;  } @@ -5340,6 +5384,7 @@ again:  		if (start + len <= found_key.offset)  	
		goto not_found;  		em->start = start; +		em->orig_start = start;  		em->len = found_key.offset - start;  		goto not_found_em;  	} @@ -5350,6 +5395,8 @@ again:  		em->len = extent_end - extent_start;  		em->orig_start = extent_start -  				 btrfs_file_extent_offset(leaf, item); +		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, +								      item);  		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);  		if (bytenr == 0) {  			em->block_start = EXTENT_MAP_HOLE; @@ -5359,8 +5406,7 @@ again:  			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);  			em->compress_type = compress_type;  			em->block_start = bytenr; -			em->block_len = btrfs_file_extent_disk_num_bytes(leaf, -									 item); +			em->block_len = em->orig_block_len;  		} else {  			bytenr += btrfs_file_extent_offset(leaf, item);  			em->block_start = bytenr; @@ -5390,7 +5436,8 @@ again:  		em->start = extent_start + extent_offset;  		em->len = (copy_size + root->sectorsize - 1) &  			~((u64)root->sectorsize - 1); -		em->orig_start = EXTENT_MAP_INLINE; +		em->orig_block_len = em->len; +		em->orig_start = em->start;  		if (compress_type) {  			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);  			em->compress_type = compress_type; @@ -5439,11 +5486,11 @@ again:  				    extent_map_end(em) - 1, NULL, GFP_NOFS);  		goto insert;  	} else { -		printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); -		WARN_ON(1); +		WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);  	}  not_found:  	em->start = start; +	em->orig_start = start;  	em->len = len;  not_found_em:  	em->block_start = EXTENT_MAP_HOLE; @@ -5645,38 +5692,19 @@ out:  }  static struct extent_map *btrfs_new_extent_direct(struct inode *inode, -						  struct extent_map *em,  						  u64 start, u64 len)  {  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct btrfs_trans_handle *trans; -	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; +	struct extent_map *em;  	struct btrfs_key ins;  	u64 alloc_hint;  	int ret; -	bool insert = false; - -	/* -	 * Ok if the extent map we looked up is a hole and is for the exact -	 * range we want, there is no reason to allocate a new one, however if -	 * it is not right then we need to free this one and drop the cache for -	 * our range. -	 */ -	if (em->block_start != EXTENT_MAP_HOLE || em->start != start || -	    em->len != len) { -		free_extent_map(em); -		em = NULL; -		insert = true; -		btrfs_drop_extent_cache(inode, start, start + len - 1, 0); -	}  	trans = btrfs_join_transaction(root);  	if (IS_ERR(trans))  		return ERR_CAST(trans); -	if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) -		btrfs_add_inode_defrag(trans, inode); -  	trans->block_rsv = &root->fs_info->delalloc_block_rsv;  	alloc_hint = get_extent_allocation_hint(inode, start, len); @@ -5687,37 +5715,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,  		goto out;  	} -	if (!em) { -		em = alloc_extent_map(); -		if (!em) { -			em = ERR_PTR(-ENOMEM); -			goto out; -		} -	} - -	em->start = start; -	em->orig_start = em->start; -	em->len = ins.offset; - -	em->block_start = ins.objectid; -	em->block_len = ins.offset; -	em->bdev = root->fs_info->fs_devices->latest_bdev; - -	/* -	 * We need to do this because if we're using the original em we searched -	 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 
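btrfs_get_extent() above derives the in-memory mapping from the on-disk file extent item, with orig_start now pointing at the logical start of the whole extent rather than at the item itself. The arithmetic on sample values (all numbers illustrative):

#include <stdint.h>
#include <stdio.h>

/* the fields a regular file extent item contributes */
struct file_extent {
	uint64_t key_offset;     /* file offset this item starts at */
	uint64_t disk_bytenr;    /* start of the on-disk extent */
	uint64_t extent_offset;  /* offset of this item into that extent */
	uint64_t num_bytes;      /* bytes of the extent this item covers */
	uint64_t disk_num_bytes; /* full length of the on-disk extent */
};

int main(void)
{
	struct file_extent fi = { 8192, 1048576, 4096, 12288, 65536 };

	/* em->start/len describe the file range, block_start where it lives
	 * on disk; orig_start points back at the logical start of the full
	 * extent, which the nocow and direct-IO paths now rely on */
	uint64_t em_start = fi.key_offset;
	uint64_t em_len = fi.num_bytes;
	uint64_t em_block_start = fi.disk_bytenr + fi.extent_offset;
	uint64_t em_orig_start = fi.key_offset - fi.extent_offset;
	uint64_t em_orig_block_len = fi.disk_num_bytes;

	printf("file %llu+%llu -> disk %llu (orig start %llu, orig len %llu)\n",
	       (unsigned long long)em_start, (unsigned long long)em_len,
	       (unsigned long long)em_block_start,
	       (unsigned long long)em_orig_start,
	       (unsigned long long)em_orig_block_len);
	return 0;
}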
-	 */ -	em->flags = 0; -	set_bit(EXTENT_FLAG_PINNED, &em->flags); - -	while (insert) { -		write_lock(&em_tree->lock); -		ret = add_extent_mapping(em_tree, em); -		write_unlock(&em_tree->lock); -		if (ret != -EEXIST) -			break; -		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); -	} +	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, +			      ins.offset, ins.offset, 0); +	if (IS_ERR(em)) +		goto out;  	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,  					   ins.offset, ins.offset, 0); @@ -5894,7 +5895,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,  static struct extent_map *create_pinned_em(struct inode *inode, u64 start,  					   u64 len, u64 orig_start,  					   u64 block_start, u64 block_len, -					   int type) +					   u64 orig_block_len, int type)  {  	struct extent_map_tree *em_tree;  	struct extent_map *em; @@ -5912,15 +5913,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,  	em->block_len = block_len;  	em->block_start = block_start;  	em->bdev = root->fs_info->fs_devices->latest_bdev; +	em->orig_block_len = orig_block_len; +	em->generation = -1;  	set_bit(EXTENT_FLAG_PINNED, &em->flags);  	if (type == BTRFS_ORDERED_PREALLOC) -		set_bit(EXTENT_FLAG_PREALLOC, &em->flags); +		set_bit(EXTENT_FLAG_FILLING, &em->flags);  	do {  		btrfs_drop_extent_cache(inode, em->start,  				em->start + em->len - 1, 0);  		write_lock(&em_tree->lock);  		ret = add_extent_mapping(em_tree, em); +		if (!ret) +			list_move(&em->list, +				  &em_tree->modified_extents);  		write_unlock(&em_tree->lock);  	} while (ret == -EEXIST); @@ -6047,13 +6053,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  			goto must_cow;  		if (can_nocow_odirect(trans, inode, start, len) == 1) { -			u64 orig_start = em->start; +			u64 orig_start = em->orig_start; +			u64 orig_block_len = em->orig_block_len;  			if (type == BTRFS_ORDERED_PREALLOC) {  				free_extent_map(em);  				em = create_pinned_em(inode, start, len,  						       orig_start, -						       block_start, len, type); +						       block_start, len, +						       orig_block_len, type);  				if (IS_ERR(em)) {  					btrfs_end_transaction(trans, root);  					goto unlock_err; @@ -6077,7 +6085,8 @@ must_cow:  	 * it above  	 */  	len = bh_result->b_size; -	em = btrfs_new_extent_direct(inode, em, start, len); +	free_extent_map(em); +	em = btrfs_new_extent_direct(inode, start, len);  	if (IS_ERR(em)) {  		ret = PTR_ERR(em);  		goto unlock_err; @@ -6318,6 +6327,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,  	struct btrfs_root *root = BTRFS_I(inode)->root;  	int ret; +	if (async_submit) +		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); +  	bio_get(bio);  	if (!write) { @@ -6362,7 +6374,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,  {  	struct inode *inode = dip->inode;  	struct btrfs_root *root = BTRFS_I(inode)->root; -	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;  	struct bio *bio;  	struct bio *orig_bio = dip->orig_bio;  	struct bio_vec *bvec = orig_bio->bi_io_vec; @@ -6375,7 +6386,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,  	int async_submit = 0;  	map_length = orig_bio->bi_size; -	ret = btrfs_map_block(map_tree, READ, start_sector << 9, +	ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,  			      &map_length, NULL, 0);  	if (ret) {  		bio_put(orig_bio); @@ -6429,7 +6440,8 
@@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,  			bio->bi_end_io = btrfs_end_dio_bio;  			map_length = orig_bio->bi_size; -			ret = btrfs_map_block(map_tree, READ, start_sector << 9, +			ret = btrfs_map_block(root->fs_info, READ, +					      start_sector << 9,  					      &map_length, NULL, 0);  			if (ret) {  				bio_put(bio); @@ -6582,9 +6594,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,  		   btrfs_submit_direct, 0);  } +#define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC) +  static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,  		__u64 start, __u64 len)  { +	int	ret; + +	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); +	if (ret) +		return ret; +  	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);  } @@ -6855,7 +6875,6 @@ static int btrfs_truncate(struct inode *inode)  	int ret;  	int err = 0;  	struct btrfs_trans_handle *trans; -	unsigned long nr;  	u64 mask = root->sectorsize - 1;  	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); @@ -6978,9 +6997,8 @@ static int btrfs_truncate(struct inode *inode)  			break;  		} -		nr = trans->blocks_used;  		btrfs_end_transaction(trans, root); -		btrfs_btree_balance_dirty(root, nr); +		btrfs_btree_balance_dirty(root);  		trans = btrfs_start_transaction(root, 2);  		if (IS_ERR(trans)) { @@ -7014,9 +7032,8 @@ static int btrfs_truncate(struct inode *inode)  		if (ret && !err)  			err = ret; -		nr = trans->blocks_used;  		ret = btrfs_end_transaction(trans, root); -		btrfs_btree_balance_dirty(root, nr); +		btrfs_btree_balance_dirty(root);  	}  out: @@ -7093,6 +7110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)  	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);  	ei->io_tree.track_uptodate = 1;  	ei->io_failure_tree.track_uptodate = 1; +	atomic_set(&ei->sync_writers, 0);  	mutex_init(&ei->log_mutex);  	mutex_init(&ei->delalloc_mutex);  	btrfs_ordered_inode_tree_init(&ei->ordered_tree); @@ -7203,6 +7221,8 @@ void btrfs_destroy_cachep(void)  		kmem_cache_destroy(btrfs_path_cachep);  	if (btrfs_free_space_cachep)  		kmem_cache_destroy(btrfs_free_space_cachep); +	if (btrfs_delalloc_work_cachep) +		kmem_cache_destroy(btrfs_delalloc_work_cachep);  }  int btrfs_init_cachep(void) @@ -7237,6 +7257,13 @@ int btrfs_init_cachep(void)  	if (!btrfs_free_space_cachep)  		goto fail; +	btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", +			sizeof(struct btrfs_delalloc_work), 0, +			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, +			NULL); +	if (!btrfs_delalloc_work_cachep) +		goto fail; +  	return 0;  fail:  	btrfs_destroy_cachep(); @@ -7308,6 +7335,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,  	if (S_ISDIR(old_inode->i_mode) && new_inode &&  	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)  		return -ENOTEMPTY; + + +	/* check for collisions, even if the name isn't there */ +	ret = btrfs_check_dir_item_collision(root, new_dir->i_ino, +			     new_dentry->d_name.name, +			     new_dentry->d_name.len); + +	if (ret) { +		if (ret == -EEXIST) { +			/* we shouldn't get +			 * -EEXIST without a new_inode */ +			if (!new_inode) { +				WARN_ON(1); +				return ret; +			} +		} else { +			/* maybe -EOVERFLOW */ +			return ret; +		} +	} +	ret = 0; +  	/*  	 * we're using rename to replace one file with another,  	 * and the replacement file is large.  
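The pre-check above separates a benign -EEXIST (the rename is replacing new_inode) from an unexpected one and from -EOVERFLOW, failing early while it is still safe to back out. The triage in isolation, where check_dir_collision() is a stand-in for btrfs_check_dir_item_collision():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* stand-in: 0 if the name hash is free, -EEXIST if the exact name exists,
 * -EOVERFLOW if the hash space in the dir item is exhausted */
static int check_dir_collision(bool name_exists, bool hash_full)
{
	if (name_exists)
		return -EEXIST;
	if (hash_full)
		return -EOVERFLOW;
	return 0;
}

static int rename_precheck(bool name_exists, bool hash_full, bool have_target)
{
	int ret = check_dir_collision(name_exists, hash_full);

	if (ret == -EEXIST && have_target)
		return 0;	/* expected: rename will replace the target */
	return ret;		/* -EEXIST without a target, or -EOVERFLOW: bail */
}

int main(void)
{
	/* name exists and rename has a target to replace: proceed (0) */
	printf("%d\n", rename_precheck(true, false, true));
	/* hash space exhausted: must fail up front (-EOVERFLOW) */
	printf("%d\n", rename_precheck(false, true, false));
	return 0;
}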
Start IO on it now so @@ -7447,6 +7496,49 @@ out_notrans:  	return ret;  } +static void btrfs_run_delalloc_work(struct btrfs_work *work) +{ +	struct btrfs_delalloc_work *delalloc_work; + +	delalloc_work = container_of(work, struct btrfs_delalloc_work, +				     work); +	if (delalloc_work->wait) +		btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1); +	else +		filemap_flush(delalloc_work->inode->i_mapping); + +	if (delalloc_work->delay_iput) +		btrfs_add_delayed_iput(delalloc_work->inode); +	else +		iput(delalloc_work->inode); +	complete(&delalloc_work->completion); +} + +struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, +						    int wait, int delay_iput) +{ +	struct btrfs_delalloc_work *work; + +	work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); +	if (!work) +		return NULL; + +	init_completion(&work->completion); +	INIT_LIST_HEAD(&work->list); +	work->inode = inode; +	work->wait = wait; +	work->delay_iput = delay_iput; +	work->work.func = btrfs_run_delalloc_work; + +	return work; +} + +void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) +{ +	wait_for_completion(&work->completion); +	kmem_cache_free(btrfs_delalloc_work_cachep, work); +} +  /*   * some fairly slow code that needs optimization. This walks the list   * of all the inodes with pending delalloc and forces them to disk. @@ -7456,10 +7548,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)  	struct list_head *head = &root->fs_info->delalloc_inodes;  	struct btrfs_inode *binode;  	struct inode *inode; +	struct btrfs_delalloc_work *work, *next; +	struct list_head works; +	int ret = 0;  	if (root->fs_info->sb->s_flags & MS_RDONLY)  		return -EROFS; +	INIT_LIST_HEAD(&works); +  	spin_lock(&root->fs_info->delalloc_lock);  	while (!list_empty(head)) {  		binode = list_entry(head->next, struct btrfs_inode, @@ -7469,11 +7566,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)  			list_del_init(&binode->delalloc_inodes);  		spin_unlock(&root->fs_info->delalloc_lock);  		if (inode) { -			filemap_flush(inode->i_mapping); -			if (delay_iput) -				btrfs_add_delayed_iput(inode); -			else -				iput(inode); +			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); +			if (!work) { +				ret = -ENOMEM; +				goto out; +			} +			list_add_tail(&work->list, &works); +			btrfs_queue_worker(&root->fs_info->flush_workers, +					   &work->work);  		}  		cond_resched();  		spin_lock(&root->fs_info->delalloc_lock); @@ -7492,7 +7592,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)  		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));  	}  	atomic_dec(&root->fs_info->async_submit_draining); -	return 0; +out: +	list_for_each_entry_safe(work, next, &works, list) { +		list_del_init(&work->list); +		btrfs_wait_and_free_delalloc_work(work); +	} +	return ret;  }  static int btrfs_symlink(struct inode *dir, struct dentry *dentry, @@ -7512,7 +7617,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,  	unsigned long ptr;  	struct btrfs_file_extent_item *ei;  	struct extent_buffer *leaf; -	unsigned long nr = 0;  	name_len = strlen(symname) + 1;  	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) @@ -7610,13 +7714,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,  out_unlock:  	if (!err)  		d_instantiate(dentry, inode); -	nr = trans->blocks_used;  	btrfs_end_transaction(trans, root);  	if (drop_inode) {  		inode_dec_link_count(inode);  		iput(inode);  	} -	
btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  	return err;  } @@ -7679,6 +7782,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  		em->len = ins.offset;  		em->block_start = ins.objectid;  		em->block_len = ins.offset; +		em->orig_block_len = ins.offset;  		em->bdev = root->fs_info->fs_devices->latest_bdev;  		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);  		em->generation = trans->transid; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8fcf9a59c28d..4b4516770f05 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -55,6 +55,7 @@  #include "backref.h"  #include "rcu-string.h"  #include "send.h" +#include "dev-replace.h"  /* Mask out flags that are inappropriate for the given type of inode. */  static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)  		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;  	} -	if (flags & BTRFS_INODE_NODATACOW) +	if (flags & BTRFS_INODE_NODATACOW) {  		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; +		if (S_ISREG(inode->i_mode)) +			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; +	}  	btrfs_update_iflags(inode);  } @@ -571,8 +575,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,  		ret = btrfs_commit_transaction(trans,  					       root->fs_info->extent_root);  	} -	if (ret) +	if (ret) { +		/* cleanup_transaction has freed this for us */ +		if (trans->aborted) +			pending_snapshot = NULL;  		goto fail; +	}  	ret = pending_snapshot->error;  	if (ret) @@ -705,6 +713,16 @@ static noinline int btrfs_mksubvol(struct path *parent,  	if (error)  		goto out_dput; +	/* +	 * even if this name doesn't exist, we may get hash collisions. +	 * check for them now when we can safely fail +	 */ +	error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, +					       dir->i_ino, name, +					       namelen); +	if (error) +		goto out_dput; +  	down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);  	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) @@ -1225,7 +1243,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  		}  		defrag_count += ret; -		balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); +		balance_dirty_pages_ratelimited(inode->i_mapping);  		mutex_unlock(&inode->i_mutex);  		if (newer_than) { @@ -1293,12 +1311,13 @@ out_ra:  	return ret;  } -static noinline int btrfs_ioctl_resize(struct btrfs_root *root, +static noinline int btrfs_ioctl_resize(struct file *file,  					void __user *arg)  {  	u64 new_size;  	u64 old_size;  	u64 devid = 1; +	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;  	struct btrfs_ioctl_vol_args *vol_args;  	struct btrfs_trans_handle *trans;  	struct btrfs_device *device = NULL; @@ -1313,13 +1332,17 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	mutex_lock(&root->fs_info->volume_mutex); -	if (root->fs_info->balance_ctl) { -		printk(KERN_INFO "btrfs: balance in progress\n"); -		ret = -EINVAL; -		goto out; +	ret = mnt_want_write_file(file); +	if (ret) +		return ret; + +	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, +			1)) { +		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); +		return -EINPROGRESS;  	} +	mutex_lock(&root->fs_info->volume_mutex);  	vol_args = memdup_user(arg, sizeof(*vol_args));  	if (IS_ERR(vol_args)) {  		ret = PTR_ERR(vol_args); @@ -1339,7 +1362,7 @@ static noinline int 
btrfs_ioctl_resize(struct btrfs_root *root,  		printk(KERN_INFO "btrfs: resizing devid %llu\n",  		       (unsigned long long)devid);  	} -	device = btrfs_find_device(root, devid, NULL, NULL); +	device = btrfs_find_device(root->fs_info, devid, NULL, NULL);  	if (!device) {  		printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",  		       (unsigned long long)devid); @@ -1371,6 +1394,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,  		}  	} +	if (device->is_tgtdev_for_dev_replace) { +		ret = -EINVAL; +		goto out_free; +	} +  	old_size = device->total_bytes;  	if (mod < 0) { @@ -1409,12 +1437,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,  		btrfs_commit_transaction(trans, root);  	} else if (new_size < old_size) {  		ret = btrfs_shrink_device(device, new_size); -	} +	} /* equal, nothing needs to be done */  out_free:  	kfree(vol_args);  out:  	mutex_unlock(&root->fs_info->volume_mutex); +	mnt_drop_write_file(file); +	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);  	return ret;  } @@ -2156,9 +2186,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)  	if (btrfs_root_readonly(root))  		return -EROFS; +	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, +			1)) { +		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); +		return -EINPROGRESS; +	}  	ret = mnt_want_write_file(file); -	if (ret) +	if (ret) { +		atomic_set(&root->fs_info->mutually_exclusive_operation_running, +			   0);  		return ret; +	}  	switch (inode->i_mode & S_IFMT) {  	case S_IFDIR: @@ -2210,6 +2248,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)  	}  out:  	mnt_drop_write_file(file); +	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);  	return ret;  } @@ -2221,13 +2260,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	mutex_lock(&root->fs_info->volume_mutex); -	if (root->fs_info->balance_ctl) { -		printk(KERN_INFO "btrfs: balance in progress\n"); -		ret = -EINVAL; -		goto out; +	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, +			1)) { +		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); +		return -EINPROGRESS;  	} +	mutex_lock(&root->fs_info->volume_mutex);  	vol_args = memdup_user(arg, sizeof(*vol_args));  	if (IS_ERR(vol_args)) {  		ret = PTR_ERR(vol_args); @@ -2240,27 +2279,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)  	kfree(vol_args);  out:  	mutex_unlock(&root->fs_info->volume_mutex); +	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);  	return ret;  } -static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)  { +	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;  	struct btrfs_ioctl_vol_args *vol_args;  	int ret;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	if (root->fs_info->sb->s_flags & MS_RDONLY) -		return -EROFS; +	ret = mnt_want_write_file(file); +	if (ret) +		return ret; -	mutex_lock(&root->fs_info->volume_mutex); -	if (root->fs_info->balance_ctl) { -		printk(KERN_INFO "btrfs: balance in progress\n"); -		ret = -EINVAL; -		goto out; +	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, +			1)) { +		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); +		mnt_drop_write_file(file); +		return 
-EINPROGRESS;  	} +	mutex_lock(&root->fs_info->volume_mutex);  	vol_args = memdup_user(arg, sizeof(*vol_args));  	if (IS_ERR(vol_args)) {  		ret = PTR_ERR(vol_args); @@ -2273,6 +2316,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)  	kfree(vol_args);  out:  	mutex_unlock(&root->fs_info->volume_mutex); +	mnt_drop_write_file(file); +	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);  	return ret;  } @@ -2328,7 +2373,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)  		s_uuid = di_args->uuid;  	mutex_lock(&fs_devices->device_list_mutex); -	dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); +	dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);  	mutex_unlock(&fs_devices->device_list_mutex);  	if (!dev) { @@ -2821,12 +2866,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  	struct btrfs_disk_key disk_key;  	u64 objectid = 0;  	u64 dir_id; +	int ret;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	if (copy_from_user(&objectid, argp, sizeof(objectid))) -		return -EFAULT; +	ret = mnt_want_write_file(file); +	if (ret) +		return ret; + +	if (copy_from_user(&objectid, argp, sizeof(objectid))) { +		ret = -EFAULT; +		goto out; +	}  	if (!objectid)  		objectid = root->root_key.objectid; @@ -2836,21 +2888,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  	location.offset = (u64)-1;  	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); -	if (IS_ERR(new_root)) -		return PTR_ERR(new_root); +	if (IS_ERR(new_root)) { +		ret = PTR_ERR(new_root); +		goto out; +	} -	if (btrfs_root_refs(&new_root->root_item) == 0) -		return -ENOENT; +	if (btrfs_root_refs(&new_root->root_item) == 0) { +		ret = -ENOENT; +		goto out; +	}  	path = btrfs_alloc_path(); -	if (!path) -		return -ENOMEM; +	if (!path) { +		ret = -ENOMEM; +		goto out; +	}  	path->leave_spinning = 1;  	trans = btrfs_start_transaction(root, 1);  	if (IS_ERR(trans)) {  		btrfs_free_path(path); -		return PTR_ERR(trans); +		ret = PTR_ERR(trans); +		goto out;  	}  	dir_id = btrfs_super_root_dir(root->fs_info->super_copy); @@ -2861,7 +2920,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  		btrfs_end_transaction(trans, root);  		printk(KERN_ERR "Umm, you don't have the default dir item, "  		       "this isn't going to work\n"); -		return -ENOENT; +		ret = -ENOENT; +		goto out;  	}  	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); @@ -2871,8 +2931,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)  	btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);  	btrfs_end_transaction(trans, root); - -	return 0; +out: +	mnt_drop_write_file(file); +	return ret;  }  void btrfs_get_block_group_info(struct list_head *groups_list, @@ -3036,32 +3097,38 @@ long btrfs_ioctl_trans_end(struct file *file)  	return 0;  } -static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) +static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, +					    void __user *argp)  { -	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;  	struct btrfs_trans_handle *trans;  	u64 transid;  	int ret; -	trans = btrfs_start_transaction(root, 0); -	if (IS_ERR(trans)) -		return PTR_ERR(trans); +	trans = btrfs_attach_transaction(root); +	if (IS_ERR(trans)) { +		if (PTR_ERR(trans) != -ENOENT) +			return PTR_ERR(trans); + +		/* No running transaction, don't bother */ +		transid = 
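/*
 * The rewritten start_sync above no longer forces a transaction into
 * existence just to learn a transid: btrfs_attach_transaction() joins a
 * running transaction or fails with -ENOENT, in which case the last
 * committed id is reported instead. Tiny stand-alone model of that
 * fallback (illustrative names and values):
 */
#include <errno.h>
#include <stdio.h>

static long running_transid;		/* 0 means nothing is running */
static long last_committed = 41;

static long attach_transaction(void)
{
	return running_transid ? running_transid : -ENOENT;
}

static long start_sync(void)
{
	long t = attach_transaction();

	if (t == -ENOENT)
		return last_committed;	/* no work to kick off */
	if (t < 0)
		return t;		/* a real error */
	/* ...an async commit of transaction t would start here... */
	return t;
}

int main(void)
{
	printf("transid %ld\n", start_sync());
	return 0;
}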
root->fs_info->last_trans_committed; +		goto out; +	}  	transid = trans->transid;  	ret = btrfs_commit_transaction_async(trans, root, 0);  	if (ret) {  		btrfs_end_transaction(trans, root);  		return ret;  	} - +out:  	if (argp)  		if (copy_to_user(argp, &transid, sizeof(transid)))  			return -EFAULT;  	return 0;  } -static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) +static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, +					   void __user *argp)  { -	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;  	u64 transid;  	if (argp) { @@ -3073,10 +3140,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)  	return btrfs_wait_for_commit(root, transid);  } -static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_scrub(struct file *file, void __user *arg)  { -	int ret; +	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;  	struct btrfs_ioctl_scrub_args *sa; +	int ret;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; @@ -3085,12 +3153,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)  	if (IS_ERR(sa))  		return PTR_ERR(sa); -	ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, -			      &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); +	if (!(sa->flags & BTRFS_SCRUB_READONLY)) { +		ret = mnt_want_write_file(file); +		if (ret) +			goto out; +	} + +	ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, +			      &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, +			      0);  	if (copy_to_user(arg, sa, sizeof(*sa)))  		ret = -EFAULT; +	if (!(sa->flags & BTRFS_SCRUB_READONLY)) +		mnt_drop_write_file(file); +out:  	kfree(sa);  	return ret;  } @@ -3100,7 +3178,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	return btrfs_scrub_cancel(root); +	return btrfs_scrub_cancel(root->fs_info);  }  static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, @@ -3149,6 +3227,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,  	return ret;  } +static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) +{ +	struct btrfs_ioctl_dev_replace_args *p; +	int ret; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	p = memdup_user(arg, sizeof(*p)); +	if (IS_ERR(p)) +		return PTR_ERR(p); + +	switch (p->cmd) { +	case BTRFS_IOCTL_DEV_REPLACE_CMD_START: +		if (atomic_xchg( +			&root->fs_info->mutually_exclusive_operation_running, +			1)) { +			pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); +			ret = -EINPROGRESS; +		} else { +			ret = btrfs_dev_replace_start(root, p); +			atomic_set( +			 &root->fs_info->mutually_exclusive_operation_running, +			 0); +		} +		break; +	case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: +		btrfs_dev_replace_status(root->fs_info, p); +		ret = 0; +		break; +	case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: +		ret = btrfs_dev_replace_cancel(root->fs_info, p); +		break; +	default: +		ret = -EINVAL; +		break; +	} + +	if (copy_to_user(arg, p, sizeof(*p))) +		ret = -EFAULT; + +	kfree(p); +	return ret; +} +  static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)  {  	int ret = 0; @@ -3315,6 +3438,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)  	struct btrfs_ioctl_balance_args *bargs;  	struct btrfs_balance_control *bctl;  	int ret; +	int need_to_clear_lock = 0;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; @@ 
-3350,10 +3474,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)  		bargs = NULL;  	} -	if (fs_info->balance_ctl) { +	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, +			1)) { +		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");  		ret = -EINPROGRESS;  		goto out_bargs;  	} +	need_to_clear_lock = 1;  	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);  	if (!bctl) { @@ -3387,6 +3514,9 @@ do_balance:  out_bargs:  	kfree(bargs);  out: +	if (need_to_clear_lock) +		atomic_set(&root->fs_info->mutually_exclusive_operation_running, +			   0);  	mutex_unlock(&fs_info->balance_mutex);  	mutex_unlock(&fs_info->volume_mutex);  	mnt_drop_write_file(file); @@ -3441,8 +3571,9 @@ out:  	return ret;  } -static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)  { +	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;  	struct btrfs_ioctl_quota_ctl_args *sa;  	struct btrfs_trans_handle *trans = NULL;  	int ret; @@ -3451,12 +3582,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	if (root->fs_info->sb->s_flags & MS_RDONLY) -		return -EROFS; +	ret = mnt_want_write_file(file); +	if (ret) +		return ret;  	sa = memdup_user(arg, sizeof(*sa)); -	if (IS_ERR(sa)) -		return PTR_ERR(sa); +	if (IS_ERR(sa)) { +		ret = PTR_ERR(sa); +		goto drop_write; +	}  	if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {  		trans = btrfs_start_transaction(root, 2); @@ -3489,14 +3623,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)  		if (err && !ret)  			ret = err;  	} -  out:  	kfree(sa); +drop_write: +	mnt_drop_write_file(file);  	return ret;  } -static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)  { +	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;  	struct btrfs_ioctl_qgroup_assign_args *sa;  	struct btrfs_trans_handle *trans;  	int ret; @@ -3505,12 +3641,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	if (root->fs_info->sb->s_flags & MS_RDONLY) -		return -EROFS; +	ret = mnt_want_write_file(file); +	if (ret) +		return ret;  	sa = memdup_user(arg, sizeof(*sa)); -	if (IS_ERR(sa)) -		return PTR_ERR(sa); +	if (IS_ERR(sa)) { +		ret = PTR_ERR(sa); +		goto drop_write; +	}  	trans = btrfs_join_transaction(root);  	if (IS_ERR(trans)) { @@ -3533,11 +3672,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)  out:  	kfree(sa); +drop_write: +	mnt_drop_write_file(file);  	return ret;  } -static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)  { +	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;  	struct btrfs_ioctl_qgroup_create_args *sa;  	struct btrfs_trans_handle *trans;  	int ret; @@ -3546,12 +3688,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	if (root->fs_info->sb->s_flags & MS_RDONLY) -		return -EROFS; +	ret = mnt_want_write_file(file); +	if (ret) +		return ret;  	sa = memdup_user(arg, sizeof(*sa)); -	if (IS_ERR(sa)) -		return PTR_ERR(sa); +	if (IS_ERR(sa)) { +		ret = PTR_ERR(sa); +		goto drop_write; +	}  	trans = 
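/*
 * btrfs_ioctl_balance() reaches its "out:" label both before and after
 * winning the exclusive-op flag, so it records in need_to_clear_lock
 * whether the flag is its to clear. Stand-alone model of that
 * conditional-cleanup pattern (illustrative, C11 atomics in place of
 * the kernel's atomic_t):
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int op_running;

static int balance(int fail_early)
{
	int ret = 0;
	int need_to_clear = 0;

	if (fail_early) {
		ret = -1;		/* error before taking the flag */
		goto out;
	}
	if (atomic_exchange(&op_running, 1)) {
		ret = -2;		/* -EINPROGRESS in the kernel */
		goto out;
	}
	need_to_clear = 1;

	/* ... the balance itself would run here ... */
out:
	if (need_to_clear)
		atomic_store(&op_running, 0);
	return ret;
}

int main(void)
{
	printf("%d %d\n", balance(1), balance(0));
	return 0;
}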
btrfs_join_transaction(root);  	if (IS_ERR(trans)) { @@ -3573,11 +3718,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)  out:  	kfree(sa); +drop_write: +	mnt_drop_write_file(file);  	return ret;  } -static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)  { +	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;  	struct btrfs_ioctl_qgroup_limit_args *sa;  	struct btrfs_trans_handle *trans;  	int ret; @@ -3587,12 +3735,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	if (root->fs_info->sb->s_flags & MS_RDONLY) -		return -EROFS; +	ret = mnt_want_write_file(file); +	if (ret) +		return ret;  	sa = memdup_user(arg, sizeof(*sa)); -	if (IS_ERR(sa)) -		return PTR_ERR(sa); +	if (IS_ERR(sa)) { +		ret = PTR_ERR(sa); +		goto drop_write; +	}  	trans = btrfs_join_transaction(root);  	if (IS_ERR(trans)) { @@ -3615,6 +3766,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)  out:  	kfree(sa); +drop_write: +	mnt_drop_write_file(file);  	return ret;  } @@ -3735,11 +3888,11 @@ long btrfs_ioctl(struct file *file, unsigned int  	case BTRFS_IOC_DEFRAG_RANGE:  		return btrfs_ioctl_defrag(file, argp);  	case BTRFS_IOC_RESIZE: -		return btrfs_ioctl_resize(root, argp); +		return btrfs_ioctl_resize(file, argp);  	case BTRFS_IOC_ADD_DEV:  		return btrfs_ioctl_add_dev(root, argp);  	case BTRFS_IOC_RM_DEV: -		return btrfs_ioctl_rm_dev(root, argp); +		return btrfs_ioctl_rm_dev(file, argp);  	case BTRFS_IOC_FS_INFO:  		return btrfs_ioctl_fs_info(root, argp);  	case BTRFS_IOC_DEV_INFO: @@ -3768,11 +3921,11 @@ long btrfs_ioctl(struct file *file, unsigned int  		btrfs_sync_fs(file->f_dentry->d_sb, 1);  		return 0;  	case BTRFS_IOC_START_SYNC: -		return btrfs_ioctl_start_sync(file, argp); +		return btrfs_ioctl_start_sync(root, argp);  	case BTRFS_IOC_WAIT_SYNC: -		return btrfs_ioctl_wait_sync(file, argp); +		return btrfs_ioctl_wait_sync(root, argp);  	case BTRFS_IOC_SCRUB: -		return btrfs_ioctl_scrub(root, argp); +		return btrfs_ioctl_scrub(file, argp);  	case BTRFS_IOC_SCRUB_CANCEL:  		return btrfs_ioctl_scrub_cancel(root, argp);  	case BTRFS_IOC_SCRUB_PROGRESS: @@ -3790,13 +3943,15 @@ long btrfs_ioctl(struct file *file, unsigned int  	case BTRFS_IOC_GET_DEV_STATS:  		return btrfs_ioctl_get_dev_stats(root, argp);  	case BTRFS_IOC_QUOTA_CTL: -		return btrfs_ioctl_quota_ctl(root, argp); +		return btrfs_ioctl_quota_ctl(file, argp);  	case BTRFS_IOC_QGROUP_ASSIGN: -		return btrfs_ioctl_qgroup_assign(root, argp); +		return btrfs_ioctl_qgroup_assign(file, argp);  	case BTRFS_IOC_QGROUP_CREATE: -		return btrfs_ioctl_qgroup_create(root, argp); +		return btrfs_ioctl_qgroup_create(file, argp);  	case BTRFS_IOC_QGROUP_LIMIT: -		return btrfs_ioctl_qgroup_limit(root, argp); +		return btrfs_ioctl_qgroup_limit(file, argp); +	case BTRFS_IOC_DEV_REPLACE: +		return btrfs_ioctl_dev_replace(root, argp);  	}  	return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 731e2875ab93..dabca9cc8c2e 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {  	char name[BTRFS_PATH_NAME_MAX + 1];  }; +#define BTRFS_DEVICE_PATH_NAME_MAX 1024 +  #define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)  #define BTRFS_SUBVOL_RDONLY		(1ULL << 1)  #define BTRFS_SUBVOL_QGROUP_INHERIT	(1ULL << 2) @@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {  	
__u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];  }; -#define BTRFS_DEVICE_PATH_NAME_MAX 1024 +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0 +#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID	1 +struct btrfs_ioctl_dev_replace_start_params { +	__u64 srcdevid;	/* in, if 0, use srcdev_name instead */ +	__u64 cont_reading_from_srcdev_mode;	/* in, see #define +						 * above */ +	__u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];	/* in */ +	__u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];	/* in */ +}; + +#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED	0 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED		1 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED		2 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED		3 +#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED		4 +struct btrfs_ioctl_dev_replace_status_params { +	__u64 replace_state;	/* out, see #define above */ +	__u64 progress_1000;	/* out, 0 <= x <= 1000 */ +	__u64 time_started;	/* out, seconds since 1-Jan-1970 */ +	__u64 time_stopped;	/* out, seconds since 1-Jan-1970 */ +	__u64 num_write_errors;	/* out */ +	__u64 num_uncorrectable_read_errors;	/* out */ +}; + +#define BTRFS_IOCTL_DEV_REPLACE_CMD_START			0 +#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS			1 +#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL			2 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR			0 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED		1 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED		2 +struct btrfs_ioctl_dev_replace_args { +	__u64 cmd;	/* in */ +	__u64 result;	/* out */ + +	union { +		struct btrfs_ioctl_dev_replace_start_params start; +		struct btrfs_ioctl_dev_replace_status_params status; +	};	/* in/out */ + +	__u64 spare[64]; +}; +  struct btrfs_ioctl_dev_info_args {  	__u64 devid;				/* in/out */  	__u8 uuid[BTRFS_UUID_SIZE];		/* in/out */ @@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {  			       struct btrfs_ioctl_qgroup_limit_args)  #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \  				      struct btrfs_ioctl_get_dev_stats) +#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ +				    struct btrfs_ioctl_dev_replace_args) +  #endif diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h new file mode 100644 index 000000000000..b7816cefbd13 --- /dev/null +++ b/fs/btrfs/math.h @@ -0,0 +1,44 @@ + +/* + * Copyright (C) 2012 Fujitsu.  All rights reserved. + * Written by Miao Xie <[email protected]> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
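/*
 * A minimal user-space sketch of querying replace progress through the
 * btrfs_ioctl_dev_replace_args ABI defined above. It assumes the
 * structure and BTRFS_IOC_* definitions have been made available to
 * user space (here via a copied "ioctl.h"); fd may be any open file or
 * directory on the filesystem, and progress_1000 maps 0..1000 onto
 * 0..100.0%. Hedged illustration, not a supported tool.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* the btrfs ioctl ABI shown in this patch */

static int print_replace_status(int fd)
{
	struct btrfs_ioctl_dev_replace_args args;

	memset(&args, 0, sizeof(args));
	args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS;
	if (ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args) < 0)
		return -1;
	printf("state %llu, %llu.%llu%% done\n",
	       (unsigned long long)args.status.replace_state,
	       (unsigned long long)(args.status.progress_1000 / 10),
	       (unsigned long long)(args.status.progress_1000 % 10));
	return 0;
}

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "/", O_RDONLY);

	return fd < 0 ? 1 : print_replace_status(fd);
}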
+ */ + +#ifndef __BTRFS_MATH_H +#define __BTRFS_MATH_H + +#include <asm/div64.h> + +static inline u64 div_factor(u64 num, int factor) +{ +	if (factor == 10) +		return num; +	num *= factor; +	do_div(num, 10); +	return num; +} + +static inline u64 div_factor_fine(u64 num, int factor) +{ +	if (factor == 100) +		return num; +	num *= factor; +	do_div(num, 100); +	return num; +} + +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7772f02ba28e..f10731297040 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,  	init_waitqueue_head(&entry->wait);  	INIT_LIST_HEAD(&entry->list);  	INIT_LIST_HEAD(&entry->root_extent_list); +	INIT_LIST_HEAD(&entry->work_list); +	init_completion(&entry->completion);  	trace_btrfs_ordered_extent_add(inode, entry); @@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,  	wake_up(&entry->wait);  } +static void btrfs_run_ordered_extent_work(struct btrfs_work *work) +{ +	struct btrfs_ordered_extent *ordered; + +	ordered = container_of(work, struct btrfs_ordered_extent, flush_work); +	btrfs_start_ordered_extent(ordered->inode, ordered, 1); +	complete(&ordered->completion); +} +  /*   * wait for all the ordered extents in a root.  This is done when balancing   * space between drives.   */  void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)  { -	struct list_head splice; +	struct list_head splice, works;  	struct list_head *cur; -	struct btrfs_ordered_extent *ordered; +	struct btrfs_ordered_extent *ordered, *next;  	struct inode *inode;  	INIT_LIST_HEAD(&splice); +	INIT_LIST_HEAD(&works);  	spin_lock(&root->fs_info->ordered_extent_lock);  	list_splice_init(&root->fs_info->ordered_extents, &splice); @@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)  		spin_unlock(&root->fs_info->ordered_extent_lock);  		if (inode) { -			btrfs_start_ordered_extent(inode, ordered, 1); -			btrfs_put_ordered_extent(ordered); -			if (delay_iput) -				btrfs_add_delayed_iput(inode); -			else -				iput(inode); +			ordered->flush_work.func = btrfs_run_ordered_extent_work; +			list_add_tail(&ordered->work_list, &works); +			btrfs_queue_worker(&root->fs_info->flush_workers, +					   &ordered->flush_work);  		} else {  			btrfs_put_ordered_extent(ordered);  		} +		cond_resched();  		spin_lock(&root->fs_info->ordered_extent_lock);  	}  	spin_unlock(&root->fs_info->ordered_extent_lock); + +	list_for_each_entry_safe(ordered, next, &works, work_list) { +		list_del_init(&ordered->work_list); +		wait_for_completion(&ordered->completion); + +		inode = ordered->inode; +		btrfs_put_ordered_extent(ordered); +		if (delay_iput) +			btrfs_add_delayed_iput(inode); +		else +			iput(inode); + +		cond_resched(); +	}  }  /* @@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)   * extra check to make sure the ordered operation list really is empty   * before we return   */ -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)  {  	struct btrfs_inode *btrfs_inode;  	struct inode *inode;  	struct list_head splice; +	struct list_head works; +	struct btrfs_delalloc_work *work, *next; +	int ret = 0;  	INIT_LIST_HEAD(&splice); +	INIT_LIST_HEAD(&works);  	mutex_lock(&root->fs_info->ordered_operations_mutex);  	spin_lock(&root->fs_info->ordered_extent_lock); @@ -533,6 +562,7 @@ again:  	
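/*
 * The new math.h helpers scale by tenths (div_factor) and hundredths
 * (div_factor_fine), going through do_div() so the 64-bit division also
 * works on 32-bit kernels. Worked example: div_factor(1 GiB, 9) is
 * 1073741824 * 9 / 10 = 966367641, the truncated 90% mark. User-space
 * equivalent for checking the arithmetic:
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t div_factor(uint64_t num, int factor)
{
	if (factor == 10)
		return num;
	return num * factor / 10;	/* do_div(num, 10) in the kernel */
}

int main(void)
{
	printf("%llu\n", (unsigned long long)div_factor(1ULL << 30, 9));
	return 0;
}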
list_splice_init(&root->fs_info->ordered_operations, &splice);  	while (!list_empty(&splice)) { +  		btrfs_inode = list_entry(splice.next, struct btrfs_inode,  				   ordered_operations); @@ -549,15 +579,26 @@ again:  			list_add_tail(&BTRFS_I(inode)->ordered_operations,  			      &root->fs_info->ordered_operations);  		} + +		if (!inode) +			continue;  		spin_unlock(&root->fs_info->ordered_extent_lock); -		if (inode) { -			if (wait) -				btrfs_wait_ordered_range(inode, 0, (u64)-1); -			else -				filemap_flush(inode->i_mapping); -			btrfs_add_delayed_iput(inode); +		work = btrfs_alloc_delalloc_work(inode, wait, 1); +		if (!work) { +			if (list_empty(&BTRFS_I(inode)->ordered_operations)) +				list_add_tail(&btrfs_inode->ordered_operations, +					      &splice); +			spin_lock(&root->fs_info->ordered_extent_lock); +			list_splice_tail(&splice, +					 &root->fs_info->ordered_operations); +			spin_unlock(&root->fs_info->ordered_extent_lock); +			ret = -ENOMEM; +			goto out;  		} +		list_add_tail(&work->list, &works); +		btrfs_queue_worker(&root->fs_info->flush_workers, +				   &work->work);  		cond_resched();  		spin_lock(&root->fs_info->ordered_extent_lock); @@ -566,7 +607,13 @@ again:  		goto again;  	spin_unlock(&root->fs_info->ordered_extent_lock); +out: +	list_for_each_entry_safe(work, next, &works, list) { +		list_del_init(&work->list); +		btrfs_wait_and_free_delalloc_work(work); +	}  	mutex_unlock(&root->fs_info->ordered_operations_mutex); +	return ret;  }  /* @@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)  	u64 end;  	u64 orig_end;  	struct btrfs_ordered_extent *ordered; -	int found;  	if (start + len < start) {  		orig_end = INT_LIMIT(loff_t); @@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)  	filemap_fdatawait_range(inode->i_mapping, start, orig_end);  	end = orig_end; -	found = 0;  	while (1) {  		ordered = btrfs_lookup_first_ordered_extent(inode, end);  		if (!ordered) @@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)  			btrfs_put_ordered_extent(ordered);  			break;  		} -		found++;  		btrfs_start_ordered_extent(inode, ordered, 1);  		end = ordered->file_offset;  		btrfs_put_ordered_extent(ordered); @@ -934,15 +978,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,  	if (last_mod < root->fs_info->last_trans_committed)  		return; -	/* -	 * the transaction is already committing.  Just start the IO and -	 * don't bother with all of this list nonsense -	 */ -	if (trans && root->fs_info->running_transaction->blocked) { -		btrfs_wait_ordered_range(inode, 0, (u64)-1); -		return; -	} -  	spin_lock(&root->fs_info->ordered_extent_lock);  	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {  		list_add_tail(&BTRFS_I(inode)->ordered_operations, @@ -959,6 +994,7 @@ int __init ordered_data_init(void)  				     NULL);  	if (!btrfs_ordered_extent_cache)  		return -ENOMEM; +  	return 0;  } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index dd27a0b46a37..f29d4bf5fbe7 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -76,7 +76,7 @@ struct btrfs_ordered_sum {  #define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ -#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent +#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent  				       * has done its due diligence in updating  				       * the isize. 
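/*
 * btrfs_wait_ordered_extents() and btrfs_run_ordered_operations() above
 * now share a two-phase shape: one pass over the list queues a work
 * item per entry on the flush_workers pool, a second pass blocks on
 * each completion, so the flushes overlap instead of running one after
 * another. Stand-alone model of dispatch-then-wait, using pthreads in
 * place of btrfs_queue_worker()/wait_for_completion():
 */
#include <pthread.h>
#include <stdio.h>

#define NWORKS 4

struct work {
	pthread_t thr;
	int id;
};

static void *flush_one(void *arg)
{
	struct work *w = arg;

	printf("flushing extent %d\n", w->id);	/* stands in for real I/O */
	return NULL;
}

int main(void)
{
	struct work works[NWORKS];
	int i;

	for (i = 0; i < NWORKS; i++) {		/* phase 1: dispatch all */
		works[i].id = i;
		pthread_create(&works[i].thr, NULL, flush_one, &works[i]);
	}
	for (i = 0; i < NWORKS; i++)		/* phase 2: wait for all */
		pthread_join(works[i].thr, NULL);
	return 0;
}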
*/ @@ -128,8 +128,11 @@ struct btrfs_ordered_extent {  	struct list_head root_extent_list;  	struct btrfs_work work; -}; +	struct completion completion; +	struct btrfs_work flush_work; +	struct list_head work_list; +};  /*   * calculates the total size you need to allocate for an ordered sum @@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,  int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,  				struct btrfs_ordered_extent *ordered);  int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);  void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,  				 struct btrfs_root *root,  				 struct inode *inode); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 5e23684887eb..50d95fd190a5 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)  		case BTRFS_DEV_STATS_KEY:  			printk(KERN_INFO "\t\tdevice stats\n");  			break; +		case BTRFS_DEV_REPLACE_KEY: +			printk(KERN_INFO "\t\tdev replace\n"); +			break;  		};  	}  } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a955669519a2..96b93daa0bbb 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -27,6 +27,7 @@  #include "volumes.h"  #include "disk-io.h"  #include "transaction.h" +#include "dev-replace.h"  #undef DEBUG @@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,  	struct reada_extent *re = NULL;  	struct reada_extent *re_exist = NULL;  	struct btrfs_fs_info *fs_info = root->fs_info; -	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;  	struct btrfs_bio *bbio = NULL;  	struct btrfs_device *dev;  	struct btrfs_device *prev_dev; @@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,  	int nzones = 0;  	int i;  	unsigned long index = logical >> PAGE_CACHE_SHIFT; +	int dev_replace_is_ongoing;  	spin_lock(&fs_info->reada_lock);  	re = radix_tree_lookup(&fs_info->reada_tree, index); @@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,  	 * map block  	 */  	length = blocksize; -	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); +	ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length, +			      &bbio, 0);  	if (ret || !bbio || length < blocksize)  		goto error; @@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,  	}  	/* insert extent in reada_tree + all per-device trees, all or nothing */ +	btrfs_dev_replace_lock(&fs_info->dev_replace);  	spin_lock(&fs_info->reada_lock);  	ret = radix_tree_insert(&fs_info->reada_tree, index, re);  	if (ret == -EEXIST) { @@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,  		BUG_ON(!re_exist);  		re_exist->refcnt++;  		spin_unlock(&fs_info->reada_lock); +		btrfs_dev_replace_unlock(&fs_info->dev_replace);  		goto error;  	}  	if (ret) {  		spin_unlock(&fs_info->reada_lock); +		btrfs_dev_replace_unlock(&fs_info->dev_replace);  		goto error;  	}  	prev_dev = NULL; +	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( +			&fs_info->dev_replace);  	for (i = 0; i < nzones; ++i) {  		dev = bbio->stripes[i].dev;  		if (dev == prev_dev) { @@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct 
btrfs_root *root,  			 */  			continue;  		} +		if (!dev->bdev) { +			/* cannot read ahead on missing device */ +			continue; +		} +		if (dev_replace_is_ongoing && +		    dev == fs_info->dev_replace.tgtdev) { +			/* +			 * as this device is selected for reading only as +			 * a last resort, skip it for read ahead. +			 */ +			continue; +		}  		prev_dev = dev;  		ret = radix_tree_insert(&dev->reada_extents, index, re);  		if (ret) {  			while (--i >= 0) {  				dev = bbio->stripes[i].dev;  				BUG_ON(dev == NULL); +				/* ignore whether the entry was inserted */  				radix_tree_delete(&dev->reada_extents, index);  			}  			BUG_ON(fs_info == NULL);  			radix_tree_delete(&fs_info->reada_tree, index);  			spin_unlock(&fs_info->reada_lock); +			btrfs_dev_replace_unlock(&fs_info->dev_replace);  			goto error;  		}  	}  	spin_unlock(&fs_info->reada_lock); +	btrfs_dev_replace_unlock(&fs_info->dev_replace);  	kfree(bbio);  	return re; @@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,  	generation = btrfs_header_generation(node);  	free_extent_buffer(node); -	reada_add_block(rc, start, &max_key, level, generation); +	if (reada_add_block(rc, start, &max_key, level, generation)) { +		kfree(rc); +		return ERR_PTR(-ENOMEM); +	}  	reada_start_machine(root->fs_info); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 776f0aa128fc..300e09ac3659 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,  	struct btrfs_root_item *root_item;  	struct btrfs_path *path;  	struct extent_buffer *leaf; -	unsigned long nr;  	int level;  	int max_level;  	int replaced = 0; @@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,  		BUG_ON(IS_ERR(trans));  		trans->block_rsv = rc->block_rsv; -		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); +		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, +					     BTRFS_RESERVE_FLUSH_ALL);  		if (ret) {  			BUG_ON(ret != -EAGAIN);  			ret = btrfs_commit_transaction(trans, root); @@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,  			       path->slots[level]);  		root_item->drop_level = level; -		nr = trans->blocks_used;  		btrfs_end_transaction_throttle(trans, root); -		btrfs_btree_balance_dirty(root, nr); +		btrfs_btree_balance_dirty(root);  		if (replaced && rc->stage == UPDATE_DATA_PTRS)  			invalidate_extent_cache(root, &key, &next_key); @@ -2155,10 +2154,9 @@ out:  		btrfs_update_reloc_root(trans, root);  	} -	nr = trans->blocks_used;  	btrfs_end_transaction_throttle(trans, root); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  	if (replaced && rc->stage == UPDATE_DATA_PTRS)  		invalidate_extent_cache(root, &key, &next_key); @@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)  again:  	if (!err) {  		num_bytes = rc->merging_rsv_size; -		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); +		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, +					  BTRFS_RESERVE_FLUSH_ALL);  		if (ret)  			err = ret;  	} @@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,  	num_bytes = calcu_metadata_size(rc, node, 1) * 2;  	trans->block_rsv = rc->block_rsv; -	ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); +	ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, +				  BTRFS_RESERVE_FLUSH_ALL);  	if (ret) {  		if 
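/*
 * btrfs_reada_add() now propagates reada_add_block() failure using the
 * kernel's ERR_PTR convention: a small negative errno encoded in the
 * returned pointer, distinguishable from any valid address. Simplified
 * user-space rendition of the include/linux/err.h helpers:
 */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	void *rc = ERR_PTR(-ENOMEM);	/* what the failure path returns */

	if (IS_ERR(rc))
		printf("reada_add failed: %ld\n", PTR_ERR(rc));
	return 0;
}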
(ret == -EAGAIN)  			rc->commit_transaction = 1; @@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,  	struct btrfs_path *path;  	struct btrfs_root *root = fs_info->tree_root;  	struct btrfs_trans_handle *trans; -	unsigned long nr;  	int ret = 0;  	if (inode) @@ -3293,9 +3292,8 @@ truncate:  	ret = btrfs_truncate_free_space_cache(root, trans, path, inode);  	btrfs_free_path(path); -	nr = trans->blocks_used;  	btrfs_end_transaction(trans, root); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  out:  	iput(inode);  	return ret; @@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)  	 * is no reservation in transaction handle.  	 */  	ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, -				  rc->extent_root->nodesize * 256); +				  rc->extent_root->nodesize * 256, +				  BTRFS_RESERVE_FLUSH_ALL);  	if (ret)  		return ret; @@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)  	struct btrfs_trans_handle *trans = NULL;  	struct btrfs_path *path;  	struct btrfs_extent_item *ei; -	unsigned long nr;  	u64 flags;  	u32 item_size;  	int ret; @@ -3828,9 +3826,8 @@ restart:  			ret = btrfs_commit_transaction(trans, rc->extent_root);  			BUG_ON(ret);  		} else { -			nr = trans->blocks_used;  			btrfs_end_transaction_throttle(trans, rc->extent_root); -			btrfs_btree_balance_dirty(rc->extent_root, nr); +			btrfs_btree_balance_dirty(rc->extent_root);  		}  		trans = NULL; @@ -3860,9 +3857,8 @@ restart:  			  GFP_NOFS);  	if (trans) { -		nr = trans->blocks_used;  		btrfs_end_transaction_throttle(trans, rc->extent_root); -		btrfs_btree_balance_dirty(rc->extent_root, nr); +		btrfs_btree_balance_dirty(rc->extent_root);  	}  	if (!err) { @@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,  	struct btrfs_trans_handle *trans;  	struct btrfs_root *root;  	struct btrfs_key key; -	unsigned long nr;  	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;  	int err = 0; @@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,  	err = btrfs_orphan_add(trans, inode);  out: -	nr = trans->blocks_used;  	btrfs_end_transaction(trans, root); -	btrfs_btree_balance_dirty(root, nr); +	btrfs_btree_balance_dirty(root);  	if (err) {  		if (inode)  			iput(inode); @@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)  	       (unsigned long long)rc->block_group->key.objectid,  	       (unsigned long long)rc->block_group->flags); -	btrfs_start_delalloc_inodes(fs_info->tree_root, 0); +	ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); +	if (ret < 0) { +		err = ret; +		goto out; +	}  	btrfs_wait_ordered_extents(fs_info->tree_root, 0);  	while (1) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index eb923d087da7..668af537a3ea 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,  	struct btrfs_root_item *item = &root->root_item;  	struct timespec ct = CURRENT_TIME; -	spin_lock(&root->root_times_lock); +	spin_lock(&root->root_item_lock);  	item->ctransid = cpu_to_le64(trans->transid);  	item->ctime.sec = cpu_to_le64(ct.tv_sec);  	item->ctime.nsec = cpu_to_le32(ct.tv_nsec); -	spin_unlock(&root->root_times_lock); +	spin_unlock(&root->root_item_lock);  } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 27892f67e69b..bdbb94f245c9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1,5 +1,5 @@ 
 /* - * Copyright (C) 2011 STRATO.  All rights reserved. + * Copyright (C) 2011, 2012 STRATO.  All rights reserved.   *   * This program is free software; you can redistribute it and/or   * modify it under the terms of the GNU General Public @@ -25,6 +25,7 @@  #include "transaction.h"  #include "backref.h"  #include "extent_io.h" +#include "dev-replace.h"  #include "check-integrity.h"  #include "rcu-string.h" @@ -42,10 +43,23 @@   */  struct scrub_block; -struct scrub_dev; +struct scrub_ctx; -#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */ -#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */ +/* + * the following three values only influence the performance. + * The last one configures the number of parallel and outstanding I/O + * operations. The first two values configure an upper limit for the number + * of (dynamically allocated) pages that are added to a bio. + */ +#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */ +#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */ +#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */ + +/* + * the following value times PAGE_SIZE needs to be large enough to match the + * largest node/leaf/sector size that shall be supported. + * Values larger than BTRFS_STRIPE_LEN are not supported. + */  #define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */  struct scrub_page { @@ -56,6 +70,8 @@ struct scrub_page {  	u64			generation;  	u64			logical;  	u64			physical; +	u64			physical_for_dev_replace; +	atomic_t		ref_count;  	struct {  		unsigned int	mirror_num:8;  		unsigned int	have_csum:1; @@ -66,23 +82,28 @@ struct scrub_page {  struct scrub_bio {  	int			index; -	struct scrub_dev	*sdev; +	struct scrub_ctx	*sctx; +	struct btrfs_device	*dev;  	struct bio		*bio;  	int			err;  	u64			logical;  	u64			physical; -	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO]; +#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO +	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO]; +#else +	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO]; +#endif  	int			page_count;  	int			next_free;  	struct btrfs_work	work;  };  struct scrub_block { -	struct scrub_page	pagev[SCRUB_MAX_PAGES_PER_BLOCK]; +	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];  	int			page_count;  	atomic_t		outstanding_pages;  	atomic_t		ref_count; /* free mem on transition to zero */ -	struct scrub_dev	*sdev; +	struct scrub_ctx	*sctx;  	struct {  		unsigned int	header_error:1;  		unsigned int	checksum_error:1; @@ -91,23 +112,35 @@ struct scrub_block {  	};  }; -struct scrub_dev { -	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV]; -	struct btrfs_device	*dev; +struct scrub_wr_ctx { +	struct scrub_bio *wr_curr_bio; +	struct btrfs_device *tgtdev; +	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ +	atomic_t flush_all_writes; +	struct mutex wr_lock; +}; + +struct scrub_ctx { +	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX]; +	struct btrfs_root	*dev_root;  	int			first_free;  	int			curr; -	atomic_t		in_flight; -	atomic_t		fixup_cnt; +	atomic_t		bios_in_flight; +	atomic_t		workers_pending;  	spinlock_t		list_lock;  	wait_queue_head_t	list_wait;  	u16			csum_size;  	struct list_head	csum_list;  	atomic_t		cancel_req;  	int			readonly; -	int			pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ +	int			pages_per_rd_bio;  	u32			sectorsize;  	u32			nodesize;  	u32			leafsize; + +	int			is_dev_replace; +	struct scrub_wr_ctx	wr_ctx; +  	/*  	 * statistics  	 */ @@ -116,13 +149,23 @@ struct scrub_dev {  };  struct scrub_fixup_nodatasum { -	struct scrub_dev	*sdev; +	struct scrub_ctx	*sctx; +	struct 
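/*
 * The new scrub limits bound the read pipeline per device: at most
 * SCRUB_BIOS_PER_SCTX (64) bios in flight, each carrying up to
 * SCRUB_PAGES_PER_RD_BIO (32) pages, so with 4 KiB pages
 * 64 * 32 * 4096 bytes = 8 MiB may be outstanding, which is where the
 * "8MB per device in flight" comment above comes from. Quick check:
 */
#include <stdio.h>

int main(void)
{
	long bios = 64, pages_per_bio = 32, page_size = 4096;

	printf("%ld MiB in flight\n",
	       bios * pages_per_bio * page_size / (1024 * 1024));
	return 0;
}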
btrfs_device	*dev;  	u64			logical;  	struct btrfs_root	*root;  	struct btrfs_work	work;  	int			mirror_num;  }; +struct scrub_copy_nocow_ctx { +	struct scrub_ctx	*sctx; +	u64			logical; +	u64			len; +	int			mirror_num; +	u64			physical_for_dev_replace; +	struct btrfs_work	work; +}; +  struct scrub_warning {  	struct btrfs_path	*path;  	u64			extent_item_size; @@ -137,15 +180,20 @@ struct scrub_warning {  }; +static void scrub_pending_bio_inc(struct scrub_ctx *sctx); +static void scrub_pending_bio_dec(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);  static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_dev *sdev, -				     struct btrfs_mapping_tree *map_tree, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, +				     struct btrfs_fs_info *fs_info, +				     struct scrub_block *original_sblock,  				     u64 length, u64 logical, -				     struct scrub_block *sblock); -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, -			       struct scrub_block *sblock, int is_metadata, -			       int have_csum, u8 *csum, u64 generation, -			       u16 csum_size); +				     struct scrub_block *sblocks_for_recheck); +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, +				struct scrub_block *sblock, int is_metadata, +				int have_csum, u8 *csum, u64 generation, +				u16 csum_size);  static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,  					 struct scrub_block *sblock,  					 int is_metadata, int have_csum, @@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,  static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,  					    struct scrub_block *sblock_good,  					    int page_num, int force_write); +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, +					   int page_num);  static int scrub_checksum_data(struct scrub_block *sblock);  static int scrub_checksum_tree_block(struct scrub_block *sblock);  static int scrub_checksum_super(struct scrub_block *sblock);  static void scrub_block_get(struct scrub_block *sblock);  static void scrub_block_put(struct scrub_block *sblock); -static int scrub_add_page_to_bio(struct scrub_dev *sdev, -				 struct scrub_page *spage); -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, -		       u64 physical, u64 flags, u64 gen, int mirror_num, -		       u8 *csum, int force); +static void scrub_page_get(struct scrub_page *spage); +static void scrub_page_put(struct scrub_page *spage); +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, +				    struct scrub_page *spage); +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +		       u64 physical, struct btrfs_device *dev, u64 flags, +		       u64 gen, int mirror_num, u8 *csum, int force, +		       u64 physical_for_dev_replace);  static void scrub_bio_end_io(struct bio *bio, int err);  static void scrub_bio_end_io_worker(struct btrfs_work *work);  static void scrub_block_complete(struct scrub_block *sblock); +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, +			       u64 extent_logical, u64 extent_len, +			       u64 *extent_physical, +			       struct btrfs_device **extent_dev, +			       int *extent_mirror_num); +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, +			 
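/*
 * scrub_pending_bio_inc()/_dec() above pair an atomic counter with a
 * wake_up() on sctx->list_wait so the main scrub loop can sleep until
 * every outstanding bio has drained. Portable model using a mutex and
 * condition variable in place of the kernel wait queue:
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int bios_in_flight;

static void pending_inc(void)
{
	pthread_mutex_lock(&lock);
	bios_in_flight++;
	pthread_mutex_unlock(&lock);
}

static void pending_dec(void)		/* called from bio completion */
{
	pthread_mutex_lock(&lock);
	if (--bios_in_flight == 0)
		pthread_cond_broadcast(&drained);	/* wake_up() */
	pthread_mutex_unlock(&lock);
}

static void wait_until_drained(void)
{
	pthread_mutex_lock(&lock);
	while (bios_in_flight)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pending_inc();
	pending_dec();
	wait_until_drained();
	puts("drained");
	return 0;
}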
     struct scrub_wr_ctx *wr_ctx, +			      struct btrfs_fs_info *fs_info, +			      struct btrfs_device *dev, +			      int is_dev_replace); +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, +				    struct scrub_page *spage); +static void scrub_wr_submit(struct scrub_ctx *sctx); +static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static int write_page_nocow(struct scrub_ctx *sctx, +			    u64 physical_for_dev_replace, struct page *page); +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, +				      void *ctx); +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +			    int mirror_num, u64 physical_for_dev_replace); +static void copy_nocow_pages_worker(struct btrfs_work *work); + + +static void scrub_pending_bio_inc(struct scrub_ctx *sctx) +{ +	atomic_inc(&sctx->bios_in_flight); +} + +static void scrub_pending_bio_dec(struct scrub_ctx *sctx) +{ +	atomic_dec(&sctx->bios_in_flight); +	wake_up(&sctx->list_wait); +} + +/* + * used for workers that require transaction commits (i.e., for the + * NOCOW case) + */ +static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) +{ +	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + +	/* +	 * increment scrubs_running to prevent cancel requests from +	 * completing as long as a worker is running. we must also +	 * increment scrubs_paused to prevent deadlocking on pause +	 * requests used for transactions commits (as the worker uses a +	 * transaction context). it is safe to regard the worker +	 * as paused for all matters practical. effectively, we only +	 * avoid cancellation requests from completing. +	 */ +	mutex_lock(&fs_info->scrub_lock); +	atomic_inc(&fs_info->scrubs_running); +	atomic_inc(&fs_info->scrubs_paused); +	mutex_unlock(&fs_info->scrub_lock); +	atomic_inc(&sctx->workers_pending); +} +/* used for workers that require transaction commits */ +static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) +{ +	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; -static void scrub_free_csums(struct scrub_dev *sdev) +	/* +	 * see scrub_pending_trans_workers_inc() why we're pretending +	 * to be paused in the scrub counters +	 */ +	mutex_lock(&fs_info->scrub_lock); +	atomic_dec(&fs_info->scrubs_running); +	atomic_dec(&fs_info->scrubs_paused); +	mutex_unlock(&fs_info->scrub_lock); +	atomic_dec(&sctx->workers_pending); +	wake_up(&fs_info->scrub_pause_wait); +	wake_up(&sctx->list_wait); +} + +static void scrub_free_csums(struct scrub_ctx *sctx)  { -	while (!list_empty(&sdev->csum_list)) { +	while (!list_empty(&sctx->csum_list)) {  		struct btrfs_ordered_sum *sum; -		sum = list_first_entry(&sdev->csum_list, +		sum = list_first_entry(&sctx->csum_list,  				       struct btrfs_ordered_sum, list);  		list_del(&sum->list);  		kfree(sum);  	}  } -static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) +static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)  {  	int i; -	if (!sdev) +	if (!sctx)  		return; +	scrub_free_wr_ctx(&sctx->wr_ctx); +  	/* this can happen when scrub is cancelled */ -	if (sdev->curr != -1) { -		struct scrub_bio *sbio = sdev->bios[sdev->curr]; +	if (sctx->curr != -1) { +		struct scrub_bio *sbio = sctx->bios[sctx->curr];  		for (i = 0; i < sbio->page_count; i++) { -			BUG_ON(!sbio->pagev[i]); -			BUG_ON(!sbio->pagev[i]->page); +			WARN_ON(!sbio->pagev[i]->page);  			
scrub_block_put(sbio->pagev[i]->sblock);  		}  		bio_put(sbio->bio);  	} -	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { -		struct scrub_bio *sbio = sdev->bios[i]; +	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { +		struct scrub_bio *sbio = sctx->bios[i];  		if (!sbio)  			break;  		kfree(sbio);  	} -	scrub_free_csums(sdev); -	kfree(sdev); +	scrub_free_csums(sctx); +	kfree(sctx);  }  static noinline_for_stack -struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) +struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)  { -	struct scrub_dev *sdev; +	struct scrub_ctx *sctx;  	int		i;  	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; -	int pages_per_bio; +	int pages_per_rd_bio; +	int ret; -	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, -			      bio_get_nr_vecs(dev->bdev)); -	sdev = kzalloc(sizeof(*sdev), GFP_NOFS); -	if (!sdev) +	/* +	 * the setting of pages_per_rd_bio is correct for scrub but might +	 * be wrong for the dev_replace code where we might read from +	 * different devices in the initial huge bios. However, that +	 * code is able to correctly handle the case when adding a page +	 * to a bio fails. +	 */ +	if (dev->bdev) +		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, +					 bio_get_nr_vecs(dev->bdev)); +	else +		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; +	sctx = kzalloc(sizeof(*sctx), GFP_NOFS); +	if (!sctx)  		goto nomem; -	sdev->dev = dev; -	sdev->pages_per_bio = pages_per_bio; -	sdev->curr = -1; -	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { +	sctx->is_dev_replace = is_dev_replace; +	sctx->pages_per_rd_bio = pages_per_rd_bio; +	sctx->curr = -1; +	sctx->dev_root = dev->dev_root; +	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {  		struct scrub_bio *sbio;  		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);  		if (!sbio)  			goto nomem; -		sdev->bios[i] = sbio; +		sctx->bios[i] = sbio;  		sbio->index = i; -		sbio->sdev = sdev; +		sbio->sctx = sctx;  		sbio->page_count = 0;  		sbio->work.func = scrub_bio_end_io_worker; -		if (i != SCRUB_BIOS_PER_DEV-1) -			sdev->bios[i]->next_free = i + 1; +		if (i != SCRUB_BIOS_PER_SCTX - 1) +			sctx->bios[i]->next_free = i + 1;  		else -			sdev->bios[i]->next_free = -1; -	} -	sdev->first_free = 0; -	sdev->nodesize = dev->dev_root->nodesize; -	sdev->leafsize = dev->dev_root->leafsize; -	sdev->sectorsize = dev->dev_root->sectorsize; -	atomic_set(&sdev->in_flight, 0); -	atomic_set(&sdev->fixup_cnt, 0); -	atomic_set(&sdev->cancel_req, 0); -	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); -	INIT_LIST_HEAD(&sdev->csum_list); - -	spin_lock_init(&sdev->list_lock); -	spin_lock_init(&sdev->stat_lock); -	init_waitqueue_head(&sdev->list_wait); -	return sdev; +			sctx->bios[i]->next_free = -1; +	} +	sctx->first_free = 0; +	sctx->nodesize = dev->dev_root->nodesize; +	sctx->leafsize = dev->dev_root->leafsize; +	sctx->sectorsize = dev->dev_root->sectorsize; +	atomic_set(&sctx->bios_in_flight, 0); +	atomic_set(&sctx->workers_pending, 0); +	atomic_set(&sctx->cancel_req, 0); +	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); +	INIT_LIST_HEAD(&sctx->csum_list); + +	spin_lock_init(&sctx->list_lock); +	spin_lock_init(&sctx->stat_lock); +	init_waitqueue_head(&sctx->list_wait); + +	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info, +				 fs_info->dev_replace.tgtdev, is_dev_replace); +	if (ret) { +		scrub_free_ctx(sctx); +		return ERR_PTR(ret); +	} +	return sctx;  nomem: -	scrub_free_dev(sdev); +	scrub_free_ctx(sctx);  	return ERR_PTR(-ENOMEM);  } -static int scrub_print_warning_inode(u64 inum, u64 offset, 
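/*
 * scrub_setup_ctx() threads its preallocated scrub_bios into a singly
 * linked free list through the next_free index: sctx->first_free names
 * the head and -1 terminates the chain, so handing out and returning a
 * bio is O(1) with no further allocation. Index-based free-list model:
 */
#include <stdio.h>

#define NBIOS 4

struct sbio {
	int next_free;
};

static struct sbio bios[NBIOS];
static int first_free;

static void init_free_list(void)
{
	int i;

	for (i = 0; i < NBIOS; i++)
		bios[i].next_free = (i == NBIOS - 1) ? -1 : i + 1;
	first_free = 0;
}

static int get_bio(void)		/* pop the head, -1 when empty */
{
	int i = first_free;

	if (i != -1)
		first_free = bios[i].next_free;
	return i;
}

static void put_bio(int i)		/* push back on the head */
{
	bios[i].next_free = first_free;
	first_free = i;
}

int main(void)
{
	init_free_list();
	put_bio(get_bio());
	printf("head after get/put: %d\n", first_free);
	return 0;
}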
u64 root, void *ctx) +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, +				     void *warn_ctx)  {  	u64 isize;  	u32 nlink; @@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)  	int i;  	struct extent_buffer *eb;  	struct btrfs_inode_item *inode_item; -	struct scrub_warning *swarn = ctx; +	struct scrub_warning *swarn = warn_ctx;  	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;  	struct inode_fs_paths *ipath = NULL;  	struct btrfs_root *local_root; @@ -345,8 +496,8 @@ err:  static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  { -	struct btrfs_device *dev = sblock->sdev->dev; -	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; +	struct btrfs_device *dev; +	struct btrfs_fs_info *fs_info;  	struct btrfs_path *path;  	struct btrfs_key found_key;  	struct extent_buffer *eb; @@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  	const int bufsize = 4096;  	int ret; +	WARN_ON(sblock->page_count < 1); +	dev = sblock->pagev[0]->dev; +	fs_info = sblock->sctx->dev_root->fs_info; +  	path = btrfs_alloc_path();  	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);  	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); -	BUG_ON(sblock->page_count < 1); -	swarn.sector = (sblock->pagev[0].physical) >> 9; -	swarn.logical = sblock->pagev[0].logical; +	swarn.sector = (sblock->pagev[0]->physical) >> 9; +	swarn.logical = sblock->pagev[0]->logical;  	swarn.errstr = errstr; -	swarn.dev = dev; +	swarn.dev = NULL;  	swarn.msg_bufsize = bufsize;  	swarn.scratch_bufsize = bufsize; @@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)  		} while (ret != 1);  	} else {  		swarn.path = path; +		swarn.dev = dev;  		iterate_extent_inodes(fs_info, found_key.objectid,  					extent_item_pos, 1,  					scrub_print_warning_inode, &swarn); @@ -416,11 +571,11 @@ out:  	kfree(swarn.msg_buf);  } -static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)  {  	struct page *page = NULL;  	unsigned long index; -	struct scrub_fixup_nodatasum *fixup = ctx; +	struct scrub_fixup_nodatasum *fixup = fixup_ctx;  	int ret;  	int corrected = 0;  	struct btrfs_key key; @@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)  	}  	if (PageUptodate(page)) { -		struct btrfs_mapping_tree *map_tree; +		struct btrfs_fs_info *fs_info;  		if (PageDirty(page)) {  			/*  			 * we need to write the data to the defect sector. 
the @@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)  			ret = -EIO;  			goto out;  		} -		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; -		ret = repair_io_failure(map_tree, offset, PAGE_SIZE, +		fs_info = BTRFS_I(inode)->root->fs_info; +		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,  					fixup->logical, page,  					fixup->mirror_num);  		unlock_page(page); @@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)  {  	int ret;  	struct scrub_fixup_nodatasum *fixup; -	struct scrub_dev *sdev; +	struct scrub_ctx *sctx;  	struct btrfs_trans_handle *trans = NULL;  	struct btrfs_fs_info *fs_info;  	struct btrfs_path *path;  	int uncorrectable = 0;  	fixup = container_of(work, struct scrub_fixup_nodatasum, work); -	sdev = fixup->sdev; +	sctx = fixup->sctx;  	fs_info = fixup->root->fs_info;  	path = btrfs_alloc_path();  	if (!path) { -		spin_lock(&sdev->stat_lock); -		++sdev->stat.malloc_errors; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		++sctx->stat.malloc_errors; +		spin_unlock(&sctx->stat_lock);  		uncorrectable = 1;  		goto out;  	} @@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)  	}  	WARN_ON(ret != 1); -	spin_lock(&sdev->stat_lock); -	++sdev->stat.corrected_errors; -	spin_unlock(&sdev->stat_lock); +	spin_lock(&sctx->stat_lock); +	++sctx->stat.corrected_errors; +	spin_unlock(&sctx->stat_lock);  out:  	if (trans && !IS_ERR(trans))  		btrfs_end_transaction(trans, fixup->root);  	if (uncorrectable) { -		spin_lock(&sdev->stat_lock); -		++sdev->stat.uncorrectable_errors; -		spin_unlock(&sdev->stat_lock); - +		spin_lock(&sctx->stat_lock); +		++sctx->stat.uncorrectable_errors; +		spin_unlock(&sctx->stat_lock); +		btrfs_dev_replace_stats_inc( +			&sctx->dev_root->fs_info->dev_replace. +			num_uncorrectable_read_errors);  		printk_ratelimited_in_rcu(KERN_ERR  			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",  			(unsigned long long)fixup->logical, -			rcu_str_deref(sdev->dev->name)); +			rcu_str_deref(fixup->dev->name));  	}  	btrfs_free_path(path);  	kfree(fixup); -	/* see caller why we're pretending to be paused in the scrub counters */ -	mutex_lock(&fs_info->scrub_lock); -	atomic_dec(&fs_info->scrubs_running); -	atomic_dec(&fs_info->scrubs_paused); -	mutex_unlock(&fs_info->scrub_lock); -	atomic_dec(&sdev->fixup_cnt); -	wake_up(&fs_info->scrub_pause_wait); -	wake_up(&sdev->list_wait); +	scrub_pending_trans_workers_dec(sctx);  }  /* @@ -614,7 +764,8 @@ out:   */  static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  { -	struct scrub_dev *sdev = sblock_to_check->sdev; +	struct scrub_ctx *sctx = sblock_to_check->sctx; +	struct btrfs_device *dev;  	struct btrfs_fs_info *fs_info;  	u64 length;  	u64 logical; @@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  				      DEFAULT_RATELIMIT_BURST);  	BUG_ON(sblock_to_check->page_count < 1); -	fs_info = sdev->dev->dev_root->fs_info; +	fs_info = sctx->dev_root->fs_info; +	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { +		/* +		 * if we find an error in a super block, we just report it. 
+		 * They will get written with the next transaction commit +		 * anyway +		 */ +		spin_lock(&sctx->stat_lock); +		++sctx->stat.super_errors; +		spin_unlock(&sctx->stat_lock); +		return 0; +	}  	length = sblock_to_check->page_count * PAGE_SIZE; -	logical = sblock_to_check->pagev[0].logical; -	generation = sblock_to_check->pagev[0].generation; -	BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); -	failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; -	is_metadata = !(sblock_to_check->pagev[0].flags & +	logical = sblock_to_check->pagev[0]->logical; +	generation = sblock_to_check->pagev[0]->generation; +	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); +	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; +	is_metadata = !(sblock_to_check->pagev[0]->flags &  			BTRFS_EXTENT_FLAG_DATA); -	have_csum = sblock_to_check->pagev[0].have_csum; -	csum = sblock_to_check->pagev[0].csum; +	have_csum = sblock_to_check->pagev[0]->have_csum; +	csum = sblock_to_check->pagev[0]->csum; +	dev = sblock_to_check->pagev[0]->dev; + +	if (sctx->is_dev_replace && !is_metadata && !have_csum) { +		sblocks_for_recheck = NULL; +		goto nodatasum_case; +	}  	/*  	 * read all mirrors one after the other. This includes to @@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  				     sizeof(*sblocks_for_recheck),  				     GFP_NOFS);  	if (!sblocks_for_recheck) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.malloc_errors++; -		sdev->stat.read_errors++; -		sdev->stat.uncorrectable_errors++; -		spin_unlock(&sdev->stat_lock); -		btrfs_dev_stat_inc_and_print(sdev->dev, -					     BTRFS_DEV_STAT_READ_ERRS); +		spin_lock(&sctx->stat_lock); +		sctx->stat.malloc_errors++; +		sctx->stat.read_errors++; +		sctx->stat.uncorrectable_errors++; +		spin_unlock(&sctx->stat_lock); +		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);  		goto out;  	}  	/* setup the context, map the logical blocks and alloc the pages */ -	ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, +	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,  					logical, sblocks_for_recheck);  	if (ret) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.read_errors++; -		sdev->stat.uncorrectable_errors++; -		spin_unlock(&sdev->stat_lock); -		btrfs_dev_stat_inc_and_print(sdev->dev, -					     BTRFS_DEV_STAT_READ_ERRS); +		spin_lock(&sctx->stat_lock); +		sctx->stat.read_errors++; +		sctx->stat.uncorrectable_errors++; +		spin_unlock(&sctx->stat_lock); +		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);  		goto out;  	}  	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);  	sblock_bad = sblocks_for_recheck + failed_mirror_index;  	/* build and submit the bios for the failed mirror, check checksums */ -	ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, -				  csum, generation, sdev->csum_size); -	if (ret) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.read_errors++; -		sdev->stat.uncorrectable_errors++; -		spin_unlock(&sdev->stat_lock); -		btrfs_dev_stat_inc_and_print(sdev->dev, -					     BTRFS_DEV_STAT_READ_ERRS); -		goto out; -	} +	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, +			    csum, generation, sctx->csum_size);  	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&  	    sblock_bad->no_io_error_seen) { @@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  		 * different bio (usually one of the two latter cases is  		 * the cause)  		 */ -		
spin_lock(&sdev->stat_lock); -		sdev->stat.unverified_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.unverified_errors++; +		spin_unlock(&sctx->stat_lock); +		if (sctx->is_dev_replace) +			scrub_write_block_to_dev_replace(sblock_bad);  		goto out;  	}  	if (!sblock_bad->no_io_error_seen) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.read_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.read_errors++; +		spin_unlock(&sctx->stat_lock);  		if (__ratelimit(&_rs))  			scrub_print_warning("i/o error", sblock_to_check); -		btrfs_dev_stat_inc_and_print(sdev->dev, -					     BTRFS_DEV_STAT_READ_ERRS); +		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);  	} else if (sblock_bad->checksum_error) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.csum_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.csum_errors++; +		spin_unlock(&sctx->stat_lock);  		if (__ratelimit(&_rs))  			scrub_print_warning("checksum error", sblock_to_check); -		btrfs_dev_stat_inc_and_print(sdev->dev, +		btrfs_dev_stat_inc_and_print(dev,  					     BTRFS_DEV_STAT_CORRUPTION_ERRS);  	} else if (sblock_bad->header_error) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.verify_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.verify_errors++; +		spin_unlock(&sctx->stat_lock);  		if (__ratelimit(&_rs))  			scrub_print_warning("checksum/header error",  					    sblock_to_check);  		if (sblock_bad->generation_error) -			btrfs_dev_stat_inc_and_print(sdev->dev, +			btrfs_dev_stat_inc_and_print(dev,  				BTRFS_DEV_STAT_GENERATION_ERRS);  		else -			btrfs_dev_stat_inc_and_print(sdev->dev, +			btrfs_dev_stat_inc_and_print(dev,  				BTRFS_DEV_STAT_CORRUPTION_ERRS);  	} -	if (sdev->readonly) +	if (sctx->readonly && !sctx->is_dev_replace)  		goto did_not_correct_error;  	if (!is_metadata && !have_csum) {  		struct scrub_fixup_nodatasum *fixup_nodatasum; +nodatasum_case: +		WARN_ON(sctx->is_dev_replace); +  		/*  		 * !is_metadata and !have_csum, this means that the data  		 * might not be COW'ed, that it might be modified @@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);  		if (!fixup_nodatasum)  			goto did_not_correct_error; -		fixup_nodatasum->sdev = sdev; +		fixup_nodatasum->sctx = sctx; +		fixup_nodatasum->dev = dev;  		fixup_nodatasum->logical = logical;  		fixup_nodatasum->root = fs_info->extent_root;  		fixup_nodatasum->mirror_num = failed_mirror_index + 1; -		/* -		 * increment scrubs_running to prevent cancel requests from -		 * completing as long as a fixup worker is running. we must also -		 * increment scrubs_paused to prevent deadlocking on pause -		 * requests used for transactions commits (as the worker uses a -		 * transaction context). it is safe to regard the fixup worker -		 * as paused for all matters practical. effectively, we only -		 * avoid cancellation requests from completing. 
-		 */ -		mutex_lock(&fs_info->scrub_lock); -		atomic_inc(&fs_info->scrubs_running); -		atomic_inc(&fs_info->scrubs_paused); -		mutex_unlock(&fs_info->scrub_lock); -		atomic_inc(&sdev->fixup_cnt); +		scrub_pending_trans_workers_inc(sctx);  		fixup_nodatasum->work.func = scrub_fixup_nodatasum;  		btrfs_queue_worker(&fs_info->scrub_workers,  				   &fixup_nodatasum->work); @@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  	/*  	 * now build and submit the bios for the other mirrors, check -	 * checksums -	 */ -	for (mirror_index = 0; -	     mirror_index < BTRFS_MAX_MIRRORS && -	     sblocks_for_recheck[mirror_index].page_count > 0; -	     mirror_index++) { -		if (mirror_index == failed_mirror_index) -			continue; - -		/* build and submit the bios, check checksums */ -		ret = scrub_recheck_block(fs_info, -					  sblocks_for_recheck + mirror_index, -					  is_metadata, have_csum, csum, -					  generation, sdev->csum_size); -		if (ret) -			goto did_not_correct_error; -	} - -	/* -	 * first try to pick the mirror which is completely without I/O +	 * checksums. +	 * First try to pick the mirror which is completely without I/O  	 * errors and also does not have a checksum error.  	 * If one is found, and if a checksum is present, the full block  	 * that is known to contain an error is rewritten. Afterwards @@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  	     mirror_index < BTRFS_MAX_MIRRORS &&  	     sblocks_for_recheck[mirror_index].page_count > 0;  	     mirror_index++) { -		struct scrub_block *sblock_other = sblocks_for_recheck + -						   mirror_index; +		struct scrub_block *sblock_other; + +		if (mirror_index == failed_mirror_index) +			continue; +		sblock_other = sblocks_for_recheck + mirror_index; + +		/* build and submit the bios, check checksums */ +		scrub_recheck_block(fs_info, sblock_other, is_metadata, +				    have_csum, csum, generation, +				    sctx->csum_size);  		if (!sblock_other->header_error &&  		    !sblock_other->checksum_error &&  		    sblock_other->no_io_error_seen) { -			int force_write = is_metadata || have_csum; - -			ret = scrub_repair_block_from_good_copy(sblock_bad, -								sblock_other, -								force_write); +			if (sctx->is_dev_replace) { +				scrub_write_block_to_dev_replace(sblock_other); +			} else { +				int force_write = is_metadata || have_csum; + +				ret = scrub_repair_block_from_good_copy( +						sblock_bad, sblock_other, +						force_write); +			}  			if (0 == ret)  				goto corrected_error;  		}  	}  	/* -	 * in case of I/O errors in the area that is supposed to be +	 * for dev_replace, pick good pages and write to the target device. +	 */ +	if (sctx->is_dev_replace) { +		success = 1; +		for (page_num = 0; page_num < sblock_bad->page_count; +		     page_num++) { +			int sub_success; + +			sub_success = 0; +			for (mirror_index = 0; +			     mirror_index < BTRFS_MAX_MIRRORS && +			     sblocks_for_recheck[mirror_index].page_count > 0; +			     mirror_index++) { +				struct scrub_block *sblock_other = +					sblocks_for_recheck + mirror_index; +				struct scrub_page *page_other = +					sblock_other->pagev[page_num]; + +				if (!page_other->io_error) { +					ret = scrub_write_page_to_dev_replace( +							sblock_other, page_num); +					if (ret == 0) { +						/* succeeded for this page */ +						sub_success = 1; +						break; +					} else { +						btrfs_dev_replace_stats_inc( +							&sctx->dev_root-> +							fs_info->dev_replace. 
+							num_write_errors); +					} +				} +			} + +			if (!sub_success) { +				/* +				 * did not find a mirror to fetch the page +				 * from. scrub_write_page_to_dev_replace() +				 * handles this case (page->io_error), by +				 * filling the block with zeros before +				 * submitting the write request +				 */ +				success = 0; +				ret = scrub_write_page_to_dev_replace( +						sblock_bad, page_num); +				if (ret) +					btrfs_dev_replace_stats_inc( +						&sctx->dev_root->fs_info-> +						dev_replace.num_write_errors); +			} +		} + +		goto out; +	} + +	/* +	 * for regular scrub, repair those pages that are errored. +	 * In case of I/O errors in the area that is supposed to be  	 * repaired, continue by picking good copies of those pages.  	 * Select the good pages from mirrors to rewrite bad pages from  	 * the area to fix. Afterwards verify the checksum of the block @@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  	success = 1;  	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { -		struct scrub_page *page_bad = sblock_bad->pagev + page_num; +		struct scrub_page *page_bad = sblock_bad->pagev[page_num];  		if (!page_bad->io_error)  			continue; @@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  		     mirror_index++) {  			struct scrub_block *sblock_other = sblocks_for_recheck +  							   mirror_index; -			struct scrub_page *page_other = sblock_other->pagev + -							page_num; +			struct scrub_page *page_other = sblock_other->pagev[ +							page_num];  			if (!page_other->io_error) {  				ret = scrub_repair_page_from_good_copy( @@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  			 * is verified, but most likely the data comes out  			 * of the page cache.  			 
*/ -			ret = scrub_recheck_block(fs_info, sblock_bad, -						  is_metadata, have_csum, csum, -						  generation, sdev->csum_size); -			if (!ret && !sblock_bad->header_error && +			scrub_recheck_block(fs_info, sblock_bad, +					    is_metadata, have_csum, csum, +					    generation, sctx->csum_size); +			if (!sblock_bad->header_error &&  			    !sblock_bad->checksum_error &&  			    sblock_bad->no_io_error_seen)  				goto corrected_error; @@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)  				goto did_not_correct_error;  		} else {  corrected_error: -			spin_lock(&sdev->stat_lock); -			sdev->stat.corrected_errors++; -			spin_unlock(&sdev->stat_lock); +			spin_lock(&sctx->stat_lock); +			sctx->stat.corrected_errors++; +			spin_unlock(&sctx->stat_lock);  			printk_ratelimited_in_rcu(KERN_ERR  				"btrfs: fixed up error at logical %llu on dev %s\n",  				(unsigned long long)logical, -				rcu_str_deref(sdev->dev->name)); +				rcu_str_deref(dev->name));  		}  	} else {  did_not_correct_error: -		spin_lock(&sdev->stat_lock); -		sdev->stat.uncorrectable_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.uncorrectable_errors++; +		spin_unlock(&sctx->stat_lock);  		printk_ratelimited_in_rcu(KERN_ERR  			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",  			(unsigned long long)logical, -			rcu_str_deref(sdev->dev->name)); +			rcu_str_deref(dev->name));  	}  out: @@ -966,11 +1166,11 @@ out:  						     mirror_index;  			int page_index; -			for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; -			     page_index++) -				if (sblock->pagev[page_index].page) -					__free_page( -						sblock->pagev[page_index].page); +			for (page_index = 0; page_index < sblock->page_count; +			     page_index++) { +				sblock->pagev[page_index]->sblock = NULL; +				scrub_page_put(sblock->pagev[page_index]); +			}  		}  		kfree(sblocks_for_recheck);  	} @@ -978,8 +1178,9 @@ out:  	return 0;  } -static int scrub_setup_recheck_block(struct scrub_dev *sdev, -				     struct btrfs_mapping_tree *map_tree, +static int scrub_setup_recheck_block(struct scrub_ctx *sctx, +				     struct btrfs_fs_info *fs_info, +				     struct scrub_block *original_sblock,  				     u64 length, u64 logical,  				     struct scrub_block *sblocks_for_recheck)  { @@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,  	int ret;  	/* -	 * note: the three members sdev, ref_count and outstanding_pages +	 * note: the two members ref_count and outstanding_pages  	 * are not used (and not set) in the blocks that are used for  	 * the recheck procedure  	 */ @@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,  		 * with a length of PAGE_SIZE, each returned stripe  		 * represents one mirror  		 */ -		ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, -				      &bbio, 0); +		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, +				      &mapped_length, &bbio, 0);  		if (ret || !bbio || mapped_length < sublen) {  			kfree(bbio);  			return -EIO;  		} -		BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); +		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);  		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;  		     mirror_index++) {  			struct scrub_block *sblock; @@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,  				continue;  			sblock = sblocks_for_recheck + mirror_index; -			page = sblock->pagev + page_index; +			sblock->sctx = 
sctx; +			page = kzalloc(sizeof(*page), GFP_NOFS); +			if (!page) { +leave_nomem: +				spin_lock(&sctx->stat_lock); +				sctx->stat.malloc_errors++; +				spin_unlock(&sctx->stat_lock); +				kfree(bbio); +				return -ENOMEM; +			} +			scrub_page_get(page); +			sblock->pagev[page_index] = page;  			page->logical = logical;  			page->physical = bbio->stripes[mirror_index].physical; +			BUG_ON(page_index >= original_sblock->page_count); +			page->physical_for_dev_replace = +				original_sblock->pagev[page_index]-> +				physical_for_dev_replace;  			/* for missing devices, dev->bdev is NULL */  			page->dev = bbio->stripes[mirror_index].dev;  			page->mirror_num = mirror_index + 1; -			page->page = alloc_page(GFP_NOFS); -			if (!page->page) { -				spin_lock(&sdev->stat_lock); -				sdev->stat.malloc_errors++; -				spin_unlock(&sdev->stat_lock); -				kfree(bbio); -				return -ENOMEM; -			}  			sblock->page_count++; +			page->page = alloc_page(GFP_NOFS); +			if (!page->page) +				goto leave_nomem;  		}  		kfree(bbio);  		length -= sublen; @@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,   * to take those pages that are not errored from all the mirrors so that   * the pages that are errored in the just handled mirror can be repaired.   */ -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, -			       struct scrub_block *sblock, int is_metadata, -			       int have_csum, u8 *csum, u64 generation, -			       u16 csum_size) +static void scrub_recheck_block(struct btrfs_fs_info *fs_info, +				struct scrub_block *sblock, int is_metadata, +				int have_csum, u8 *csum, u64 generation, +				u16 csum_size)  {  	int page_num; @@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,  	for (page_num = 0; page_num < sblock->page_count; page_num++) {  		struct bio *bio; -		int ret; -		struct scrub_page *page = sblock->pagev + page_num; +		struct scrub_page *page = sblock->pagev[page_num];  		DECLARE_COMPLETION_ONSTACK(complete);  		if (page->dev->bdev == NULL) { @@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,  			continue;  		} -		BUG_ON(!page->page); +		WARN_ON(!page->page);  		bio = bio_alloc(GFP_NOFS, 1); -		if (!bio) -			return -EIO; +		if (!bio) { +			page->io_error = 1; +			sblock->no_io_error_seen = 0; +			continue; +		}  		bio->bi_bdev = page->dev->bdev;  		bio->bi_sector = page->physical >> 9;  		bio->bi_end_io = scrub_complete_bio_end_io;  		bio->bi_private = &complete; -		ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); -		if (PAGE_SIZE != ret) { -			bio_put(bio); -			return -EIO; -		} +		bio_add_page(bio, page->page, PAGE_SIZE, 0);  		btrfsic_submit_bio(READ, bio);  		/* this will also unplug the queue */ @@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,  					     have_csum, csum, generation,  					     csum_size); -	return 0; +	return;  }  static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, @@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,  	struct btrfs_root *root = fs_info->extent_root;  	void *mapped_buffer; -	BUG_ON(!sblock->pagev[0].page); +	WARN_ON(!sblock->pagev[0]->page);  	if (is_metadata) {  		struct btrfs_header *h; -		mapped_buffer = kmap_atomic(sblock->pagev[0].page); +		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);  		h = (struct btrfs_header *)mapped_buffer; -		if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || +		if (sblock->pagev[0]->logical != 
le64_to_cpu(h->bytenr) ||  		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||  		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,  			   BTRFS_UUID_SIZE)) { @@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,  		if (!have_csum)  			return; -		mapped_buffer = kmap_atomic(sblock->pagev[0].page); +		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);  	}  	for (page_num = 0;;) { @@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,  		page_num++;  		if (page_num >= sblock->page_count)  			break; -		BUG_ON(!sblock->pagev[page_num].page); +		WARN_ON(!sblock->pagev[page_num]->page); -		mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); +		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);  	}  	btrfs_csum_final(crc, calculated_csum); @@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,  					    struct scrub_block *sblock_good,  					    int page_num, int force_write)  { -	struct scrub_page *page_bad = sblock_bad->pagev + page_num; -	struct scrub_page *page_good = sblock_good->pagev + page_num; +	struct scrub_page *page_bad = sblock_bad->pagev[page_num]; +	struct scrub_page *page_good = sblock_good->pagev[page_num]; -	BUG_ON(sblock_bad->pagev[page_num].page == NULL); -	BUG_ON(sblock_good->pagev[page_num].page == NULL); +	BUG_ON(page_bad->page == NULL); +	BUG_ON(page_good->page == NULL);  	if (force_write || sblock_bad->header_error ||  	    sblock_bad->checksum_error || page_bad->io_error) {  		struct bio *bio;  		int ret;  		DECLARE_COMPLETION_ONSTACK(complete); +		if (!page_bad->dev->bdev) { +			printk_ratelimited(KERN_WARNING +				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n"); +			return -EIO; +		} +  		bio = bio_alloc(GFP_NOFS, 1);  		if (!bio)  			return -EIO; @@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,  		if (!bio_flagged(bio, BIO_UPTODATE)) {  			btrfs_dev_stat_inc_and_print(page_bad->dev,  				BTRFS_DEV_STAT_WRITE_ERRS); +			btrfs_dev_replace_stats_inc( +				&sblock_bad->sctx->dev_root->fs_info-> +				dev_replace.num_write_errors);  			bio_put(bio);  			return -EIO;  		} @@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,  	return 0;  } -static void scrub_checksum(struct scrub_block *sblock) +static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) +{ +	int page_num; + +	for (page_num = 0; page_num < sblock->page_count; page_num++) { +		int ret; + +		ret = scrub_write_page_to_dev_replace(sblock, page_num); +		if (ret) +			btrfs_dev_replace_stats_inc( +				&sblock->sctx->dev_root->fs_info->dev_replace. 
+				num_write_errors); +	} +} + +static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, +					   int page_num) +{ +	struct scrub_page *spage = sblock->pagev[page_num]; + +	BUG_ON(spage->page == NULL); +	if (spage->io_error) { +		void *mapped_buffer = kmap_atomic(spage->page); + +		memset(mapped_buffer, 0, PAGE_CACHE_SIZE); +		flush_dcache_page(spage->page); +		kunmap_atomic(mapped_buffer); +	} +	return scrub_add_page_to_wr_bio(sblock->sctx, spage); +} + +static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, +				    struct scrub_page *spage) +{ +	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; +	struct scrub_bio *sbio; +	int ret; + +	mutex_lock(&wr_ctx->wr_lock); +again: +	if (!wr_ctx->wr_curr_bio) { +		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), +					      GFP_NOFS); +		if (!wr_ctx->wr_curr_bio) { +			mutex_unlock(&wr_ctx->wr_lock); +			return -ENOMEM; +		} +		wr_ctx->wr_curr_bio->sctx = sctx; +		wr_ctx->wr_curr_bio->page_count = 0; +	} +	sbio = wr_ctx->wr_curr_bio; +	if (sbio->page_count == 0) { +		struct bio *bio; + +		sbio->physical = spage->physical_for_dev_replace; +		sbio->logical = spage->logical; +		sbio->dev = wr_ctx->tgtdev; +		bio = sbio->bio; +		if (!bio) { +			bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio); +			if (!bio) { +				mutex_unlock(&wr_ctx->wr_lock); +				return -ENOMEM; +			} +			sbio->bio = bio; +		} + +		bio->bi_private = sbio; +		bio->bi_end_io = scrub_wr_bio_end_io; +		bio->bi_bdev = sbio->dev->bdev; +		bio->bi_sector = sbio->physical >> 9; +		sbio->err = 0; +	} else if (sbio->physical + sbio->page_count * PAGE_SIZE != +		   spage->physical_for_dev_replace || +		   sbio->logical + sbio->page_count * PAGE_SIZE != +		   spage->logical) { +		scrub_wr_submit(sctx); +		goto again; +	} + +	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); +	if (ret != PAGE_SIZE) { +		if (sbio->page_count < 1) { +			bio_put(sbio->bio); +			sbio->bio = NULL; +			mutex_unlock(&wr_ctx->wr_lock); +			return -EIO; +		} +		scrub_wr_submit(sctx); +		goto again; +	} + +	sbio->pagev[sbio->page_count] = spage; +	scrub_page_get(spage); +	sbio->page_count++; +	if (sbio->page_count == wr_ctx->pages_per_wr_bio) +		scrub_wr_submit(sctx); +	mutex_unlock(&wr_ctx->wr_lock); + +	return 0; +} + +static void scrub_wr_submit(struct scrub_ctx *sctx) +{ +	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx; +	struct scrub_bio *sbio; + +	if (!wr_ctx->wr_curr_bio) +		return; + +	sbio = wr_ctx->wr_curr_bio; +	wr_ctx->wr_curr_bio = NULL; +	WARN_ON(!sbio->bio->bi_bdev); +	scrub_pending_bio_inc(sctx); +	/* process all writes in a single worker thread. 
The block layer then +	 * orders the requests before sending them to the driver; this +	 * doubled the write performance on spinning disks when measured +	 * with Linux 3.5. */ +	btrfsic_submit_bio(WRITE, sbio->bio); +} + +static void scrub_wr_bio_end_io(struct bio *bio, int err) +{ +	struct scrub_bio *sbio = bio->bi_private; +	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; + +	sbio->err = err; +	sbio->bio = bio; + +	sbio->work.func = scrub_wr_bio_end_io_worker; +	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); +} + +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +{ +	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); +	struct scrub_ctx *sctx = sbio->sctx; +	int i; + +	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); +	if (sbio->err) { +		struct btrfs_dev_replace *dev_replace = +			&sbio->sctx->dev_root->fs_info->dev_replace; + +		for (i = 0; i < sbio->page_count; i++) { +			struct scrub_page *spage = sbio->pagev[i]; + +			spage->io_error = 1; +			btrfs_dev_replace_stats_inc(&dev_replace-> +						    num_write_errors); +		} +	} + +	for (i = 0; i < sbio->page_count; i++) +		scrub_page_put(sbio->pagev[i]); + +	bio_put(sbio->bio); +	kfree(sbio); +	scrub_pending_bio_dec(sctx); +} + +static int scrub_checksum(struct scrub_block *sblock)  {  	u64 flags;  	int ret; -	BUG_ON(sblock->page_count < 1); -	flags = sblock->pagev[0].flags; +	WARN_ON(sblock->page_count < 1); +	flags = sblock->pagev[0]->flags;  	ret = 0;  	if (flags & BTRFS_EXTENT_FLAG_DATA)  		ret = scrub_checksum_data(sblock); @@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)  		WARN_ON(1);  	if (ret)  		scrub_handle_errored_block(sblock); + +	return ret;  }  static int scrub_checksum_data(struct scrub_block *sblock)  { -	struct scrub_dev *sdev = sblock->sdev; +	struct scrub_ctx *sctx = sblock->sctx;  	u8 csum[BTRFS_CSUM_SIZE];  	u8 *on_disk_csum;  	struct page *page;  	void *buffer;  	u32 crc = ~(u32)0;  	int fail = 0; -	struct btrfs_root *root = sdev->dev->dev_root; +	struct btrfs_root *root = sctx->dev_root;  	u64 len;  	int index;  	BUG_ON(sblock->page_count < 1); -	if (!sblock->pagev[0].have_csum) +	if (!sblock->pagev[0]->have_csum)  		return 0; -	on_disk_csum = sblock->pagev[0].csum; -	page = sblock->pagev[0].page; +	on_disk_csum = sblock->pagev[0]->csum; +	page = sblock->pagev[0]->page;  	buffer = kmap_atomic(page); -	len = sdev->sectorsize; +	len = sctx->sectorsize;  	index = 0;  	for (;;) {  		u64 l = min_t(u64, len, PAGE_SIZE); @@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)  			break;  		index++;  		BUG_ON(index >= sblock->page_count); -		BUG_ON(!sblock->pagev[index].page); -		page = sblock->pagev[index].page; +		BUG_ON(!sblock->pagev[index]->page); +		page = sblock->pagev[index]->page;  		buffer = kmap_atomic(page);  	}  	btrfs_csum_final(crc, csum); -	if (memcmp(csum, on_disk_csum, sdev->csum_size)) +	if (memcmp(csum, on_disk_csum, sctx->csum_size))  		fail = 1;  	return fail; @@ -1304,9 +1685,9 @@  static int scrub_checksum_tree_block(struct scrub_block *sblock)  { -	struct scrub_dev *sdev = sblock->sdev; +	struct scrub_ctx *sctx = sblock->sctx;  	struct btrfs_header *h; -	struct btrfs_root *root = sdev->dev->dev_root; +	struct btrfs_root *root = sctx->dev_root;  	struct btrfs_fs_info *fs_info = root->fs_info;  	u8 calculated_csum[BTRFS_CSUM_SIZE];  	u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1321,10 +1702,10 @@ static int
scrub_checksum_tree_block(struct scrub_block *sblock)  	int index;  	BUG_ON(sblock->page_count < 1); -	page = sblock->pagev[0].page; +	page = sblock->pagev[0]->page;  	mapped_buffer = kmap_atomic(page);  	h = (struct btrfs_header *)mapped_buffer; -	memcpy(on_disk_csum, h->csum, sdev->csum_size); +	memcpy(on_disk_csum, h->csum, sctx->csum_size);  	/*  	 * we don't use the getter functions here, as we @@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)  	 * b) the page is already kmapped  	 */ -	if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) +	if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))  		++fail; -	if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) +	if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))  		++fail;  	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)  		   BTRFS_UUID_SIZE))  		++fail; -	BUG_ON(sdev->nodesize != sdev->leafsize); -	len = sdev->nodesize - BTRFS_CSUM_SIZE; +	WARN_ON(sctx->nodesize != sctx->leafsize); +	len = sctx->nodesize - BTRFS_CSUM_SIZE;  	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;  	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;  	index = 0; @@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)  			break;  		index++;  		BUG_ON(index >= sblock->page_count); -		BUG_ON(!sblock->pagev[index].page); -		page = sblock->pagev[index].page; +		BUG_ON(!sblock->pagev[index]->page); +		page = sblock->pagev[index]->page;  		mapped_buffer = kmap_atomic(page);  		mapped_size = PAGE_SIZE;  		p = mapped_buffer;  	}  	btrfs_csum_final(crc, calculated_csum); -	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) +	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))  		++crc_fail;  	return fail || crc_fail; @@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)  static int scrub_checksum_super(struct scrub_block *sblock)  {  	struct btrfs_super_block *s; -	struct scrub_dev *sdev = sblock->sdev; -	struct btrfs_root *root = sdev->dev->dev_root; +	struct scrub_ctx *sctx = sblock->sctx; +	struct btrfs_root *root = sctx->dev_root;  	struct btrfs_fs_info *fs_info = root->fs_info;  	u8 calculated_csum[BTRFS_CSUM_SIZE];  	u8 on_disk_csum[BTRFS_CSUM_SIZE]; @@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)  	int index;  	BUG_ON(sblock->page_count < 1); -	page = sblock->pagev[0].page; +	page = sblock->pagev[0]->page;  	mapped_buffer = kmap_atomic(page);  	s = (struct btrfs_super_block *)mapped_buffer; -	memcpy(on_disk_csum, s->csum, sdev->csum_size); +	memcpy(on_disk_csum, s->csum, sctx->csum_size); -	if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) +	if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))  		++fail_cor; -	if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) +	if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))  		++fail_gen;  	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)  			break;  		index++;  		BUG_ON(index >= sblock->page_count); -		BUG_ON(!sblock->pagev[index].page); -		page = sblock->pagev[index].page; +		BUG_ON(!sblock->pagev[index]->page); +		page = sblock->pagev[index]->page;  		mapped_buffer = kmap_atomic(page);  		mapped_size = PAGE_SIZE;  		p = mapped_buffer;  	}  	btrfs_csum_final(crc, calculated_csum); -	if (memcmp(calculated_csum, on_disk_csum, 
sdev->csum_size)) +	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))  		++fail_cor;  	if (fail_cor + fail_gen) { @@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)  		 * They will get written with the next transaction commit  		 * anyway  		 */ -		spin_lock(&sdev->stat_lock); -		++sdev->stat.super_errors; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		++sctx->stat.super_errors; +		spin_unlock(&sctx->stat_lock);  		if (fail_cor) -			btrfs_dev_stat_inc_and_print(sdev->dev, +			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,  				BTRFS_DEV_STAT_CORRUPTION_ERRS);  		else -			btrfs_dev_stat_inc_and_print(sdev->dev, +			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,  				BTRFS_DEV_STAT_GENERATION_ERRS);  	} @@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)  		int i;  		for (i = 0; i < sblock->page_count; i++) -			if (sblock->pagev[i].page) -				__free_page(sblock->pagev[i].page); +			scrub_page_put(sblock->pagev[i]);  		kfree(sblock);  	}  } -static void scrub_submit(struct scrub_dev *sdev) +static void scrub_page_get(struct scrub_page *spage) +{ +	atomic_inc(&spage->ref_count); +} + +static void scrub_page_put(struct scrub_page *spage) +{ +	if (atomic_dec_and_test(&spage->ref_count)) { +		if (spage->page) +			__free_page(spage->page); +		kfree(spage); +	} +} + +static void scrub_submit(struct scrub_ctx *sctx)  {  	struct scrub_bio *sbio; -	if (sdev->curr == -1) +	if (sctx->curr == -1)  		return; -	sbio = sdev->bios[sdev->curr]; -	sdev->curr = -1; -	atomic_inc(&sdev->in_flight); +	sbio = sctx->bios[sctx->curr]; +	sctx->curr = -1; +	scrub_pending_bio_inc(sctx); -	btrfsic_submit_bio(READ, sbio->bio); +	if (!sbio->bio->bi_bdev) { +		/* +		 * this case should not happen. If btrfs_map_block() is +		 * wrong, it could happen for dev-replace operations on +		 * missing devices when no mirrors are available, but in +		 * this case it should already fail the mount. +		 * This case is handled correctly (but _very_ slowly). 
+		 */ +		printk_ratelimited(KERN_WARNING +			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); +		bio_endio(sbio->bio, -EIO); +	} else { +		btrfsic_submit_bio(READ, sbio->bio); +	}  } -static int scrub_add_page_to_bio(struct scrub_dev *sdev, -				 struct scrub_page *spage) +static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, +				    struct scrub_page *spage)  {  	struct scrub_block *sblock = spage->sblock;  	struct scrub_bio *sbio; @@ -1494,28 +1901,29 @@ again:  	/*  	 * grab a fresh bio or wait for one to become available  	 */ -	while (sdev->curr == -1) { -		spin_lock(&sdev->list_lock); -		sdev->curr = sdev->first_free; -		if (sdev->curr != -1) { -			sdev->first_free = sdev->bios[sdev->curr]->next_free; -			sdev->bios[sdev->curr]->next_free = -1; -			sdev->bios[sdev->curr]->page_count = 0; -			spin_unlock(&sdev->list_lock); +	while (sctx->curr == -1) { +		spin_lock(&sctx->list_lock); +		sctx->curr = sctx->first_free; +		if (sctx->curr != -1) { +			sctx->first_free = sctx->bios[sctx->curr]->next_free; +			sctx->bios[sctx->curr]->next_free = -1; +			sctx->bios[sctx->curr]->page_count = 0; +			spin_unlock(&sctx->list_lock);  		} else { -			spin_unlock(&sdev->list_lock); -			wait_event(sdev->list_wait, sdev->first_free != -1); +			spin_unlock(&sctx->list_lock); +			wait_event(sctx->list_wait, sctx->first_free != -1);  		}  	} -	sbio = sdev->bios[sdev->curr]; +	sbio = sctx->bios[sctx->curr];  	if (sbio->page_count == 0) {  		struct bio *bio;  		sbio->physical = spage->physical;  		sbio->logical = spage->logical; +		sbio->dev = spage->dev;  		bio = sbio->bio;  		if (!bio) { -			bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); +			bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);  			if (!bio)  				return -ENOMEM;  			sbio->bio = bio; @@ -1523,14 +1931,15 @@ again:  		bio->bi_private = sbio;  		bio->bi_end_io = scrub_bio_end_io; -		bio->bi_bdev = sdev->dev->bdev; -		bio->bi_sector = spage->physical >> 9; +		bio->bi_bdev = sbio->dev->bdev; +		bio->bi_sector = sbio->physical >> 9;  		sbio->err = 0;  	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=  		   spage->physical ||  		   sbio->logical + sbio->page_count * PAGE_SIZE != -		   spage->logical) { -		scrub_submit(sdev); +		   spage->logical || +		   sbio->dev != spage->dev) { +		scrub_submit(sctx);  		goto again;  	} @@ -1542,81 +1951,87 @@ again:  			sbio->bio = NULL;  			return -EIO;  		} -		scrub_submit(sdev); +		scrub_submit(sctx);  		goto again;  	} -	scrub_block_get(sblock); /* one for the added page */ +	scrub_block_get(sblock); /* one for the page added to the bio */  	atomic_inc(&sblock->outstanding_pages);  	sbio->page_count++; -	if (sbio->page_count == sdev->pages_per_bio) -		scrub_submit(sdev); +	if (sbio->page_count == sctx->pages_per_rd_bio) +		scrub_submit(sctx);  	return 0;  } -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, -		       u64 physical, u64 flags, u64 gen, int mirror_num, -		       u8 *csum, int force) +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +		       u64 physical, struct btrfs_device *dev, u64 flags, +		       u64 gen, int mirror_num, u8 *csum, int force, +		       u64 physical_for_dev_replace)  {  	struct scrub_block *sblock;  	int index;  	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);  	if (!sblock) { -		spin_lock(&sdev->stat_lock); -		sdev->stat.malloc_errors++; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.malloc_errors++; +		spin_unlock(&sctx->stat_lock);  		return -ENOMEM;  	} -	/* one ref inside 
this function, plus one for each page later on */ +	/* one ref inside this function, plus one for each page added to +	 * a bio later on */  	atomic_set(&sblock->ref_count, 1); -	sblock->sdev = sdev; +	sblock->sctx = sctx;  	sblock->no_io_error_seen = 1;  	for (index = 0; len > 0; index++) { -		struct scrub_page *spage = sblock->pagev + index; +		struct scrub_page *spage;  		u64 l = min_t(u64, len, PAGE_SIZE); -		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); -		spage->page = alloc_page(GFP_NOFS); -		if (!spage->page) { -			spin_lock(&sdev->stat_lock); -			sdev->stat.malloc_errors++; -			spin_unlock(&sdev->stat_lock); -			while (index > 0) { -				index--; -				__free_page(sblock->pagev[index].page); -			} -			kfree(sblock); +		spage = kzalloc(sizeof(*spage), GFP_NOFS); +		if (!spage) { +leave_nomem: +			spin_lock(&sctx->stat_lock); +			sctx->stat.malloc_errors++; +			spin_unlock(&sctx->stat_lock); +			scrub_block_put(sblock);  			return -ENOMEM;  		} +		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); +		scrub_page_get(spage); +		sblock->pagev[index] = spage;  		spage->sblock = sblock; -		spage->dev = sdev->dev; +		spage->dev = dev;  		spage->flags = flags;  		spage->generation = gen;  		spage->logical = logical;  		spage->physical = physical; +		spage->physical_for_dev_replace = physical_for_dev_replace;  		spage->mirror_num = mirror_num;  		if (csum) {  			spage->have_csum = 1; -			memcpy(spage->csum, csum, sdev->csum_size); +			memcpy(spage->csum, csum, sctx->csum_size);  		} else {  			spage->have_csum = 0;  		}  		sblock->page_count++; +		spage->page = alloc_page(GFP_NOFS); +		if (!spage->page) +			goto leave_nomem;  		len -= l;  		logical += l;  		physical += l; +		physical_for_dev_replace += l;  	} -	BUG_ON(sblock->page_count == 0); +	WARN_ON(sblock->page_count == 0);  	for (index = 0; index < sblock->page_count; index++) { -		struct scrub_page *spage = sblock->pagev + index; +		struct scrub_page *spage = sblock->pagev[index];  		int ret; -		ret = scrub_add_page_to_bio(sdev, spage); +		ret = scrub_add_page_to_rd_bio(sctx, spage);  		if (ret) {  			scrub_block_put(sblock);  			return ret; @@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,  	}  	if (force) -		scrub_submit(sdev); +		scrub_submit(sctx);  	/* last one frees, either here or in bio completion for last page */  	scrub_block_put(sblock); @@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,  static void scrub_bio_end_io(struct bio *bio, int err)  {  	struct scrub_bio *sbio = bio->bi_private; -	struct scrub_dev *sdev = sbio->sdev; -	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; +	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;  	sbio->err = err;  	sbio->bio = bio; @@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)  static void scrub_bio_end_io_worker(struct btrfs_work *work)  {  	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); -	struct scrub_dev *sdev = sbio->sdev; +	struct scrub_ctx *sctx = sbio->sctx;  	int i; -	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); +	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);  	if (sbio->err) {  		for (i = 0; i < sbio->page_count; i++) {  			struct scrub_page *spage = sbio->pagev[i]; @@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)  	bio_put(sbio->bio);  	sbio->bio = NULL; -	spin_lock(&sdev->list_lock); -	sbio->next_free = sdev->first_free; -	sdev->first_free = sbio->index; -	spin_unlock(&sdev->list_lock); -	
atomic_dec(&sdev->in_flight); -	wake_up(&sdev->list_wait); +	spin_lock(&sctx->list_lock); +	sbio->next_free = sctx->first_free; +	sctx->first_free = sbio->index; +	spin_unlock(&sctx->list_lock); + +	if (sctx->is_dev_replace && +	    atomic_read(&sctx->wr_ctx.flush_all_writes)) { +		mutex_lock(&sctx->wr_ctx.wr_lock); +		scrub_wr_submit(sctx); +		mutex_unlock(&sctx->wr_ctx.wr_lock); +	} + +	scrub_pending_bio_dec(sctx);  }  static void scrub_block_complete(struct scrub_block *sblock)  { -	if (!sblock->no_io_error_seen) +	if (!sblock->no_io_error_seen) {  		scrub_handle_errored_block(sblock); -	else -		scrub_checksum(sblock); +	} else { +		/* +		 * in the dev replace case, a block with a checksum error +		 * is rewritten to the target device by the repair path in +		 * scrub_handle_errored_block(); an error-free block is +		 * written to the target device here. +		 */ +		if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) +			scrub_write_block_to_dev_replace(sblock); +	} }  -static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, +static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,  			   u8 *csum)  {  	struct btrfs_ordered_sum *sum = NULL; @@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,  	unsigned long i;  	unsigned long num_sectors; -	while (!list_empty(&sdev->csum_list)) { -		sum = list_first_entry(&sdev->csum_list, +	while (!list_empty(&sctx->csum_list)) { +		sum = list_first_entry(&sctx->csum_list,  				       struct btrfs_ordered_sum, list);  		if (sum->bytenr > logical)  			return 0;  		if (sum->bytenr + sum->len > logical)  			break; -		++sdev->stat.csum_discards; +		++sctx->stat.csum_discards;  		list_del(&sum->list);  		kfree(sum);  		sum = NULL; @@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,  	if (!sum)  		return 0; -	num_sectors = sum->len / sdev->sectorsize; +	num_sectors = sum->len / sctx->sectorsize;  	for (i = 0; i < num_sectors; ++i) {  		if (sum->sums[i].bytenr == logical) { -			memcpy(csum, &sum->sums[i].sum, sdev->csum_size); +			memcpy(csum, &sum->sums[i].sum, sctx->csum_size);  			ret = 1;  			break;  		} @@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,  }  /* scrub extent tries to collect up to 64 kB for each bio */ -static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, -			u64 physical, u64 flags, u64 gen, int mirror_num) +static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, +			u64 physical, struct btrfs_device *dev, u64 flags, +			u64 gen, int mirror_num, u64 physical_for_dev_replace)  {  	int ret;  	u8 csum[BTRFS_CSUM_SIZE];  	u32 blocksize;  	if (flags & BTRFS_EXTENT_FLAG_DATA) { -		blocksize = sdev->sectorsize; -		spin_lock(&sdev->stat_lock); -		sdev->stat.data_extents_scrubbed++; -		sdev->stat.data_bytes_scrubbed += len; -		spin_unlock(&sdev->stat_lock); +		blocksize = sctx->sectorsize; +		spin_lock(&sctx->stat_lock); +		sctx->stat.data_extents_scrubbed++; +		sctx->stat.data_bytes_scrubbed += len; +		spin_unlock(&sctx->stat_lock);  	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { -		BUG_ON(sdev->nodesize != sdev->leafsize); -		blocksize = sdev->nodesize; -		spin_lock(&sdev->stat_lock); -		sdev->stat.tree_extents_scrubbed++; -		sdev->stat.tree_bytes_scrubbed += len; -		spin_unlock(&sdev->stat_lock); +		WARN_ON(sctx->nodesize != sctx->leafsize); +		blocksize = sctx->nodesize; +		spin_lock(&sctx->stat_lock); +		sctx->stat.tree_extents_scrubbed++; +		sctx->stat.tree_bytes_scrubbed += len; +		spin_unlock(&sctx->stat_lock);  	} else {
-		blocksize = sdev->sectorsize; -		BUG_ON(1); +		blocksize = sctx->sectorsize; +		WARN_ON(1);  	}  	while (len) { @@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,  		if (flags & BTRFS_EXTENT_FLAG_DATA) {  			/* push csums to sbio */ -			have_csum = scrub_find_csum(sdev, logical, l, csum); +			have_csum = scrub_find_csum(sctx, logical, l, csum);  			if (have_csum == 0) -				++sdev->stat.no_csum; +				++sctx->stat.no_csum; +			if (sctx->is_dev_replace && !have_csum) { +				ret = copy_nocow_pages(sctx, logical, l, +						       mirror_num, +						      physical_for_dev_replace); +				goto behind_scrub_pages; +			}  		} -		ret = scrub_pages(sdev, logical, l, physical, flags, gen, -				  mirror_num, have_csum ? csum : NULL, 0); +		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, +				  mirror_num, have_csum ? csum : NULL, 0, +				  physical_for_dev_replace); +behind_scrub_pages:  		if (ret)  			return ret;  		len -= l;  		logical += l;  		physical += l; +		physical_for_dev_replace += l;  	}  	return 0;  } -static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, -	struct map_lookup *map, int num, u64 base, u64 length) +static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, +					   struct map_lookup *map, +					   struct btrfs_device *scrub_dev, +					   int num, u64 base, u64 length, +					   int is_dev_replace)  {  	struct btrfs_path *path; -	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; +	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;  	struct btrfs_root *root = fs_info->extent_root;  	struct btrfs_root *csum_root = fs_info->csum_root;  	struct btrfs_extent_item *extent; @@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  	struct reada_control *reada2;  	struct btrfs_key key_start;  	struct btrfs_key key_end; -  	u64 increment = map->stripe_len;  	u64 offset; +	u64 extent_logical; +	u64 extent_physical; +	u64 extent_len; +	struct btrfs_device *extent_dev; +	int extent_mirror_num;  	nstripes = length;  	offset = 0; @@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  	 */  	logical = base + offset; -	wait_event(sdev->list_wait, -		   atomic_read(&sdev->in_flight) == 0); +	wait_event(sctx->list_wait, +		   atomic_read(&sctx->bios_in_flight) == 0);  	atomic_inc(&fs_info->scrubs_paused);  	wake_up(&fs_info->scrub_pause_wait); @@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  		 * canceled?  		 
*/  		if (atomic_read(&fs_info->scrub_cancel_req) || -		    atomic_read(&sdev->cancel_req)) { +		    atomic_read(&sctx->cancel_req)) {  			ret = -ECANCELED;  			goto out;  		} @@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  		 */  		if (atomic_read(&fs_info->scrub_pause_req)) {  			/* push queued extents */ -			scrub_submit(sdev); -			wait_event(sdev->list_wait, -				   atomic_read(&sdev->in_flight) == 0); +			atomic_set(&sctx->wr_ctx.flush_all_writes, 1); +			scrub_submit(sctx); +			mutex_lock(&sctx->wr_ctx.wr_lock); +			scrub_wr_submit(sctx); +			mutex_unlock(&sctx->wr_ctx.wr_lock); +			wait_event(sctx->list_wait, +				   atomic_read(&sctx->bios_in_flight) == 0); +			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);  			atomic_inc(&fs_info->scrubs_paused);  			wake_up(&fs_info->scrub_pause_wait);  			mutex_lock(&fs_info->scrub_lock); @@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  		ret = btrfs_lookup_csums_range(csum_root, logical,  					       logical + map->stripe_len - 1, -					       &sdev->csum_list, 1); +					       &sctx->csum_list, 1);  		if (ret)  			goto out; @@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,  					     key.objectid;  			} -			ret = scrub_extent(sdev, key.objectid, key.offset, -					   key.objectid - logical + physical, -					   flags, generation, mirror_num); +			extent_logical = key.objectid; +			extent_physical = key.objectid - logical + physical; +			extent_len = key.offset; +			extent_dev = scrub_dev; +			extent_mirror_num = mirror_num; +			if (is_dev_replace) +				scrub_remap_extent(fs_info, extent_logical, +						   extent_len, &extent_physical, +						   &extent_dev, +						   &extent_mirror_num); +			ret = scrub_extent(sctx, extent_logical, extent_len, +					   extent_physical, extent_dev, flags, +					   generation, extent_mirror_num, +					   key.objectid - logical + physical);  			if (ret)  				goto out; @@ -2016,29 +2477,34 @@ next:  		btrfs_release_path(path);  		logical += increment;  		physical += map->stripe_len; -		spin_lock(&sdev->stat_lock); -		sdev->stat.last_physical = physical; -		spin_unlock(&sdev->stat_lock); +		spin_lock(&sctx->stat_lock); +		sctx->stat.last_physical = physical; +		spin_unlock(&sctx->stat_lock);  	} +out:  	/* push queued extents */ -	scrub_submit(sdev); +	scrub_submit(sctx); +	mutex_lock(&sctx->wr_ctx.wr_lock); +	scrub_wr_submit(sctx); +	mutex_unlock(&sctx->wr_ctx.wr_lock); -out:  	blk_finish_plug(&plug);  	btrfs_free_path(path);  	return ret < 0 ? 
ret : 0;  } -static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, -	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, -	u64 dev_offset) +static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, +					  struct btrfs_device *scrub_dev, +					  u64 chunk_tree, u64 chunk_objectid, +					  u64 chunk_offset, u64 length, +					  u64 dev_offset, int is_dev_replace)  {  	struct btrfs_mapping_tree *map_tree = -		&sdev->dev->dev_root->fs_info->mapping_tree; +		&sctx->dev_root->fs_info->mapping_tree;  	struct map_lookup *map;  	struct extent_map *em;  	int i; -	int ret = -EINVAL; +	int ret = 0;  	read_lock(&map_tree->map_tree.lock);  	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); @@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,  		goto out;  	for (i = 0; i < map->num_stripes; ++i) { -		if (map->stripes[i].dev == sdev->dev && +		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&  		    map->stripes[i].physical == dev_offset) { -			ret = scrub_stripe(sdev, map, i, chunk_offset, length); +			ret = scrub_stripe(sctx, map, scrub_dev, i, +					   chunk_offset, length, +					   is_dev_replace);  			if (ret)  				goto out;  		} @@ -2069,11 +2537,13 @@ out:  }  static noinline_for_stack -int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) +int scrub_enumerate_chunks(struct scrub_ctx *sctx, +			   struct btrfs_device *scrub_dev, u64 start, u64 end, +			   int is_dev_replace)  {  	struct btrfs_dev_extent *dev_extent = NULL;  	struct btrfs_path *path; -	struct btrfs_root *root = sdev->dev->dev_root; +	struct btrfs_root *root = sctx->dev_root;  	struct btrfs_fs_info *fs_info = root->fs_info;  	u64 length;  	u64 chunk_tree; @@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)  	struct btrfs_key key;  	struct btrfs_key found_key;  	struct btrfs_block_group_cache *cache; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;  	path = btrfs_alloc_path();  	if (!path) @@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)  	path->search_commit_root = 1;  	path->skip_locking = 1; -	key.objectid = sdev->dev->devid; +	key.objectid = scrub_dev->devid;  	key.offset = 0ull;  	key.type = BTRFS_DEV_EXTENT_KEY; -  	while (1) {  		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);  		if (ret < 0) @@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)  		btrfs_item_key_to_cpu(l, &found_key, slot); -		if (found_key.objectid != sdev->dev->devid) +		if (found_key.objectid != scrub_dev->devid)  			break;  		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) @@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)  			ret = -ENOENT;  			break;  		} -		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, -				  chunk_offset, length, found_key.offset); +		dev_replace->cursor_right = found_key.offset + length; +		dev_replace->cursor_left = found_key.offset; +		dev_replace->item_needs_writeback = 1; +		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, +				  chunk_offset, length, found_key.offset, +				  is_dev_replace); + +		/* +		 * flush, submit all pending read and write bios, afterwards +		 * wait for them. +		 * Note that in the dev replace case, a read request causes +		 * write requests that are submitted in the read completion +		 * worker. 
Therefore in the current situation, it is required +		 * that all write requests are flushed, so that all read and +		 * write requests are really completed when bios_in_flight +		 * changes to 0. +		 */ +		atomic_set(&sctx->wr_ctx.flush_all_writes, 1); +		scrub_submit(sctx); +		mutex_lock(&sctx->wr_ctx.wr_lock); +		scrub_wr_submit(sctx); +		mutex_unlock(&sctx->wr_ctx.wr_lock); + +		wait_event(sctx->list_wait, +			   atomic_read(&sctx->bios_in_flight) == 0); +		atomic_set(&sctx->wr_ctx.flush_all_writes, 0); +		atomic_inc(&fs_info->scrubs_paused); +		wake_up(&fs_info->scrub_pause_wait); +		wait_event(sctx->list_wait, +			   atomic_read(&sctx->workers_pending) == 0); + +		mutex_lock(&fs_info->scrub_lock); +		while (atomic_read(&fs_info->scrub_pause_req)) { +			mutex_unlock(&fs_info->scrub_lock); +			wait_event(fs_info->scrub_pause_wait, +			   atomic_read(&fs_info->scrub_pause_req) == 0); +			mutex_lock(&fs_info->scrub_lock); +		} +		atomic_dec(&fs_info->scrubs_paused); +		mutex_unlock(&fs_info->scrub_lock); +		wake_up(&fs_info->scrub_pause_wait); + +		dev_replace->cursor_left = dev_replace->cursor_right; +		dev_replace->item_needs_writeback = 1;  		btrfs_put_block_group(cache);  		if (ret)  			break; +		if (is_dev_replace && +		    atomic64_read(&dev_replace->num_write_errors) > 0) { +			ret = -EIO; +			break; +		} +		if (sctx->stat.malloc_errors > 0) { +			ret = -ENOMEM; +			break; +		}  		key.offset = found_key.offset + length;  		btrfs_release_path(path); @@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)  	return ret < 0 ? ret : 0;  } -static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) +static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, +					   struct btrfs_device *scrub_dev)  {  	int	i;  	u64	bytenr;  	u64	gen;  	int	ret; -	struct btrfs_device *device = sdev->dev; -	struct btrfs_root *root = device->dev_root; +	struct btrfs_root *root = sctx->dev_root;  	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)  		return -EIO; @@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)  	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {  		bytenr = btrfs_sb_offset(i); -		if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) +		if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)  			break; -		ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, -				     BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); +		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, +				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, +				  NULL, 1, bytenr);  		if (ret)  			return ret;  	} -	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); +	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);  	return 0;  } @@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)  /*   * get a reference count on fs_info->scrub_workers. 
start worker if necessary   */ -static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, +						int is_dev_replace)  { -	struct btrfs_fs_info *fs_info = root->fs_info;  	int ret = 0;  	mutex_lock(&fs_info->scrub_lock);  	if (fs_info->scrub_workers_refcnt == 0) { -		btrfs_init_workers(&fs_info->scrub_workers, "scrub", -			   fs_info->thread_pool_size, &fs_info->generic_worker); +		if (is_dev_replace) +			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, +					&fs_info->generic_worker); +		else +			btrfs_init_workers(&fs_info->scrub_workers, "scrub", +					fs_info->thread_pool_size, +					&fs_info->generic_worker);  		fs_info->scrub_workers.idle_thresh = 4;  		ret = btrfs_start_workers(&fs_info->scrub_workers);  		if (ret)  			goto out; +		btrfs_init_workers(&fs_info->scrub_wr_completion_workers, +				   "scrubwrc", +				   fs_info->thread_pool_size, +				   &fs_info->generic_worker); +		fs_info->scrub_wr_completion_workers.idle_thresh = 2; +		ret = btrfs_start_workers( +				&fs_info->scrub_wr_completion_workers); +		if (ret) +			goto out; +		btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, +				   &fs_info->generic_worker); +		ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); +		if (ret) +			goto out;  	}  	++fs_info->scrub_workers_refcnt;  out: @@ -2223,40 +2764,41 @@ out:  	return ret;  } -static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) +static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)  { -	struct btrfs_fs_info *fs_info = root->fs_info; -  	mutex_lock(&fs_info->scrub_lock); -	if (--fs_info->scrub_workers_refcnt == 0) +	if (--fs_info->scrub_workers_refcnt == 0) {  		btrfs_stop_workers(&fs_info->scrub_workers); +		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); +		btrfs_stop_workers(&fs_info->scrub_nocow_workers); +	}  	WARN_ON(fs_info->scrub_workers_refcnt < 0);  	mutex_unlock(&fs_info->scrub_lock);  } - -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, -		    struct btrfs_scrub_progress *progress, int readonly) +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, +		    u64 end, struct btrfs_scrub_progress *progress, +		    int readonly, int is_dev_replace)  { -	struct scrub_dev *sdev; -	struct btrfs_fs_info *fs_info = root->fs_info; +	struct scrub_ctx *sctx;  	int ret;  	struct btrfs_device *dev; -	if (btrfs_fs_closing(root->fs_info)) +	if (btrfs_fs_closing(fs_info))  		return -EINVAL;  	/*  	 * check some assumptions  	 */ -	if (root->nodesize != root->leafsize) { +	if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {  		printk(KERN_ERR  		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", -		       root->nodesize, root->leafsize); +		       fs_info->chunk_root->nodesize, +		       fs_info->chunk_root->leafsize);  		return -EINVAL;  	} -	if (root->nodesize > BTRFS_STRIPE_LEN) { +	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {  		/*  		 * in this case scrub is unable to calculate the checksum  		 * the way scrub is implemented. 
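The wait_event() calls in the preceding hunks rely on two counters in struct scrub_ctx: bios_in_flight, counting read and write bios that have been submitted but not yet completed, and workers_pending, counting deferred work items such as the queued nodatasum fixups and the NOCOW copies. The helpers scrub_pending_bio_inc/dec() and scrub_pending_trans_workers_inc/dec() are called throughout this patch but their bodies fall outside the hunks shown; the sketch below reconstructs the scheme they appear to implement, based on the open-coded counter handling that this patch removes from scrub_fixup_nodatasum(). Treat it as a plausible reading of the patch, not as hunk content.

	/* counter bodies reconstructed from the removed open-coded logic */
	static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
	{
		atomic_inc(&sctx->bios_in_flight);
	}

	static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
	{
		/* wake waiters such as scrub_enumerate_chunks() */
		atomic_dec(&sctx->bios_in_flight);
		wake_up(&sctx->list_wait);
	}

	/*
	 * for workers that use a transaction context (nodatasum fixup,
	 * NOCOW copy): mark the scrub as both running and paused, so
	 * cancel requests cannot complete while a worker runs and
	 * transaction commits do not deadlock on the pause machinery
	 */
	static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
	{
		struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

		mutex_lock(&fs_info->scrub_lock);
		atomic_inc(&fs_info->scrubs_running);
		atomic_inc(&fs_info->scrubs_paused);
		mutex_unlock(&fs_info->scrub_lock);
		atomic_inc(&sctx->workers_pending);
	}

	static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
	{
		struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

		mutex_lock(&fs_info->scrub_lock);
		atomic_dec(&fs_info->scrubs_running);
		atomic_dec(&fs_info->scrubs_paused);
		mutex_unlock(&fs_info->scrub_lock);
		atomic_dec(&sctx->workers_pending);
		wake_up(&fs_info->scrub_pause_wait);
		wake_up(&sctx->list_wait);
	}

With these definitions, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0) completes exactly when the last bio has passed through its end_io worker, which is why scrub_enumerate_chunks() must flush all pending writes first in the dev replace case: a read completion may itself submit a write, and that write is counted before the read is uncounted.
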
Do not handle this @@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,  		 */  		printk(KERN_ERR  		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", -		       root->nodesize, BTRFS_STRIPE_LEN); +		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);  		return -EINVAL;  	} -	if (root->sectorsize != PAGE_SIZE) { +	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {  		/* not supported for data w/o checksums */  		printk(KERN_ERR  		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", -		       root->sectorsize, (unsigned long long)PAGE_SIZE); +		       fs_info->chunk_root->sectorsize, +		       (unsigned long long)PAGE_SIZE);  		return -EINVAL;  	} -	ret = scrub_workers_get(root); +	if (fs_info->chunk_root->nodesize > +	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || +	    fs_info->chunk_root->sectorsize > +	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { +		/* +		 * would exhaust the array bounds of pagev member in +		 * struct scrub_block +		 */ +		pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", +		       fs_info->chunk_root->nodesize, +		       SCRUB_MAX_PAGES_PER_BLOCK, +		       fs_info->chunk_root->sectorsize, +		       SCRUB_MAX_PAGES_PER_BLOCK); +		return -EINVAL; +	} + +	ret = scrub_workers_get(fs_info, is_dev_replace);  	if (ret)  		return ret; -	mutex_lock(&root->fs_info->fs_devices->device_list_mutex); -	dev = btrfs_find_device(root, devid, NULL, NULL); -	if (!dev || dev->missing) { -		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -		scrub_workers_put(root); +	mutex_lock(&fs_info->fs_devices->device_list_mutex); +	dev = btrfs_find_device(fs_info, devid, NULL, NULL); +	if (!dev || (dev->missing && !is_dev_replace)) { +		mutex_unlock(&fs_info->fs_devices->device_list_mutex); +		scrub_workers_put(fs_info);  		return -ENODEV;  	}  	mutex_lock(&fs_info->scrub_lock); -	if (!dev->in_fs_metadata) { +	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {  		mutex_unlock(&fs_info->scrub_lock); -		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -		scrub_workers_put(root); -		return -ENODEV; +		mutex_unlock(&fs_info->fs_devices->device_list_mutex); +		scrub_workers_put(fs_info); +		return -EIO;  	} -	if (dev->scrub_device) { +	btrfs_dev_replace_lock(&fs_info->dev_replace); +	if (dev->scrub_device || +	    (!is_dev_replace && +	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { +		btrfs_dev_replace_unlock(&fs_info->dev_replace);  		mutex_unlock(&fs_info->scrub_lock); -		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -		scrub_workers_put(root); +		mutex_unlock(&fs_info->fs_devices->device_list_mutex); +		scrub_workers_put(fs_info);  		return -EINPROGRESS;  	} -	sdev = scrub_setup_dev(dev); -	if (IS_ERR(sdev)) { +	btrfs_dev_replace_unlock(&fs_info->dev_replace); +	sctx = scrub_setup_ctx(dev, is_dev_replace); +	if (IS_ERR(sctx)) {  		mutex_unlock(&fs_info->scrub_lock); -		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -		scrub_workers_put(root); -		return PTR_ERR(sdev); +		mutex_unlock(&fs_info->fs_devices->device_list_mutex); +		scrub_workers_put(fs_info); +		return PTR_ERR(sctx);  	} -	sdev->readonly = readonly; -	dev->scrub_device = sdev; +	sctx->readonly = readonly; +	dev->scrub_device = sctx;  	atomic_inc(&fs_info->scrubs_running);  	mutex_unlock(&fs_info->scrub_lock); -	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex); -	down_read(&fs_info->scrub_super_lock); -	ret = scrub_supers(sdev); -	up_read(&fs_info->scrub_super_lock); +	if (!is_dev_replace) { +		down_read(&fs_info->scrub_super_lock); +		ret = scrub_supers(sctx, dev); +		up_read(&fs_info->scrub_super_lock); +	}  	if (!ret) -		ret = scrub_enumerate_chunks(sdev, start, end); +		ret = scrub_enumerate_chunks(sctx, dev, start, end, +					     is_dev_replace); -	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); +	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);  	atomic_dec(&fs_info->scrubs_running);  	wake_up(&fs_info->scrub_pause_wait); -	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); +	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);  	if (progress) -		memcpy(progress, &sdev->stat, sizeof(*progress)); +		memcpy(progress, &sctx->stat, sizeof(*progress));  	mutex_lock(&fs_info->scrub_lock);  	dev->scrub_device = NULL;  	mutex_unlock(&fs_info->scrub_lock); -	scrub_free_dev(sdev); -	scrub_workers_put(root); +	scrub_free_ctx(sctx); +	scrub_workers_put(fs_info);  	return ret;  } @@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)  	up_write(&root->fs_info->scrub_super_lock);  } -int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) +int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)  { -  	mutex_lock(&fs_info->scrub_lock);  	if (!atomic_read(&fs_info->scrubs_running)) {  		mutex_unlock(&fs_info->scrub_lock); @@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)  	return 0;  } -int btrfs_scrub_cancel(struct btrfs_root *root) +int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, +			   struct btrfs_device *dev)  { -	return __btrfs_scrub_cancel(root->fs_info); -} - -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) -{ -	struct btrfs_fs_info *fs_info = root->fs_info; -	struct scrub_dev *sdev; +	struct scrub_ctx *sctx;  	mutex_lock(&fs_info->scrub_lock); -	sdev = dev->scrub_device; -	if (!sdev) { +	sctx = dev->scrub_device; +	if (!sctx) {  		mutex_unlock(&fs_info->scrub_lock);  		return -ENOTCONN;  	} -	atomic_inc(&sdev->cancel_req); +	atomic_inc(&sctx->cancel_req);  	while (dev->scrub_device) {  		mutex_unlock(&fs_info->scrub_lock);  		wait_event(fs_info->scrub_pause_wait, @@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)  	 * does not go away in cancel_dev. 
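Aside: the cancel path in this hunk is a handshake — the canceller raises cancel_req and then sleeps until the worker clears dev->scrub_device. A compact pthread sketch of the same handshake, with a condition variable in the role of scrub_pause_wait (names illustrative):

    #include <pthread.h>
    #include <stdbool.h>

    struct model_slot {
    	pthread_mutex_t lock;
    	pthread_cond_t idle;	/* stands in for scrub_pause_wait */
    	bool running;		/* stands in for dev->scrub_device != NULL */
    	bool cancel_req;	/* stands in for sctx->cancel_req */
    };

    /* Ask the worker to stop and wait until it has actually torn down. */
    static int model_scrub_cancel(struct model_slot *s)
    {
    	pthread_mutex_lock(&s->lock);
    	if (!s->running) {
    		pthread_mutex_unlock(&s->lock);
    		return -1;		/* the -ENOTCONN case */
    	}
    	s->cancel_req = true;
    	while (s->running)		/* worker signals 'idle' on exit */
    		pthread_cond_wait(&s->idle, &s->lock);
    	pthread_mutex_unlock(&s->lock);
    	return 0;
    }

pthread_cond_wait() drops and retakes the mutex internally, which is what the kernel code does by hand around wait_event() in the loop below.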
FIXME: find a better solution  	 */  	mutex_lock(&fs_info->fs_devices->device_list_mutex); -	dev = btrfs_find_device(root, devid, NULL, NULL); +	dev = btrfs_find_device(fs_info, devid, NULL, NULL);  	if (!dev) {  		mutex_unlock(&fs_info->fs_devices->device_list_mutex);  		return -ENODEV;  	} -	ret = btrfs_scrub_cancel_dev(root, dev); +	ret = btrfs_scrub_cancel_dev(fs_info, dev);  	mutex_unlock(&fs_info->fs_devices->device_list_mutex);  	return ret; @@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,  			 struct btrfs_scrub_progress *progress)  {  	struct btrfs_device *dev; -	struct scrub_dev *sdev = NULL; +	struct scrub_ctx *sctx = NULL;  	mutex_lock(&root->fs_info->fs_devices->device_list_mutex); -	dev = btrfs_find_device(root, devid, NULL, NULL); +	dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);  	if (dev) -		sdev = dev->scrub_device; -	if (sdev) -		memcpy(progress, &sdev->stat, sizeof(*progress)); +		sctx = dev->scrub_device; +	if (sctx) +		memcpy(progress, &sctx->stat, sizeof(*progress));  	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; +	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; +} + +static void scrub_remap_extent(struct btrfs_fs_info *fs_info, +			       u64 extent_logical, u64 extent_len, +			       u64 *extent_physical, +			       struct btrfs_device **extent_dev, +			       int *extent_mirror_num) +{ +	u64 mapped_length; +	struct btrfs_bio *bbio = NULL; +	int ret; + +	mapped_length = extent_len; +	ret = btrfs_map_block(fs_info, READ, extent_logical, +			      &mapped_length, &bbio, 0); +	if (ret || !bbio || mapped_length < extent_len || +	    !bbio->stripes[0].dev->bdev) { +		kfree(bbio); +		return; +	} + +	*extent_physical = bbio->stripes[0].physical; +	*extent_mirror_num = bbio->mirror_num; +	*extent_dev = bbio->stripes[0].dev; +	kfree(bbio); +} + +static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, +			      struct scrub_wr_ctx *wr_ctx, +			      struct btrfs_fs_info *fs_info, +			      struct btrfs_device *dev, +			      int is_dev_replace) +{ +	WARN_ON(wr_ctx->wr_curr_bio != NULL); + +	mutex_init(&wr_ctx->wr_lock); +	wr_ctx->wr_curr_bio = NULL; +	if (!is_dev_replace) +		return 0; + +	WARN_ON(!dev->bdev); +	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, +					 bio_get_nr_vecs(dev->bdev)); +	wr_ctx->tgtdev = dev; +	atomic_set(&wr_ctx->flush_all_writes, 0); +	return 0; +} + +static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) +{ +	mutex_lock(&wr_ctx->wr_lock); +	kfree(wr_ctx->wr_curr_bio); +	wr_ctx->wr_curr_bio = NULL; +	mutex_unlock(&wr_ctx->wr_lock); +} + +static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +			    int mirror_num, u64 physical_for_dev_replace) +{ +	struct scrub_copy_nocow_ctx *nocow_ctx; +	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + +	nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); +	if (!nocow_ctx) { +		spin_lock(&sctx->stat_lock); +		sctx->stat.malloc_errors++; +		spin_unlock(&sctx->stat_lock); +		return -ENOMEM; +	} + +	scrub_pending_trans_workers_inc(sctx); + +	nocow_ctx->sctx = sctx; +	nocow_ctx->logical = logical; +	nocow_ctx->len = len; +	nocow_ctx->mirror_num = mirror_num; +	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; +	nocow_ctx->work.func = copy_nocow_pages_worker; +	btrfs_queue_worker(&fs_info->scrub_nocow_workers, +			   &nocow_ctx->work); + +	return 0; +} + +static void copy_nocow_pages_worker(struct btrfs_work *work) +{ +	struct scrub_copy_nocow_ctx 
*nocow_ctx = +		container_of(work, struct scrub_copy_nocow_ctx, work); +	struct scrub_ctx *sctx = nocow_ctx->sctx; +	u64 logical = nocow_ctx->logical; +	u64 len = nocow_ctx->len; +	int mirror_num = nocow_ctx->mirror_num; +	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; +	int ret; +	struct btrfs_trans_handle *trans = NULL; +	struct btrfs_fs_info *fs_info; +	struct btrfs_path *path; +	struct btrfs_root *root; +	int not_written = 0; + +	fs_info = sctx->dev_root->fs_info; +	root = fs_info->extent_root; + +	path = btrfs_alloc_path(); +	if (!path) { +		spin_lock(&sctx->stat_lock); +		sctx->stat.malloc_errors++; +		spin_unlock(&sctx->stat_lock); +		not_written = 1; +		goto out; +	} + +	trans = btrfs_join_transaction(root); +	if (IS_ERR(trans)) { +		not_written = 1; +		goto out; +	} + +	ret = iterate_inodes_from_logical(logical, fs_info, path, +					  copy_nocow_pages_for_inode, +					  nocow_ctx); +	if (ret != 0 && ret != -ENOENT) { +		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", +			(unsigned long long)logical, +			(unsigned long long)physical_for_dev_replace, +			(unsigned long long)len, +			(unsigned long long)mirror_num, ret); +		not_written = 1; +		goto out; +	} + +out: +	if (trans && !IS_ERR(trans)) +		btrfs_end_transaction(trans, root); +	if (not_written) +		btrfs_dev_replace_stats_inc(&fs_info->dev_replace. +					    num_uncorrectable_read_errors); + +	btrfs_free_path(path); +	kfree(nocow_ctx); + +	scrub_pending_trans_workers_dec(sctx); +} + +static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ +	unsigned long index; +	struct scrub_copy_nocow_ctx *nocow_ctx = ctx; +	int ret = 0; +	struct btrfs_key key; +	struct inode *inode = NULL; +	struct btrfs_root *local_root; +	u64 physical_for_dev_replace; +	u64 len; +	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + +	key.objectid = root; +	key.type = BTRFS_ROOT_ITEM_KEY; +	key.offset = (u64)-1; +	local_root = btrfs_read_fs_root_no_name(fs_info, &key); +	if (IS_ERR(local_root)) +		return PTR_ERR(local_root); + +	key.type = BTRFS_INODE_ITEM_KEY; +	key.objectid = inum; +	key.offset = 0; +	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); +	if (IS_ERR(inode)) +		return PTR_ERR(inode); + +	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; +	len = nocow_ctx->len; +	while (len >= PAGE_CACHE_SIZE) { +		struct page *page = NULL; +		int ret_sub; + +		index = offset >> PAGE_CACHE_SHIFT; + +		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); +		if (!page) { +			pr_err("find_or_create_page() failed\n"); +			ret = -ENOMEM; +			goto next_page; +		} + +		if (PageUptodate(page)) { +			if (PageDirty(page)) +				goto next_page; +		} else { +			ClearPageError(page); +			ret_sub = extent_read_full_page(&BTRFS_I(inode)-> +							 io_tree, +							page, btrfs_get_extent, +							nocow_ctx->mirror_num); +			if (ret_sub) { +				ret = ret_sub; +				goto next_page; +			} +			wait_on_page_locked(page); +			if (!PageUptodate(page)) { +				ret = -EIO; +				goto next_page; +			} +		} +		ret_sub = write_page_nocow(nocow_ctx->sctx, +					   physical_for_dev_replace, page); +		if (ret_sub) { +			ret = ret_sub; +			goto next_page; +		} + +next_page: +		if (page) { +			unlock_page(page); +			put_page(page); +		} +		offset += PAGE_CACHE_SIZE; +		physical_for_dev_replace += PAGE_CACHE_SIZE; +		len -= PAGE_CACHE_SIZE; +	} + +	if (inode) +		iput(inode); +	return ret; +} + +static int write_page_nocow(struct scrub_ctx *sctx, +			 
			    u64 physical_for_dev_replace, struct page *page)
+{
+	struct bio *bio;
+	struct btrfs_device *dev;
+	int ret;
+	DECLARE_COMPLETION_ONSTACK(compl);
+
+	dev = sctx->wr_ctx.tgtdev;
+	if (!dev)
+		return -EIO;
+	if (!dev->bdev) {
+		printk_ratelimited(KERN_WARNING
+			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+		return -EIO;
+	}
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+	bio->bi_private = &compl;
+	bio->bi_end_io = scrub_complete_bio_end_io;
+	bio->bi_size = 0;
+	bio->bi_sector = physical_for_dev_replace >> 9;
+	bio->bi_bdev = dev->bdev;
+	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+	if (ret != PAGE_CACHE_SIZE) {
+leave_with_eio:
+		bio_put(bio);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+		return -EIO;
+	}
+	btrfsic_submit_bio(WRITE_SYNC, bio);
+	wait_for_completion(&compl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		goto leave_with_eio;
+
+	bio_put(bio);
+	return 0;
 }
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e78b297b0b00..54454542ad40 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx)
 	if (!path)
 		return -ENOMEM;
 
-	spin_lock(&send_root->root_times_lock);
+	spin_lock(&send_root->root_item_lock);
 	start_ctransid = btrfs_root_ctransid(&send_root->root_item);
-	spin_unlock(&send_root->root_times_lock);
+	spin_unlock(&send_root->root_item_lock);
 
 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	key.type = BTRFS_INODE_ITEM_KEY;
@@ -4422,9 +4422,9 @@ join_trans:
 	 * Make sure the tree has not changed after re-joining. We detect this
 	 * by comparing start_ctransid and ctransid. They should always match.
 	 */
-	spin_lock(&send_root->root_times_lock);
+	spin_lock(&send_root->root_item_lock);
 	ctransid = btrfs_root_ctransid(&send_root->root_item);
-	spin_unlock(&send_root->root_times_lock);
+	spin_unlock(&send_root->root_item_lock);
 
 	if (ctransid != start_ctransid) {
 		WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 915ac14c2064..99545df1b86c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,6 +55,7 @@
 #include "export.h"
 #include "compression.h"
 #include "rcu-string.h"
+#include "dev-replace.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
 	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
 		sb->s_flags |= MS_RDONLY;
 		printk(KERN_INFO "btrfs is forced readonly\n");
-		__btrfs_scrub_cancel(fs_info);
+		/*
+		 * Note that a running device replace operation is not
+		 * canceled here although there is no way to update
+		 * the progress. It would add the risk of a deadlock,
+		 * therefore the canceling is omitted. The only penalty
+		 * is that some I/O remains active until the procedure
+		 * completes. The next time the filesystem is
+		 * mounted writeable again, the device replace
+		 * operation continues.
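Aside: write_page_nocow() earlier in this hunk is the minimal synchronous write — build a one-page bio, submit it WRITE_SYNC, wait for the completion, and translate any failure into -EIO plus a device-stat bump. The same contract (return only after the block is durable, or report the error) looks roughly like this in plain POSIX I/O (illustrative only, not the kernel path):

    #include <sys/types.h>
    #include <unistd.h>

    /*
     * Write one block at a fixed byte offset and do not return until it
     * is on stable storage; every failure collapses to -1, the way the
     * kernel code collapses to -EIO and bumps BTRFS_DEV_STAT_WRITE_ERRS.
     */
    static int model_write_block_sync(int fd, const void *buf, size_t len,
    				  off_t phys_offset)
    {
    	if (pwrite(fd, buf, len, phys_offset) != (ssize_t)len)
    		return -1;		/* short write counts as failure */
    	if (fsync(fd) != 0)		/* stands in for WRITE_SYNC + wait */
    		return -1;
    	return 0;
    }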
+		 */  //		WARN_ON(1);  	}  } @@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,  	btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);  	btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);  	btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); -	btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); +	btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, +			      new_pool_size);  }  static int btrfs_remount(struct super_block *sb, int *flags, char *data) @@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)  		return 0;  	if (*flags & MS_RDONLY) { +		/* +		 * this also happens on 'umount -rf' or on shutdown, when +		 * the filesystem is busy. +		 */  		sb->s_flags |= MS_RDONLY; +		btrfs_dev_replace_suspend_for_unmount(fs_info); +		btrfs_scrub_cancel(fs_info); +  		ret = btrfs_commit_super(root);  		if (ret)  			goto restore; @@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)  			goto restore;  		} +		if (fs_info->fs_devices->missing_devices > +		     fs_info->num_tolerated_disk_barrier_failures && +		    !(*flags & MS_RDONLY)) { +			printk(KERN_WARNING +			       "Btrfs: too many missing devices, writeable remount is not allowed\n"); +			ret = -EACCES; +			goto restore; +		} +  		if (btrfs_super_log_root(fs_info->super_copy) != 0) {  			ret = -EINVAL;  			goto restore; @@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)  		if (ret)  			goto restore; +		ret = btrfs_resume_dev_replace_async(fs_info); +		if (ret) { +			pr_warn("btrfs: failed to resume dev_replace\n"); +			goto restore; +		}  		sb->s_flags &= ~MS_RDONLY;  	} @@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)  		min_stripe_size = BTRFS_STRIPE_LEN;  	list_for_each_entry(device, &fs_devices->devices, dev_list) { -		if (!device->in_fs_metadata || !device->bdev) +		if (!device->in_fs_metadata || !device->bdev || +		    device->is_tgtdev_for_dev_replace)  			continue;  		avail_space = device->total_bytes - device->bytes_used; @@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void)  	if (err)  		goto free_ordered_data; -	err = btrfs_interface_init(); +	err = btrfs_auto_defrag_init();  	if (err)  		goto free_delayed_inode; +	err = btrfs_interface_init(); +	if (err) +		goto free_auto_defrag; +  	err = register_filesystem(&btrfs_fs_type);  	if (err)  		goto unregister_ioctl; @@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void)  unregister_ioctl:  	btrfs_interface_exit(); +free_auto_defrag: +	btrfs_auto_defrag_exit();  free_delayed_inode:  	btrfs_delayed_inode_exit();  free_ordered_data: @@ -1681,6 +1720,7 @@ free_compress:  static void __exit exit_btrfs_fs(void)  {  	btrfs_destroy_cachep(); +	btrfs_auto_defrag_exit();  	btrfs_delayed_inode_exit();  	ordered_data_exit();  	extent_map_exit(); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 04bbfb1052eb..87fac9a21ea5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -30,6 +30,7 @@  #include "tree-log.h"  #include "inode-map.h"  #include "volumes.h" +#include "dev-replace.h"  #define BTRFS_ROOT_TRANS_TAG 0 @@ -145,16 +146,12 @@ loop:  	 * the log must never go across transaction boundaries.  	 
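Aside: the remount hunk above adds a hard gate — going read-write is refused when more devices are missing than the barrier-failure tolerance allows. Reduced to its decision logic (illustrative names, not btrfs API):

    #include <errno.h>
    #include <stdbool.h>

    /* Refuse a writable (re)mount that the redundancy cannot back up. */
    static int model_check_rw_remount(unsigned long missing_devices,
    				  unsigned long tolerated_failures,
    				  bool want_readwrite)
    {
    	if (want_readwrite && missing_devices > tolerated_failures)
    		return -EACCES;	/* mirrors the check in the hunk above */
    	return 0;
    }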
*/  	smp_mb(); -	if (!list_empty(&fs_info->tree_mod_seq_list)) { -		printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " +	if (!list_empty(&fs_info->tree_mod_seq_list)) +		WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "  			"creating a fresh transaction\n"); -		WARN_ON(1); -	} -	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { -		printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " +	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) +		WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "  			"creating a fresh transaction\n"); -		WARN_ON(1); -	}  	atomic_set(&fs_info->tree_mod_seq, 0);  	spin_lock_init(&cur_trans->commit_lock); @@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)  	return 0;  } -static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, -						    u64 num_items, int type, -						    int noflush) +static struct btrfs_trans_handle * +start_transaction(struct btrfs_root *root, u64 num_items, int type, +		  enum btrfs_reserve_flush_enum flush)  {  	struct btrfs_trans_handle *h;  	struct btrfs_transaction *cur_trans; @@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,  		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);  		h = current->journal_info;  		h->use_count++; +		WARN_ON(h->use_count > 2);  		h->orig_rsv = h->block_rsv;  		h->block_rsv = NULL;  		goto got_it; @@ -331,14 +329,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,  		}  		num_bytes = btrfs_calc_trans_metadata_size(root, num_items); -		if (noflush) -			ret = btrfs_block_rsv_add_noflush(root, -						&root->fs_info->trans_block_rsv, -						num_bytes); -		else -			ret = btrfs_block_rsv_add(root, -						&root->fs_info->trans_block_rsv, -						num_bytes); +		ret = btrfs_block_rsv_add(root, +					  &root->fs_info->trans_block_rsv, +					  num_bytes, flush);  		if (ret)  			return ERR_PTR(ret);  	} @@ -422,13 +415,15 @@ got_it:  struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,  						   int num_items)  { -	return start_transaction(root, num_items, TRANS_START, 0); +	return start_transaction(root, num_items, TRANS_START, +				 BTRFS_RESERVE_FLUSH_ALL);  } -struct btrfs_trans_handle *btrfs_start_transaction_noflush( +struct btrfs_trans_handle *btrfs_start_transaction_lflush(  					struct btrfs_root *root, int num_items)  { -	return start_transaction(root, num_items, TRANS_START, 1); +	return start_transaction(root, num_items, TRANS_START, +				 BTRFS_RESERVE_FLUSH_LIMIT);  }  struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) @@ -461,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,  int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)  {  	struct btrfs_transaction *cur_trans = NULL, *t; -	int ret; +	int ret = 0; -	ret = 0;  	if (transid) {  		if (transid <= root->fs_info->last_trans_committed)  			goto out; +		ret = -EINVAL;  		/* find specified transaction */  		spin_lock(&root->fs_info->trans_lock);  		list_for_each_entry(t, &root->fs_info->trans_list, list) {  			if (t->transid == transid) {  				cur_trans = t;  				atomic_inc(&cur_trans->use_count); +				ret = 0;  				break;  			} -			if (t->transid > transid) +			if (t->transid > transid) { +				ret = 0;  				break; +			}  		}  		spin_unlock(&root->fs_info->trans_lock); -		ret = -EINVAL; +		/* The specified transaction doesn't exist */  		if (!cur_trans) -			goto out;  /* bad transid */ +			goto out;  	} else {  		/* 
find newest transaction that is committing | committed */
 		spin_lock(&root->fs_info->trans_lock);
@@ -502,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 	}
 
 	wait_for_commit(root, cur_trans);
-
 	put_transaction(cur_trans);
-	ret = 0;
 out:
 	return ret;
 }
@@ -851,7 +847,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 		return ret;
 
 	ret = btrfs_run_dev_stats(trans, root->fs_info);
-	BUG_ON(ret);
+	WARN_ON(ret);
+	ret = btrfs_run_dev_replace(trans, root->fs_info);
+	WARN_ON(ret);
 
 	ret = btrfs_run_qgroups(trans, root->fs_info);
 	BUG_ON(ret);
@@ -874,6 +872,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 	switch_commit_root(fs_info->extent_root);
 	up_write(&fs_info->extent_commit_sem);
 
+	btrfs_after_dev_replace_commit(fs_info);
+
 	return 0;
 }
@@ -958,7 +958,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_trans_handle *trans;
 	int ret;
-	unsigned long nr;
 
 	if (xchg(&root->defrag_running, 1))
 		return 0;
@@ -970,9 +969,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 
 		ret = btrfs_defrag_leaves(trans, root, cacheonly);
 
-		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
-		btrfs_btree_balance_dirty(info->tree_root, nr);
+		btrfs_btree_balance_dirty(info->tree_root);
 		cond_resched();
 
 		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
@@ -1032,8 +1030,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
 
 	if (to_reserve > 0) {
-		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
-						  to_reserve);
+		ret = btrfs_block_rsv_add(root, &pending->block_rsv,
+					  to_reserve,
+					  BTRFS_RESERVE_NO_FLUSH);
 		if (ret) {
 			pending->error = ret;
 			goto no_free_objectid;
@@ -1191,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				    parent_inode, &key,
 				    BTRFS_FT_DIR, index);
 	/* We have checked the name at the beginning, so it is impossible. */
-	BUG_ON(ret == -EEXIST);
+	BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto fail;
@@ -1309,9 +1308,10 @@ static void do_async_commit(struct work_struct *work)
 	 * We've got freeze protection passed with the transaction.
 	 * Tell lockdep about it.
 	 */
-	rwsem_acquire_read(
-		&ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-		0, 1, _THIS_IP_);
+	if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
+		rwsem_acquire_read(
+		     &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+		     0, 1, _THIS_IP_);
 
 	current->journal_info = ac->newtrans;
 
@@ -1349,8 +1349,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	 * Tell lockdep we've released the freeze rwsem, since the
 	 * async commit thread will be the one to unlock it.
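Aside: the btrfs_wait_for_commit() rework near the top of this span is mostly about error semantics — when a specific transid is requested, ret starts out as -EINVAL and is reset to 0 only when that transaction is found or is already older than the newest committed one. The search reduces to this (illustrative types, ascending list assumed):

    #include <errno.h>
    #include <stddef.h>

    struct model_txn {
    	unsigned long long transid;
    	struct model_txn *next;	/* list kept in ascending transid order */
    };

    /* Find 'want'; -EINVAL only if it never existed and never will. */
    static int model_find_txn(struct model_txn *head, unsigned long long want,
    			  struct model_txn **found)
    {
    	struct model_txn *t;
    	int ret = -EINVAL;	/* assume the transid is bogus */

    	*found = NULL;
    	for (t = head; t; t = t->next) {
    		if (t->transid == want) {
    			*found = t;	/* caller will wait on this one */
    			ret = 0;
    			break;
    		}
    		if (t->transid > want) {
    			ret = 0;	/* already committed and freed: fine */
    			break;
    		}
    	}
    	return ret;
    }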
	 */
-	rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-		      1, _THIS_IP_);
+	if (trans->type < TRANS_JOIN_NOLOCK)
+		rwsem_release(
+			&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+			1, _THIS_IP_);
 
 	schedule_delayed_work(&ac->work, 0);
 
@@ -1400,6 +1402,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 }
 
+static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root)
+{
+	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
+	int snap_pending = 0;
+	int ret;
+
+	if (!flush_on_commit) {
+		spin_lock(&root->fs_info->trans_lock);
+		if (!list_empty(&trans->transaction->pending_snapshots))
+			snap_pending = 1;
+		spin_unlock(&root->fs_info->trans_lock);
+	}
+
+	if (flush_on_commit || snap_pending) {
+		btrfs_start_delalloc_inodes(root, 1);
+		btrfs_wait_ordered_extents(root, 1);
+	}
+
+	ret = btrfs_run_delayed_items(trans, root);
+	if (ret)
+		return ret;
+
+	/*
+	 * running the delayed items may have added new refs. account
+	 * them now so that they hinder processing of more delayed refs
+	 * as little as possible.
+	 */
+	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
+	/*
+	 * rename doesn't use btrfs_join_transaction, so, once we
+	 * set the transaction to blocked above, we aren't going
+	 * to get any new ordered operations.  We can safely run
+	 * it here and know for sure that nothing new will be added
+	 * to the list
+	 */
+	btrfs_run_ordered_operations(root, 1);
+
+	return 0;
+}
+
 /*
  * btrfs_transaction state sequence:
  *    in_commit = 0, blocked = 0  (initial)
@@ -1414,15 +1458,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_transaction *prev_trans = NULL;
 	DEFINE_WAIT(wait);
-	int ret = -EIO;
+	int ret;
 	int should_grow = 0;
 	unsigned long now = get_seconds();
-	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
 
-	btrfs_run_ordered_operations(root, 0);
+	ret = btrfs_run_ordered_operations(root, 0);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto cleanup_transaction;
+	}
 
-	if (cur_trans->aborted)
+	if (cur_trans->aborted) {
+		ret = cur_trans->aborted;
 		goto cleanup_transaction;
+	}
 
 	/* make a pass through all the delayed refs we have so far
 	 * any running procs may add more while we are here
@@ -1490,39 +1539,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		should_grow = 1;
 
 	do {
-		int snap_pending = 0;
-
 		joined = cur_trans->num_joined;
-		if (!list_empty(&trans->transaction->pending_snapshots))
-			snap_pending = 1;
 
 		WARN_ON(cur_trans != trans->transaction);
 
-		if (flush_on_commit || snap_pending) {
-			btrfs_start_delalloc_inodes(root, 1);
-			btrfs_wait_ordered_extents(root, 1);
-		}
-
-		ret = btrfs_run_delayed_items(trans, root);
+		ret = btrfs_flush_all_pending_stuffs(trans, root);
 		if (ret)
 			goto cleanup_transaction;
 
-		/*
-		 * running the delayed items may have added new refs. account
-		 * them now so that they hinder processing of more delayed refs
-		 * as little as possible.
-		 */
-		btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
-
-		/*
-		 * rename don't use btrfs_join_transaction, so, once we
-		 * set the transaction to blocked above, we aren't going
-		 * to get any new ordered operations.
We can safely run -		 * it here and no for sure that nothing new will be added -		 * to the list -		 */ -		btrfs_run_ordered_operations(root, 1); -  		prepare_to_wait(&cur_trans->writer_wait, &wait,  				TASK_UNINTERRUPTIBLE); @@ -1535,6 +1559,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	} while (atomic_read(&cur_trans->num_writers) > 1 ||  		 (should_grow && cur_trans->num_joined != joined)); +	ret = btrfs_flush_all_pending_stuffs(trans, root); +	if (ret) +		goto cleanup_transaction; +  	/*  	 * Ok now we need to make sure to block out any other joins while we  	 * commit the transaction.  We could have started a join before setting diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 80961947a6b2..0e8aa1e6c287 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,  			  struct btrfs_root *root);  struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,  						   int num_items); -struct btrfs_trans_handle *btrfs_start_transaction_noflush( +struct btrfs_trans_handle *btrfs_start_transaction_lflush(  					struct btrfs_root *root, int num_items);  struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);  struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 81e407d9677a..83186c7e45d4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,  			    struct btrfs_inode_item *item,  			    struct inode *inode, int log_inode_only)  { -	btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); -	btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); -	btrfs_set_inode_mode(leaf, item, inode->i_mode); -	btrfs_set_inode_nlink(leaf, item, inode->i_nlink); - -	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), -			       inode->i_atime.tv_sec); -	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), -				inode->i_atime.tv_nsec); - -	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), -			       inode->i_mtime.tv_sec); -	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), -				inode->i_mtime.tv_nsec); - -	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), -			       inode->i_ctime.tv_sec); -	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), -				inode->i_ctime.tv_nsec); - -	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); - -	btrfs_set_inode_sequence(leaf, item, inode->i_version); -	btrfs_set_inode_transid(leaf, item, trans->transid); -	btrfs_set_inode_rdev(leaf, item, inode->i_rdev); -	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); -	btrfs_set_inode_block_group(leaf, item, 0); +	struct btrfs_map_token token; + +	btrfs_init_map_token(&token);  	if (log_inode_only) {  		/* set the generation to zero so the recover code @@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,  		 * just to say 'this inode exists' and a logging  		 * to say 'update this inode with these values'  		 */ -		btrfs_set_inode_generation(leaf, item, 0); -		btrfs_set_inode_size(leaf, item, 0); +		btrfs_set_token_inode_generation(leaf, item, 0, &token); +		btrfs_set_token_inode_size(leaf, item, 0, &token);  	} else { -		btrfs_set_inode_generation(leaf, item, -					   BTRFS_I(inode)->generation); -		btrfs_set_inode_size(leaf, item, inode->i_size); -	} +		btrfs_set_token_inode_generation(leaf, item, +						 BTRFS_I(inode)->generation, +		
				 &token); +		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); +	} + +	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); +	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); +	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); +	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); + +	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), +				     inode->i_atime.tv_sec, &token); +	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), +				      inode->i_atime.tv_nsec, &token); + +	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), +				     inode->i_mtime.tv_sec, &token); +	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), +				      inode->i_mtime.tv_nsec, &token); + +	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), +				     inode->i_ctime.tv_sec, &token); +	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), +				      inode->i_ctime.tv_nsec, &token); + +	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), +				     &token); + +	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); +	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); +	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); +	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); +	btrfs_set_token_inode_block_group(leaf, item, 0, &token); +} +static int log_inode_item(struct btrfs_trans_handle *trans, +			  struct btrfs_root *log, struct btrfs_path *path, +			  struct inode *inode) +{ +	struct btrfs_inode_item *inode_item; +	struct btrfs_key key; +	int ret; + +	memcpy(&key, &BTRFS_I(inode)->location, sizeof(key)); +	ret = btrfs_insert_empty_item(trans, log, path, &key, +				      sizeof(*inode_item)); +	if (ret && ret != -EEXIST) +		return ret; +	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], +				    struct btrfs_inode_item); +	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); +	btrfs_release_path(path); +	return 0;  }  static noinline int copy_items(struct btrfs_trans_handle *trans, @@ -3130,151 +3155,234 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)  	return 0;  } -struct log_args { -	struct extent_buffer *src; -	u64 next_offset; -	int start_slot; -	int nr; -}; +static int drop_adjacent_extents(struct btrfs_trans_handle *trans, +				 struct btrfs_root *root, struct inode *inode, +				 struct extent_map *em, +				 struct btrfs_path *path) +{ +	struct btrfs_file_extent_item *fi; +	struct extent_buffer *leaf; +	struct btrfs_key key, new_key; +	struct btrfs_map_token token; +	u64 extent_end; +	u64 extent_offset = 0; +	int extent_type; +	int del_slot = 0; +	int del_nr = 0; +	int ret = 0; + +	while (1) { +		btrfs_init_map_token(&token); +		leaf = path->nodes[0]; +		path->slots[0]++; +		if (path->slots[0] >= btrfs_header_nritems(leaf)) { +			if (del_nr) { +				ret = btrfs_del_items(trans, root, path, +						      del_slot, del_nr); +				if (ret) +					return ret; +				del_nr = 0; +			} + +			ret = btrfs_next_leaf_write(trans, root, path, 1); +			if (ret < 0) +				return ret; +			if (ret > 0) +				return 0; +			leaf = path->nodes[0]; +		} + +		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +		if (key.objectid != btrfs_ino(inode) || +		    key.type != BTRFS_EXTENT_DATA_KEY || +		    key.offset >= em->start + em->len) +			break; + +		fi = btrfs_item_ptr(leaf, path->slots[0], +				    struct btrfs_file_extent_item); +		extent_type = btrfs_token_file_extent_type(leaf, fi, 
&token);
+		if (extent_type == BTRFS_FILE_EXTENT_REG ||
+		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			extent_offset = btrfs_token_file_extent_offset(leaf,
+								fi, &token);
+			extent_end = key.offset +
+				btrfs_token_file_extent_num_bytes(leaf, fi,
+								  &token);
+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			extent_end = key.offset +
+				btrfs_file_extent_inline_len(leaf, fi);
+		} else {
+			BUG();
+		}
+
+		if (extent_end <= em->len + em->start) {
+			if (!del_nr) {
+				del_slot = path->slots[0];
+			}
+			del_nr++;
+			continue;
+		}
+
+		/*
+		 * Ok so we'll ignore previous items if we log a new extent,
+		 * which can lead to overlapping extents, so if we have an
+		 * existing extent we want to adjust we _have_ to check the next
+		 * guy to make sure we even need this extent anymore, this keeps
+		 * us from panicking in set_item_key_safe.
+		 */
+		if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
+			struct btrfs_key tmp_key;
+
+			btrfs_item_key_to_cpu(leaf, &tmp_key,
+					      path->slots[0] + 1);
+			if (tmp_key.objectid == btrfs_ino(inode) &&
+			    tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
+			    tmp_key.offset <= em->start + em->len) {
+				if (!del_nr)
+					del_slot = path->slots[0];
+				del_nr++;
+				continue;
+			}
+		}
+
+		BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+		memcpy(&new_key, &key, sizeof(new_key));
+		new_key.offset = em->start + em->len;
+		btrfs_set_item_key_safe(trans, root, path, &new_key);
+		extent_offset += em->start + em->len - key.offset;
+		btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
+						   &token);
+		btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
+						      (em->start + em->len),
+						      &token);
+		btrfs_mark_buffer_dirty(leaf);
+	}
+
+	if (del_nr)
+		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+
+	return ret;
+}
 
 static int log_one_extent(struct btrfs_trans_handle *trans,
 			  struct inode *inode, struct btrfs_root *root,
-			  struct extent_map *em, struct btrfs_path *path,
-			  struct btrfs_path *dst_path, struct log_args *args)
+			  struct extent_map *em, struct btrfs_path *path)
 {
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf;
+	struct list_head ordered_sums;
+	struct btrfs_map_token token;
 	struct btrfs_key key;
-	u64 start = em->mod_start;
-	u64 search_start = start;
-	u64 len = em->mod_len;
-	u64 num_bytes;
-	int nritems;
+	u64 csum_offset = em->mod_start - em->start;
+	u64 csum_len = em->mod_len;
+	u64 extent_offset = em->start - em->orig_start;
+	u64 block_len;
 	int ret;
+	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
-	if (BTRFS_I(inode)->logged_trans == trans->transid) {
-		ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
-					   start + len, NULL, 0);
-		if (ret)
-			return ret;
+	INIT_LIST_HEAD(&ordered_sums);
+	btrfs_init_map_token(&token);
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = em->start;
+	path->really_keep_locks = 1;
+
+	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
+	if (ret && ret != -EEXIST) {
+		path->really_keep_locks = 0;
+		return ret;
 	}
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+					       &token);
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		skip_csum = true;
+		btrfs_set_token_file_extent_type(leaf,
fi, +						 BTRFS_FILE_EXTENT_PREALLOC, +						 &token); +	} else { +		btrfs_set_token_file_extent_type(leaf, fi, +						 BTRFS_FILE_EXTENT_REG, +						 &token); +		if (em->block_start == 0) +			skip_csum = true; +	} + +	block_len = max(em->block_len, em->orig_block_len); +	if (em->compress_type != BTRFS_COMPRESS_NONE) { +		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, +							em->block_start, +							&token); +		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, +							   &token); +	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) { +		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, +							em->block_start - +							extent_offset, &token); +		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, +							   &token); +	} else { +		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); +		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, +							   &token); +	} + +	btrfs_set_token_file_extent_offset(leaf, fi, +					   em->start - em->orig_start, +					   &token); +	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); +	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token); +	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, +						&token); +	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); +	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); +	btrfs_mark_buffer_dirty(leaf); -	while (len) { -		if (args->nr) -			goto next_slot; -again: -		key.objectid = btrfs_ino(inode); -		key.type = BTRFS_EXTENT_DATA_KEY; -		key.offset = search_start; - -		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); -		if (ret < 0) -			return ret; - -		if (ret) { -			/* -			 * A rare case were we can have an em for a section of a -			 * larger extent so we need to make sure that this em -			 * falls within the extent we've found.  If not we just -			 * bail and go back to ye-olde way of doing things but -			 * it happens often enough in testing that we need to do -			 * this dance to make sure. -			 */ -			do { -				if (path->slots[0] == 0) { -					btrfs_release_path(path); -					if (search_start == 0) -						return -ENOENT; -					search_start--; -					goto again; -				} - -				path->slots[0]--; -				btrfs_item_key_to_cpu(path->nodes[0], &key, -						      path->slots[0]); -				if (key.objectid != btrfs_ino(inode) || -				    key.type != BTRFS_EXTENT_DATA_KEY) { -					btrfs_release_path(path); -					return -ENOENT; -				} -			} while (key.offset > start); +	/* +	 * Have to check the extent to the right of us to make sure it doesn't +	 * fall in our current range.  We're ok if the previous extent is in our +	 * range since the recovery stuff will run us in key order and thus just +	 * drop the part we overwrote. 
+	 */ +	ret = drop_adjacent_extents(trans, log, inode, em, path); +	btrfs_release_path(path); +	path->really_keep_locks = 0; +	if (ret) { +		return ret; +	} -			fi = btrfs_item_ptr(path->nodes[0], path->slots[0], -					    struct btrfs_file_extent_item); -			num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], -								fi); -			if (key.offset + num_bytes <= start) { -				btrfs_release_path(path); -				return -ENOENT; -			} -		} -		args->src = path->nodes[0]; -next_slot: -		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); -		fi = btrfs_item_ptr(args->src, path->slots[0], -				    struct btrfs_file_extent_item); -		if (args->nr && -		    args->start_slot + args->nr == path->slots[0]) { -			args->nr++; -		} else if (args->nr) { -			ret = copy_items(trans, inode, dst_path, args->src, -					 args->start_slot, args->nr, -					 LOG_INODE_ALL); -			if (ret) -				return ret; -			args->nr = 1; -			args->start_slot = path->slots[0]; -		} else if (!args->nr) { -			args->nr = 1; -			args->start_slot = path->slots[0]; -		} -		nritems = btrfs_header_nritems(path->nodes[0]); -		path->slots[0]++; -		num_bytes = btrfs_file_extent_num_bytes(args->src, fi); -		if (len < num_bytes) { -			/* I _think_ this is ok, envision we write to a -			 * preallocated space that is adjacent to a previously -			 * written preallocated space that gets merged when we -			 * mark this preallocated space written.  If we do not -			 * have the adjacent extent in cache then when we copy -			 * this extent it could end up being larger than our EM -			 * thinks it is, which is a-ok, so just set len to 0. -			 */ -			len = 0; -		} else { -			len -= num_bytes; -		} -		start = key.offset + num_bytes; -		args->next_offset = start; -		search_start = start; +	if (skip_csum) +		return 0; -		if (path->slots[0] < nritems) { -			if (len) -				goto next_slot; -			break; -		} +	/* block start is already adjusted for the file extent offset. 
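Aside: the checksum copy just below follows a common kernel list pattern — drain the list unconditionally so every entry is freed, but stop doing real work after the first error, preserving that first error as the return value. In miniature (illustrative names):

    #include <stdlib.h>

    struct model_sum {
    	struct model_sum *next;
    };

    /* Drain and free the whole list; remember only the first emit error. */
    static int model_flush_sums(struct model_sum *head,
    			    int (*emit)(struct model_sum *))
    {
    	int ret = 0;

    	while (head) {
    		struct model_sum *cur = head;

    		head = cur->next;
    		if (!ret)
    			ret = emit(cur);	/* skip work after an error */
    		free(cur);			/* but always free the entry */
    	}
    	return ret;
    }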
*/ +	ret = btrfs_lookup_csums_range(log->fs_info->csum_root, +				       em->block_start + csum_offset, +				       em->block_start + csum_offset + +				       csum_len - 1, &ordered_sums, 0); +	if (ret) +		return ret; -		if (args->nr) { -			ret = copy_items(trans, inode, dst_path, args->src, -					 args->start_slot, args->nr, -					 LOG_INODE_ALL); -			if (ret) -				return ret; -			args->nr = 0; -			btrfs_release_path(path); -		} +	while (!list_empty(&ordered_sums)) { +		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, +						   struct btrfs_ordered_sum, +						   list); +		if (!ret) +			ret = btrfs_csum_file_blocks(trans, log, sums); +		list_del(&sums->list); +		kfree(sums);  	} -	return 0; +	return ret;  }  static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,  				     struct btrfs_root *root,  				     struct inode *inode, -				     struct btrfs_path *path, -				     struct btrfs_path *dst_path) +				     struct btrfs_path *path)  { -	struct log_args args;  	struct extent_map *em, *n;  	struct list_head extents;  	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; @@ -3283,8 +3391,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,  	INIT_LIST_HEAD(&extents); -	memset(&args, 0, sizeof(args)); -  	write_lock(&tree->lock);  	test_gen = root->fs_info->last_trans_committed; @@ -3317,34 +3423,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,  		write_unlock(&tree->lock); -		/* -		 * If the previous EM and the last extent we left off on aren't -		 * sequential then we need to copy the items we have and redo -		 * our search -		 */ -		if (args.nr && em->mod_start != args.next_offset) { -			ret = copy_items(trans, inode, dst_path, args.src, -					 args.start_slot, args.nr, -					 LOG_INODE_ALL); -			if (ret) { -				free_extent_map(em); -				write_lock(&tree->lock); -				continue; -			} -			btrfs_release_path(path); -			args.nr = 0; -		} - -		ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); +		ret = log_one_extent(trans, inode, root, em, path);  		free_extent_map(em);  		write_lock(&tree->lock);  	}  	WARN_ON(!list_empty(&extents));  	write_unlock(&tree->lock); -	if (!ret && args.nr) -		ret = copy_items(trans, inode, dst_path, args.src, -				 args.start_slot, args.nr, LOG_INODE_ALL);  	btrfs_release_path(path);  	return ret;  } @@ -3400,7 +3485,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,  	/* today the code can only do partial logging of directories */ -	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) +	if (S_ISDIR(inode->i_mode) || +	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, +		       &BTRFS_I(inode)->runtime_flags) && +	     inode_only == LOG_INODE_EXISTS))  		max_key.type = BTRFS_XATTR_ITEM_KEY;  	else  		max_key.type = (u8)-1; @@ -3432,14 +3520,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,  	} else {  		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,  				       &BTRFS_I(inode)->runtime_flags)) { +			clear_bit(BTRFS_INODE_COPY_EVERYTHING, +				  &BTRFS_I(inode)->runtime_flags);  			ret = btrfs_truncate_inode_items(trans, log,  							 inode, 0, 0); -		} else { -			fast_search = true; +		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, +					      &BTRFS_I(inode)->runtime_flags)) { +			if (inode_only == LOG_INODE_ALL) +				fast_search = true;  			max_key.type = BTRFS_XATTR_ITEM_KEY;  			ret = drop_objectid_items(trans, log, path, ino, -						  BTRFS_XATTR_ITEM_KEY); +						  max_key.type); +		} else { +			if 
(inode_only == LOG_INODE_ALL) +				fast_search = true; +			ret = log_inode_item(trans, log, dst_path, inode); +			if (ret) { +				err = ret; +				goto out_unlock; +			} +			goto log_extents;  		} +  	}  	if (ret) {  		err = ret; @@ -3518,11 +3620,10 @@ next_slot:  		ins_nr = 0;  	} +log_extents:  	if (fast_search) { -		btrfs_release_path(path);  		btrfs_release_path(dst_path); -		ret = btrfs_log_changed_extents(trans, root, inode, path, -						dst_path); +		ret = btrfs_log_changed_extents(trans, root, inode, dst_path);  		if (ret) {  			err = ret;  			goto out_unlock; @@ -3531,8 +3632,10 @@ next_slot:  		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;  		struct extent_map *em, *n; +		write_lock(&tree->lock);  		list_for_each_entry_safe(em, n, &tree->modified_extents, list)  			list_del_init(&em->list); +		write_unlock(&tree->lock);  	}  	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0f5ebb72a5ea..5cce6aa74012 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -25,7 +25,6 @@  #include <linux/capability.h>  #include <linux/ratelimit.h>  #include <linux/kthread.h> -#include <asm/div64.h>  #include "compat.h"  #include "ctree.h"  #include "extent_map.h" @@ -36,6 +35,8 @@  #include "async-thread.h"  #include "check-integrity.h"  #include "rcu-string.h" +#include "math.h" +#include "dev-replace.h"  static int init_first_rw_device(struct btrfs_trans_handle *trans,  				struct btrfs_root *root, @@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)  	kfree(fs_devices);  } +static void btrfs_kobject_uevent(struct block_device *bdev, +				 enum kobject_action action) +{ +	int ret; + +	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); +	if (ret) +		pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", +			action, +			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), +			&disk_to_dev(bdev->bd_disk)->kobj); +} +  void btrfs_cleanup_fs_uuids(void)  {  	struct btrfs_fs_devices *fs_devices; @@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)  	return NULL;  } +static int +btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, +		      int flush, struct block_device **bdev, +		      struct buffer_head **bh) +{ +	int ret; + +	*bdev = blkdev_get_by_path(device_path, flags, holder); + +	if (IS_ERR(*bdev)) { +		ret = PTR_ERR(*bdev); +		printk(KERN_INFO "btrfs: open %s failed\n", device_path); +		goto error; +	} + +	if (flush) +		filemap_write_and_wait((*bdev)->bd_inode->i_mapping); +	ret = set_blocksize(*bdev, 4096); +	if (ret) { +		blkdev_put(*bdev, flags); +		goto error; +	} +	invalidate_bdev(*bdev); +	*bh = btrfs_read_dev_super(*bdev); +	if (!*bh) { +		ret = -EINVAL; +		blkdev_put(*bdev, flags); +		goto error; +	} + +	return 0; + +error: +	*bdev = NULL; +	*bh = NULL; +	return ret; +} +  static void requeue_list(struct btrfs_pending_bios *pending_bios,  			struct bio *head, struct bio *tail)  { @@ -467,7 +519,8 @@ error:  	return ERR_PTR(-ENOMEM);  } -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, +			       struct btrfs_fs_devices *fs_devices, int step)  {  	struct btrfs_device *device, *next; @@ -480,8 +533,9 @@ again:  	/* This is the initialized path, it is safe to release the devices. 
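Aside: btrfs_get_bdev_and_sb(), added earlier in this volumes.c hunk, folds the open/flush/set-blocksize/read-super dance that used to be open-coded at several call sites into one helper with a single error exit that leaves *bdev and *bh NULL. The shape of that contract in plain userspace C (illustrative stand-ins, not the kernel API):

    #include <fcntl.h>
    #include <stdlib.h>
    #include <unistd.h>

    #define MODEL_SB_SIZE 4096

    /* Open a device and read its superblock; on any failure both outputs
     * are left in a well-defined empty state, like btrfs_get_bdev_and_sb(). */
    static int model_get_dev_and_sb(const char *path, int *fd_out, void **sb_out)
    {
    	void *sb = NULL;
    	int fd;

    	fd = open(path, O_RDONLY);
    	if (fd < 0)
    		goto error;
    	sb = malloc(MODEL_SB_SIZE);
    	if (!sb || pread(fd, sb, MODEL_SB_SIZE, 0) != MODEL_SB_SIZE) {
    		close(fd);		/* one cleanup path for every failure */
    		goto error;
    	}
    	*fd_out = fd;
    	*sb_out = sb;
    	return 0;

    error:
    	free(sb);			/* free(NULL) is a no-op */
    	*fd_out = -1;
    	*sb_out = NULL;
    	return -1;
    }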
*/  	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {  		if (device->in_fs_metadata) { -			if (!latest_transid || -			    device->generation > latest_transid) { +			if (!device->is_tgtdev_for_dev_replace && +			    (!latest_transid || +			     device->generation > latest_transid)) {  				latest_devid = device->devid;  				latest_transid = device->generation;  				latest_bdev = device->bdev; @@ -489,6 +543,21 @@ again:  			continue;  		} +		if (device->devid == BTRFS_DEV_REPLACE_DEVID) { +			/* +			 * In the first step, keep the device which has +			 * the correct fsid and the devid that is used +			 * for the dev_replace procedure. +			 * In the second step, the dev_replace state is +			 * read from the device tree and it is known +			 * whether the procedure is really active or +			 * not, which means whether this device is +			 * used or whether it should be removed. +			 */ +			if (step == 0 || device->is_tgtdev_for_dev_replace) { +				continue; +			} +		}  		if (device->bdev) {  			blkdev_put(device->bdev, device->mode);  			device->bdev = NULL; @@ -497,7 +566,8 @@ again:  		if (device->writeable) {  			list_del_init(&device->dev_alloc_list);  			device->writeable = 0; -			fs_devices->rw_devices--; +			if (!device->is_tgtdev_for_dev_replace) +				fs_devices->rw_devices--;  		}  		list_del_init(&device->dev_list);  		fs_devices->num_devices--; @@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)  		if (device->bdev)  			fs_devices->open_devices--; -		if (device->writeable) { +		if (device->writeable && !device->is_tgtdev_for_dev_replace) {  			list_del_init(&device->dev_alloc_list);  			fs_devices->rw_devices--;  		} @@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  		if (!device->name)  			continue; -		bdev = blkdev_get_by_path(device->name->str, flags, holder); -		if (IS_ERR(bdev)) { -			printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); -			goto error; -		} -		filemap_write_and_wait(bdev->bd_inode->i_mapping); -		invalidate_bdev(bdev); -		set_blocksize(bdev, 4096); - -		bh = btrfs_read_dev_super(bdev); -		if (!bh) -			goto error_close; +		ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, +					    &bdev, &bh); +		if (ret) +			continue;  		disk_super = (struct btrfs_super_block *)bh->b_data;  		devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  			fs_devices->rotating = 1;  		fs_devices->open_devices++; -		if (device->writeable) { +		if (device->writeable && !device->is_tgtdev_for_dev_replace) {  			fs_devices->rw_devices++;  			list_add(&device->dev_alloc_list,  				 &fs_devices->alloc_list); @@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  error_brelse:  		brelse(bh); -error_close:  		blkdev_put(bdev, flags); -error:  		continue;  	}  	if (fs_devices->open_devices == 0) { @@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,  	u64 total_devices;  	flags |= FMODE_EXCL; -	bdev = blkdev_get_by_path(path, flags, holder); - -	if (IS_ERR(bdev)) { -		ret = PTR_ERR(bdev); -		goto error; -	} -  	mutex_lock(&uuid_mutex); -	ret = set_blocksize(bdev, 4096); +	ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);  	if (ret) -		goto error_close; -	bh = btrfs_read_dev_super(bdev); -	if (!bh) { -		ret = -EINVAL; -		goto error_close; -	} +		goto error;  	disk_super = (struct 
btrfs_super_block *)bh->b_data;  	devid = btrfs_stack_device_id(&disk_super->dev_item);  	transid = btrfs_super_generation(disk_super);  	total_devices = btrfs_super_num_devices(disk_super); -	if (disk_super->label[0]) +	if (disk_super->label[0]) { +		if (disk_super->label[BTRFS_LABEL_SIZE - 1]) +			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';  		printk(KERN_INFO "device label %s ", disk_super->label); -	else +	} else {  		printk(KERN_INFO "device fsid %pU ", disk_super->fsid); +	}  	printk(KERN_CONT "devid %llu transid %llu %s\n",  	       (unsigned long long)devid, (unsigned long long)transid, path);  	ret = device_list_add(path, disk_super, devid, fs_devices_ret);  	if (!ret && fs_devices_ret)  		(*fs_devices_ret)->total_devices = total_devices;  	brelse(bh); -error_close: -	mutex_unlock(&uuid_mutex);  	blkdev_put(bdev, flags);  error: +	mutex_unlock(&uuid_mutex);  	return ret;  } @@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,  	*length = 0; -	if (start >= device->total_bytes) +	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)  		return 0;  	path = btrfs_alloc_path(); @@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,  	max_hole_size = 0;  	hole_size = 0; -	if (search_start >= search_end) { +	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {  		ret = -ENOSPC;  		goto error;  	} @@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,  	struct btrfs_key key;  	WARN_ON(!device->in_fs_metadata); +	WARN_ON(device->is_tgtdev_for_dev_replace);  	path = btrfs_alloc_path();  	if (!path)  		return -ENOMEM; @@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		root->fs_info->avail_system_alloc_bits |  		root->fs_info->avail_metadata_alloc_bits; -	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && -	    root->fs_info->fs_devices->num_devices <= 4) { +	num_devices = root->fs_info->fs_devices->num_devices; +	btrfs_dev_replace_lock(&root->fs_info->dev_replace); +	if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { +		WARN_ON(num_devices < 1); +		num_devices--; +	} +	btrfs_dev_replace_unlock(&root->fs_info->dev_replace); + +	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {  		printk(KERN_ERR "btrfs: unable to go below four devices "  		       "on raid10\n");  		ret = -EINVAL;  		goto out;  	} -	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && -	    root->fs_info->fs_devices->num_devices <= 2) { +	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {  		printk(KERN_ERR "btrfs: unable to go below two "  		       "devices on raid1\n");  		ret = -EINVAL; @@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		 * is held.  		 
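Aside: the btrfs_rm_device() change above discounts a running replace target from num_devices before applying the RAID floor checks, since the target is not yet a full member of the array. The arithmetic and the two floors, in isolation (illustrative flags, not btrfs constants):

    #include <errno.h>
    #include <stdbool.h>

    #define MODEL_RAID10	(1u << 0)
    #define MODEL_RAID1	(1u << 1)

    /* May a device be removed, given the profiles in use? */
    static int model_check_removal(unsigned int num_devices, bool replace_running,
    			       unsigned int profiles)
    {
    	if (replace_running)
    		num_devices--;	/* replace target is not a real member */

    	if ((profiles & MODEL_RAID10) && num_devices <= 4)
    		return -EINVAL;	/* cannot go below four on raid10 */
    	if ((profiles & MODEL_RAID1) && num_devices <= 2)
    		return -EINVAL;	/* cannot go below two on raid1 */
    	return 0;
    }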
*/  		list_for_each_entry(tmp, devices, dev_list) { -			if (tmp->in_fs_metadata && !tmp->bdev) { +			if (tmp->in_fs_metadata && +			    !tmp->is_tgtdev_for_dev_replace && +			    !tmp->bdev) {  				device = tmp;  				break;  			} @@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  			goto out;  		}  	} else { -		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, -					  root->fs_info->bdev_holder); -		if (IS_ERR(bdev)) { -			ret = PTR_ERR(bdev); +		ret = btrfs_get_bdev_and_sb(device_path, +					    FMODE_READ | FMODE_EXCL, +					    root->fs_info->bdev_holder, 0, +					    &bdev, &bh); +		if (ret)  			goto out; -		} - -		set_blocksize(bdev, 4096); -		invalidate_bdev(bdev); -		bh = btrfs_read_dev_super(bdev); -		if (!bh) { -			ret = -EINVAL; -			goto error_close; -		}  		disk_super = (struct btrfs_super_block *)bh->b_data;  		devid = btrfs_stack_device_id(&disk_super->dev_item);  		dev_uuid = disk_super->dev_item.uuid; -		device = btrfs_find_device(root, devid, dev_uuid, +		device = btrfs_find_device(root->fs_info, devid, dev_uuid,  					   disk_super->fsid);  		if (!device) {  			ret = -ENOENT; @@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  		}  	} +	if (device->is_tgtdev_for_dev_replace) { +		pr_err("btrfs: unable to remove the dev_replace target dev\n"); +		ret = -EINVAL; +		goto error_brelse; +	} +  	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {  		printk(KERN_ERR "btrfs: unable to remove the only writeable "  		       "device\n"); @@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	if (ret)  		goto error_undo; +	/* +	 * TODO: the superblock still includes this device in its num_devices +	 * counter although write_all_supers() is not locked out. This +	 * could give a filesystem state which requires a degraded mount. +	 */  	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);  	if (ret)  		goto error_undo; @@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	spin_unlock(&root->fs_info->free_chunk_lock);  	device->in_fs_metadata = 0; -	btrfs_scrub_cancel_dev(root, device); +	btrfs_scrub_cancel_dev(root->fs_info, device);  	/*  	 * the device list mutex makes sure that we don't change @@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	 * at this point, the device is zero sized.  
We want to  	 * remove it from the devices list and zero out the old super  	 */ -	if (clear_super) { +	if (clear_super && disk_super) {  		/* make sure this device isn't detected as part of  		 * the FS anymore  		 */ @@ -1493,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)  	ret = 0; +	/* Notify udev that device has changed */ +	btrfs_kobject_uevent(bdev, KOBJ_CHANGE); +  error_brelse:  	brelse(bh); -error_close:  	if (bdev)  		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);  out: @@ -1512,6 +1576,112 @@ error_undo:  	goto error_brelse;  } +void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, +				 struct btrfs_device *srcdev) +{ +	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); +	list_del_rcu(&srcdev->dev_list); +	list_del_rcu(&srcdev->dev_alloc_list); +	fs_info->fs_devices->num_devices--; +	if (srcdev->missing) { +		fs_info->fs_devices->missing_devices--; +		fs_info->fs_devices->rw_devices++; +	} +	if (srcdev->can_discard) +		fs_info->fs_devices->num_can_discard--; +	if (srcdev->bdev) +		fs_info->fs_devices->open_devices--; + +	call_rcu(&srcdev->rcu, free_device); +} + +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, +				      struct btrfs_device *tgtdev) +{ +	struct btrfs_device *next_device; + +	WARN_ON(!tgtdev); +	mutex_lock(&fs_info->fs_devices->device_list_mutex); +	if (tgtdev->bdev) { +		btrfs_scratch_superblock(tgtdev); +		fs_info->fs_devices->open_devices--; +	} +	fs_info->fs_devices->num_devices--; +	if (tgtdev->can_discard) +		fs_info->fs_devices->num_can_discard++; + +	next_device = list_entry(fs_info->fs_devices->devices.next, +				 struct btrfs_device, dev_list); +	if (tgtdev->bdev == fs_info->sb->s_bdev) +		fs_info->sb->s_bdev = next_device->bdev; +	if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) +		fs_info->fs_devices->latest_bdev = next_device->bdev; +	list_del_rcu(&tgtdev->dev_list); + +	call_rcu(&tgtdev->rcu, free_device); + +	mutex_unlock(&fs_info->fs_devices->device_list_mutex); +} + +int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, +			      struct btrfs_device **device) +{ +	int ret = 0; +	struct btrfs_super_block *disk_super; +	u64 devid; +	u8 *dev_uuid; +	struct block_device *bdev; +	struct buffer_head *bh; + +	*device = NULL; +	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, +				    root->fs_info->bdev_holder, 0, &bdev, &bh); +	if (ret) +		return ret; +	disk_super = (struct btrfs_super_block *)bh->b_data; +	devid = btrfs_stack_device_id(&disk_super->dev_item); +	dev_uuid = disk_super->dev_item.uuid; +	*device = btrfs_find_device(root->fs_info, devid, dev_uuid, +				    disk_super->fsid); +	brelse(bh); +	if (!*device) +		ret = -ENOENT; +	blkdev_put(bdev, FMODE_READ); +	return ret; +} + +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, +					 char *device_path, +					 struct btrfs_device **device) +{ +	*device = NULL; +	if (strcmp(device_path, "missing") == 0) { +		struct list_head *devices; +		struct btrfs_device *tmp; + +		devices = &root->fs_info->fs_devices->devices; +		/* +		 * It is safe to read the devices since the volume_mutex +		 * is held by the caller. 
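+		 * A device that is recorded in the metadata but has no bdev
+		 * attached is a "missing" one.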
+		 */ +		list_for_each_entry(tmp, devices, dev_list) { +			if (tmp->in_fs_metadata && !tmp->bdev) { +				*device = tmp; +				break; +			} +		} + +		if (!*device) { +			pr_err("btrfs: no missing device found\n"); +			return -ENOENT; +		} + +		return 0; +	} else { +		return btrfs_find_device_by_path(root, device_path, device); +	} +} +  /*   * does all the dirty work required for changing file system's UUID.   */ @@ -1630,7 +1800,8 @@ next_slot:  		read_extent_buffer(leaf, fs_uuid,  				   (unsigned long)btrfs_device_fsid(dev_item),  				   BTRFS_UUID_SIZE); -		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); +		device = btrfs_find_device(root->fs_info, devid, dev_uuid, +					   fs_uuid);  		BUG_ON(!device); /* Logic error */  		if (device->fs_devices->seeding) { @@ -1678,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)  	filemap_write_and_wait(bdev->bd_inode->i_mapping);  	devices = &root->fs_info->fs_devices->devices; -	/* -	 * we have the volume lock, so we don't need the extra -	 * device list mutex while reading the list here. -	 */ + +	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);  	list_for_each_entry(device, devices, dev_list) {  		if (device->bdev == bdev) {  			ret = -EEXIST; +			mutex_unlock( +				&root->fs_info->fs_devices->device_list_mutex);  			goto error;  		}  	} +	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);  	device = kzalloc(sizeof(*device), GFP_NOFS);  	if (!device) { @@ -1737,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)  	device->dev_root = root->fs_info->dev_root;  	device->bdev = bdev;  	device->in_fs_metadata = 1; +	device->is_tgtdev_for_dev_replace = 0;  	device->mode = FMODE_EXCL;  	set_blocksize(device->bdev, 4096); @@ -1844,6 +2017,98 @@ error:  	return ret;  } +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, +				  struct btrfs_device **device_out) +{ +	struct request_queue *q; +	struct btrfs_device *device; +	struct block_device *bdev; +	struct btrfs_fs_info *fs_info = root->fs_info; +	struct list_head *devices; +	struct rcu_string *name; +	int ret = 0; + +	*device_out = NULL; +	if (fs_info->fs_devices->seeding) +		return -EINVAL; + +	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, +				  fs_info->bdev_holder); +	if (IS_ERR(bdev)) +		return PTR_ERR(bdev); + +	filemap_write_and_wait(bdev->bd_inode->i_mapping); + +	devices = &fs_info->fs_devices->devices; +	list_for_each_entry(device, devices, dev_list) { +		if (device->bdev == bdev) { +			ret = -EEXIST; +			goto error; +		} +	} + +	device = kzalloc(sizeof(*device), GFP_NOFS); +	if (!device) { +		ret = -ENOMEM; +		goto error; +	} + +	name = rcu_string_strdup(device_path, GFP_NOFS); +	if (!name) { +		kfree(device); +		ret = -ENOMEM; +		goto error; +	} +	rcu_assign_pointer(device->name, name); + +	q = bdev_get_queue(bdev); +	if (blk_queue_discard(q)) +		device->can_discard = 1; +	mutex_lock(&root->fs_info->fs_devices->device_list_mutex); +	device->writeable = 1; +	device->work.func = pending_bios_fn; +	generate_random_uuid(device->uuid); +	device->devid = BTRFS_DEV_REPLACE_DEVID; +	spin_lock_init(&device->io_lock); +	device->generation = 0; +	device->io_width = root->sectorsize; +	device->io_align = root->sectorsize; +	device->sector_size = root->sectorsize; +	device->total_bytes = i_size_read(bdev->bd_inode); +	device->disk_total_bytes = device->total_bytes; +	device->dev_root = fs_info->dev_root; +	device->bdev = bdev; +	device->in_fs_metadata = 1; +	
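/*
 * Illustrative sketch, not from this patch: the replace target above is
 * set up in memory only (no dev_item is written here); it carries the
 * reserved BTRFS_DEV_REPLACE_DEVID plus a freshly generated uuid, and
 * the is_tgtdev_for_dev_replace flag assigned just below keeps the
 * chunk allocator and the shrink/grow paths away from it. With stand-in
 * types (the devid value is an assumption):
 *
 *	struct fake_dev {
 *		unsigned long long devid;
 *		int writeable, in_fs_metadata, is_tgtdev_for_dev_replace;
 *	};
 *
 *	static void mark_replace_target(struct fake_dev *d)
 *	{
 *		d->devid = (unsigned long long)-1;	// assumed reserved id
 *		d->writeable = 1;	// receives duplicated writes
 *		d->in_fs_metadata = 1;	// joins the device lists
 *		d->is_tgtdev_for_dev_replace = 1;	// allocator skips it
 *	}
 */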
device->is_tgtdev_for_dev_replace = 1; +	device->mode = FMODE_EXCL; +	set_blocksize(device->bdev, 4096); +	device->fs_devices = fs_info->fs_devices; +	list_add(&device->dev_list, &fs_info->fs_devices->devices); +	fs_info->fs_devices->num_devices++; +	fs_info->fs_devices->open_devices++; +	if (device->can_discard) +		fs_info->fs_devices->num_can_discard++; +	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + +	*device_out = device; +	return ret; + +error: +	blkdev_put(bdev, FMODE_EXCL); +	return ret; +} + +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, +					      struct btrfs_device *tgtdev) +{ +	WARN_ON(fs_info->fs_devices->rw_devices == 0); +	tgtdev->io_width = fs_info->dev_root->sectorsize; +	tgtdev->io_align = fs_info->dev_root->sectorsize; +	tgtdev->sector_size = fs_info->dev_root->sectorsize; +	tgtdev->dev_root = fs_info->dev_root; +	tgtdev->in_fs_metadata = 1; +} +  static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,  					struct btrfs_device *device)  { @@ -1900,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,  	if (!device->writeable)  		return -EACCES; -	if (new_size <= device->total_bytes) +	if (new_size <= device->total_bytes || +	    device->is_tgtdev_for_dev_replace)  		return -EINVAL;  	btrfs_set_super_total_bytes(super_copy, old_total + diff); @@ -2338,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type,  	return 1;  } -static u64 div_factor_fine(u64 num, int factor) -{ -	if (factor <= 0) -		return 0; -	if (factor >= 100) -		return num; - -	num *= factor; -	do_div(num, 100); -	return num; -} -  static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,  			      struct btrfs_balance_args *bargs)  { @@ -2514,15 +2768,6 @@ static int should_balance_chunk(struct btrfs_root *root,  	return 1;  } -static u64 div_factor(u64 num, int factor) -{ -	if (factor == 10) -		return num; -	num *= factor; -	do_div(num, 10); -	return num; -} -  static int __btrfs_balance(struct btrfs_fs_info *fs_info)  {  	struct btrfs_balance_control *bctl = fs_info->balance_ctl; @@ -2550,7 +2795,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)  		size_to_free = div_factor(old_size, 1);  		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);  		if (!device->writeable || -		    device->total_bytes - device->bytes_used > size_to_free) +		    device->total_bytes - device->bytes_used > size_to_free || +		    device->is_tgtdev_for_dev_replace)  			continue;  		ret = btrfs_shrink_device(device, old_size - size_to_free); @@ -2728,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  	u64 allowed;  	int mixed = 0;  	int ret; +	u64 num_devices;  	if (btrfs_fs_closing(fs_info) ||  	    atomic_read(&fs_info->balance_pause_req) || @@ -2756,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,  		}  	} +	num_devices = fs_info->fs_devices->num_devices; +	btrfs_dev_replace_lock(&fs_info->dev_replace); +	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { +		BUG_ON(num_devices < 1); +		num_devices--; +	} +	btrfs_dev_replace_unlock(&fs_info->dev_replace);  	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; -	if (fs_info->fs_devices->num_devices == 1) +	if (num_devices == 1)  		allowed |= BTRFS_BLOCK_GROUP_DUP; -	else if (fs_info->fs_devices->num_devices < 4) +	else if (num_devices < 4)  		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);  	else  		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | @@ -2902,6 +3156,7 @@ static int 
balance_kthread(void *data)  		ret = btrfs_balance(fs_info->balance_ctl, NULL);  	} +	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);  	mutex_unlock(&fs_info->balance_mutex);  	mutex_unlock(&fs_info->volume_mutex); @@ -2924,6 +3179,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)  		return 0;  	} +	WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));  	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");  	if (IS_ERR(tsk))  		return PTR_ERR(tsk); @@ -3080,7 +3336,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)  	u64 old_size = device->total_bytes;  	u64 diff = device->total_bytes - new_size; -	if (new_size >= device->total_bytes) +	if (device->is_tgtdev_for_dev_replace)  		return -EINVAL;  	path = btrfs_alloc_path(); @@ -3235,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)  	return 0;  } +struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { +	{ 2, 1, 0, 4, 2, 2 /* raid10 */ }, +	{ 1, 1, 2, 2, 2, 2 /* raid1 */ }, +	{ 1, 2, 1, 1, 1, 2 /* dup */ }, +	{ 1, 1, 0, 2, 1, 1 /* raid0 */ }, +	{ 1, 1, 0, 1, 1, 1 /* single */ }, +}; +  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  			       struct btrfs_root *extent_root,  			       struct map_lookup **map_ret, @@ -3264,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  	int ndevs;  	int i;  	int j; +	int index;  	BUG_ON(!alloc_profile_is_valid(type, 0));  	if (list_empty(&fs_devices->alloc_list))  		return -ENOSPC; -	sub_stripes = 1; -	dev_stripes = 1; -	devs_increment = 1; -	ncopies = 1; -	devs_max = 0;	/* 0 == as many as possible */ -	devs_min = 1; +	index = __get_raid_index(type); -	/* -	 * define the properties of each RAID type. -	 * FIXME: move this to a global table and use it in all RAID -	 * calculation code -	 */ -	if (type & (BTRFS_BLOCK_GROUP_DUP)) { -		dev_stripes = 2; -		ncopies = 2; -		devs_max = 1; -	} else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { -		devs_min = 2; -	} else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { -		devs_increment = 2; -		ncopies = 2; -		devs_max = 2; -		devs_min = 2; -	} else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { -		sub_stripes = 2; -		devs_increment = 2; -		ncopies = 2; -		devs_min = 4; -	} else { -		devs_max = 1; -	} +	sub_stripes = btrfs_raid_array[index].sub_stripes; +	dev_stripes = btrfs_raid_array[index].dev_stripes; +	devs_max = btrfs_raid_array[index].devs_max; +	devs_min = btrfs_raid_array[index].devs_min; +	devs_increment = btrfs_raid_array[index].devs_increment; +	ncopies = btrfs_raid_array[index].ncopies;  	if (type & BTRFS_BLOCK_GROUP_DATA) {  		max_stripe_size = 1024 * 1024 * 1024; @@ -3347,13 +3589,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  		cur = cur->next;  		if (!device->writeable) { -			printk(KERN_ERR +			WARN(1, KERN_ERR  			       "btrfs: read-only device in alloc_list\n"); -			WARN_ON(1);  			continue;  		} -		if (!device->in_fs_metadata) +		if (!device->in_fs_metadata || +		    device->is_tgtdev_for_dev_replace)  			continue;  		if (device->total_bytes > device->bytes_used) @@ -3382,6 +3624,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,  		devices_info[ndevs].total_avail = total_avail;  		devices_info[ndevs].dev = device;  		++ndevs; +		WARN_ON(ndevs > fs_devices->rw_devices);  	}  	/* @@ -3740,8 +3983,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)  	}  } -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +int 
btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)  { +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;  	struct extent_map *em;  	struct map_lookup *map;  	struct extent_map_tree *em_tree = &map_tree->map_tree; @@ -3761,32 +4005,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)  	else  		ret = 1;  	free_extent_map(em); + +	btrfs_dev_replace_lock(&fs_info->dev_replace); +	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) +		ret++; +	btrfs_dev_replace_unlock(&fs_info->dev_replace); +  	return ret;  } -static int find_live_mirror(struct map_lookup *map, int first, int num, -			    int optimal) +static int find_live_mirror(struct btrfs_fs_info *fs_info, +			    struct map_lookup *map, int first, int num, +			    int optimal, int dev_replace_is_ongoing)  {  	int i; -	if (map->stripes[optimal].dev->bdev) -		return optimal; -	for (i = first; i < first + num; i++) { -		if (map->stripes[i].dev->bdev) -			return i; +	int tolerance; +	struct btrfs_device *srcdev; + +	if (dev_replace_is_ongoing && +	    fs_info->dev_replace.cont_reading_from_srcdev_mode == +	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) +		srcdev = fs_info->dev_replace.srcdev; +	else +		srcdev = NULL; + +	/* +	 * try to avoid the drive that is the source drive for a +	 * dev-replace procedure, only choose it if no other non-missing +	 * mirror is available +	 */ +	for (tolerance = 0; tolerance < 2; tolerance++) { +		if (map->stripes[optimal].dev->bdev && +		    (tolerance || map->stripes[optimal].dev != srcdev)) +			return optimal; +		for (i = first; i < first + num; i++) { +			if (map->stripes[i].dev->bdev && +			    (tolerance || map->stripes[i].dev != srcdev)) +				return i; +		}  	} +  	/* we couldn't find one that doesn't fail.  
Just return something  	 * and the io error handling code will clean up eventually  	 */  	return optimal;  } -static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  			     u64 logical, u64 *length,  			     struct btrfs_bio **bbio_ret,  			     int mirror_num)  {  	struct extent_map *em;  	struct map_lookup *map; +	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;  	struct extent_map_tree *em_tree = &map_tree->map_tree;  	u64 offset;  	u64 stripe_offset; @@ -3800,6 +4072,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  	int num_stripes;  	int max_errors = 0;  	struct btrfs_bio *bbio = NULL; +	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; +	int dev_replace_is_ongoing = 0; +	int num_alloc_stripes; +	int patch_the_first_stripe_for_dev_replace = 0; +	u64 physical_to_patch_in_first_stripe = 0;  	read_lock(&em_tree->lock);  	em = lookup_extent_mapping(em_tree, logical, *length); @@ -3816,9 +4093,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  	map = (struct map_lookup *)em->bdev;  	offset = logical - em->start; -	if (mirror_num > map->num_stripes) -		mirror_num = 0; -  	stripe_nr = offset;  	/*  	 * stripe_nr counts the total number of stripes we have to stride @@ -3845,6 +4119,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  	if (!bbio_ret)  		goto out; +	btrfs_dev_replace_lock(dev_replace); +	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); +	if (!dev_replace_is_ongoing) +		btrfs_dev_replace_unlock(dev_replace); + +	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && +	    !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && +	    dev_replace->tgtdev != NULL) { +		/* +		 * in dev-replace case, for repair case (that's the only +		 * case where the mirror is selected explicitly when +		 * calling btrfs_map_block), blocks left of the left cursor +		 * can also be read from the target drive. +		 * For REQ_GET_READ_MIRRORS, the target drive is added as +		 * the last one to the array of stripes. For READ, it also +		 * needs to be supported using the same mirror number. +		 * If the requested block is not left of the left cursor, +		 * EIO is returned. This can happen because btrfs_num_copies() +		 * returns one more in the dev-replace case. +		 */ +		u64 tmp_length = *length; +		struct btrfs_bio *tmp_bbio = NULL; +		int tmp_num_stripes; +		u64 srcdev_devid = dev_replace->srcdev->devid; +		int index_srcdev = 0; +		int found = 0; +		u64 physical_of_found = 0; + +		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, +			     logical, &tmp_length, &tmp_bbio, 0); +		if (ret) { +			WARN_ON(tmp_bbio != NULL); +			goto out; +		} + +		tmp_num_stripes = tmp_bbio->num_stripes; +		if (mirror_num > tmp_num_stripes) { +			/* +			 * REQ_GET_READ_MIRRORS does not contain this +			 * mirror, that means that the requested area +			 * is not left of the left cursor +			 */ +			ret = -EIO; +			kfree(tmp_bbio); +			goto out; +		} + +		/* +		 * process the rest of the function using the mirror_num +		 * of the source drive. Therefore look it up first. +		 * At the end, patch the device pointer to the one of the +		 * target drive. 
+		 */ +		for (i = 0; i < tmp_num_stripes; i++) { +			if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { +				/* +				 * In case of DUP, in order to keep it +				 * simple, only add the mirror with the +				 * lowest physical address +				 */ +				if (found && +				    physical_of_found <= +				     tmp_bbio->stripes[i].physical) +					continue; +				index_srcdev = i; +				found = 1; +				physical_of_found = +					tmp_bbio->stripes[i].physical; +			} +		} + +		if (found) { +			mirror_num = index_srcdev + 1; +			patch_the_first_stripe_for_dev_replace = 1; +			physical_to_patch_in_first_stripe = physical_of_found; +		} else { +			WARN_ON(1); +			ret = -EIO; +			kfree(tmp_bbio); +			goto out; +		} + +		kfree(tmp_bbio); +	} else if (mirror_num > map->num_stripes) { +		mirror_num = 0; +	} +  	num_stripes = 1;  	stripe_index = 0;  	stripe_nr_orig = stripe_nr; @@ -3859,19 +4220,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  					    stripe_nr_end - stripe_nr_orig);  		stripe_index = do_div(stripe_nr, map->num_stripes);  	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { -		if (rw & (REQ_WRITE | REQ_DISCARD)) +		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))  			num_stripes = map->num_stripes;  		else if (mirror_num)  			stripe_index = mirror_num - 1;  		else { -			stripe_index = find_live_mirror(map, 0, +			stripe_index = find_live_mirror(fs_info, map, 0,  					    map->num_stripes, -					    current->pid % map->num_stripes); +					    current->pid % map->num_stripes, +					    dev_replace_is_ongoing);  			mirror_num = stripe_index + 1;  		}  	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) { -		if (rw & (REQ_WRITE | REQ_DISCARD)) { +		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {  			num_stripes = map->num_stripes;  		} else if (mirror_num) {  			stripe_index = mirror_num - 1; @@ -3885,7 +4247,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  		stripe_index = do_div(stripe_nr, factor);  		stripe_index *= map->sub_stripes; -		if (rw & REQ_WRITE) +		if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))  			num_stripes = map->sub_stripes;  		else if (rw & REQ_DISCARD)  			num_stripes = min_t(u64, map->sub_stripes * @@ -3895,9 +4257,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  			stripe_index += mirror_num - 1;  		else {  			int old_stripe_index = stripe_index; -			stripe_index = find_live_mirror(map, stripe_index, +			stripe_index = find_live_mirror(fs_info, map, +					      stripe_index,  					      map->sub_stripes, stripe_index + -					      current->pid % map->sub_stripes); +					      current->pid % map->sub_stripes, +					      dev_replace_is_ongoing);  			mirror_num = stripe_index - old_stripe_index + 1;  		}  	} else { @@ -3911,7 +4275,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  	}  	BUG_ON(stripe_index >= map->num_stripes); -	bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); +	num_alloc_stripes = num_stripes; +	if (dev_replace_is_ongoing) { +		if (rw & (REQ_WRITE | REQ_DISCARD)) +			num_alloc_stripes <<= 1; +		if (rw & REQ_GET_READ_MIRRORS) +			num_alloc_stripes++; +	} +	bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);  	if (!bbio) {  		ret = -ENOMEM;  		goto out; @@ -3998,7 +4369,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  		}  	} -	if (rw & REQ_WRITE) { +	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {  		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |  				 BTRFS_BLOCK_GROUP_RAID10 |  		
		 BTRFS_BLOCK_GROUP_DUP)) { @@ -4006,20 +4377,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,  		}  	} +	if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && +	    dev_replace->tgtdev != NULL) { +		int index_where_to_add; +		u64 srcdev_devid = dev_replace->srcdev->devid; + +		/* +		 * duplicate the write operations while the dev replace +		 * procedure is running. Since the copying of the old disk +		 * to the new disk takes place at run time while the +		 * filesystem is mounted writable, the regular write +		 * operations to the old disk have to be duplicated to go +		 * to the new disk as well. +		 * Note that device->missing is handled by the caller, and +		 * that the write to the old disk is already set up in the +		 * stripes array. +		 */ +		index_where_to_add = num_stripes; +		for (i = 0; i < num_stripes; i++) { +			if (bbio->stripes[i].dev->devid == srcdev_devid) { +				/* write to new disk, too */ +				struct btrfs_bio_stripe *new = +					bbio->stripes + index_where_to_add; +				struct btrfs_bio_stripe *old = +					bbio->stripes + i; + +				new->physical = old->physical; +				new->length = old->length; +				new->dev = dev_replace->tgtdev; +				index_where_to_add++; +				max_errors++; +			} +		} +		num_stripes = index_where_to_add; +	} else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && +		   dev_replace->tgtdev != NULL) { +		u64 srcdev_devid = dev_replace->srcdev->devid; +		int index_srcdev = 0; +		int found = 0; +		u64 physical_of_found = 0; + +		/* +		 * During the dev-replace procedure, the target drive can +		 * also be used to read data in case it is needed to repair +		 * a corrupt block elsewhere. This is possible if the +		 * requested area is left of the left cursor. In this area, +		 * the target drive is a full copy of the source drive. 
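+		 * Blocks that are not yet strictly left of the left cursor
+		 * have not been copied over and must never be read from the
+		 * target.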
+		 */ +		for (i = 0; i < num_stripes; i++) { +			if (bbio->stripes[i].dev->devid == srcdev_devid) { +				/* +				 * In case of DUP, in order to keep it +				 * simple, only add the mirror with the +				 * lowest physical address +				 */ +				if (found && +				    physical_of_found <= +				     bbio->stripes[i].physical) +					continue; +				index_srcdev = i; +				found = 1; +				physical_of_found = bbio->stripes[i].physical; +			} +		} +		if (found) { +			u64 length = map->stripe_len; + +			if (physical_of_found + length <= +			    dev_replace->cursor_left) { +				struct btrfs_bio_stripe *tgtdev_stripe = +					bbio->stripes + num_stripes; + +				tgtdev_stripe->physical = physical_of_found; +				tgtdev_stripe->length = +					bbio->stripes[index_srcdev].length; +				tgtdev_stripe->dev = dev_replace->tgtdev; + +				num_stripes++; +			} +		} +	} +  	*bbio_ret = bbio;  	bbio->num_stripes = num_stripes;  	bbio->max_errors = max_errors;  	bbio->mirror_num = mirror_num; + +	/* +	 * this is the case that REQ_READ && dev_replace_is_ongoing && +	 * mirror_num == num_stripes + 1 && dev_replace target drive is +	 * available as a mirror +	 */ +	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { +		WARN_ON(num_stripes > 1); +		bbio->stripes[0].dev = dev_replace->tgtdev; +		bbio->stripes[0].physical = physical_to_patch_in_first_stripe; +		bbio->mirror_num = map->num_stripes + 1; +	}  out: +	if (dev_replace_is_ongoing) +		btrfs_dev_replace_unlock(dev_replace);  	free_extent_map(em);  	return ret;  } -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  		      u64 logical, u64 *length,  		      struct btrfs_bio **bbio_ret, int mirror_num)  { -	return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, +	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,  				 mirror_num);  } @@ -4238,10 +4704,116 @@ static noinline void schedule_bio(struct btrfs_root *root,  				   &device->work);  } +static int bio_size_ok(struct block_device *bdev, struct bio *bio, +		       sector_t sector) +{ +	struct bio_vec *prev; +	struct request_queue *q = bdev_get_queue(bdev); +	unsigned short max_sectors = queue_max_sectors(q); +	struct bvec_merge_data bvm = { +		.bi_bdev = bdev, +		.bi_sector = sector, +		.bi_rw = bio->bi_rw, +	}; + +	if (bio->bi_vcnt == 0) { +		WARN_ON(1); +		return 1; +	} + +	prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; +	if ((bio->bi_size >> 9) > max_sectors) +		return 0; + +	if (!q->merge_bvec_fn) +		return 1; + +	bvm.bi_size = bio->bi_size - prev->bv_len; +	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) +		return 0; +	return 1; +} + +static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, +			      struct bio *bio, u64 physical, int dev_nr, +			      int rw, int async) +{ +	struct btrfs_device *dev = bbio->stripes[dev_nr].dev; + +	bio->bi_private = bbio; +	bio->bi_private = merge_stripe_index_into_bio_private( +			bio->bi_private, (unsigned int)dev_nr); +	bio->bi_end_io = btrfs_end_bio; +	bio->bi_sector = physical >> 9; +#ifdef DEBUG +	{ +		struct rcu_string *name; + +		rcu_read_lock(); +		name = rcu_dereference(dev->name); +		pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " +			 "(%s id %llu), size=%u\n", rw, +			 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, +			 name->str, dev->devid, bio->bi_size); +		rcu_read_unlock(); +	} +#endif +	bio->bi_bdev = dev->bdev; +	if (async) +		schedule_bio(root, dev, rw, bio); +	else +		btrfsic_submit_bio(rw, bio); +} + +static 
int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, +			      struct bio *first_bio, struct btrfs_device *dev, +			      int dev_nr, int rw, int async) +{ +	struct bio_vec *bvec = first_bio->bi_io_vec; +	struct bio *bio; +	int nr_vecs = bio_get_nr_vecs(dev->bdev); +	u64 physical = bbio->stripes[dev_nr].physical; + +again: +	bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); +	if (!bio) +		return -ENOMEM; + +	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { +		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, +				 bvec->bv_offset) < bvec->bv_len) { +			u64 len = bio->bi_size; + +			atomic_inc(&bbio->stripes_pending); +			submit_stripe_bio(root, bbio, bio, physical, dev_nr, +					  rw, async); +			physical += len; +			goto again; +		} +		bvec++; +	} + +	submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); +	return 0; +} + +static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) +{ +	atomic_inc(&bbio->error); +	if (atomic_dec_and_test(&bbio->stripes_pending)) { +		bio->bi_private = bbio->private; +		bio->bi_end_io = bbio->end_io; +		bio->bi_bdev = (struct block_device *) +			(unsigned long)bbio->mirror_num; +		bio->bi_sector = logical >> 9; +		kfree(bbio); +		bio_endio(bio, -EIO); +	} +} +  int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  		  int mirror_num, int async_submit)  { -	struct btrfs_mapping_tree *map_tree;  	struct btrfs_device *dev;  	struct bio *first_bio = bio;  	u64 logical = (u64)bio->bi_sector << 9; @@ -4253,12 +4825,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  	struct btrfs_bio *bbio = NULL;  	length = bio->bi_size; -	map_tree = &root->fs_info->mapping_tree;  	map_length = length; -	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, +	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,  			      mirror_num); -	if (ret) /* -ENOMEM */ +	if (ret)  		return ret;  	total_devs = bbio->num_stripes; @@ -4276,52 +4847,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,  	atomic_set(&bbio->stripes_pending, bbio->num_stripes);  	while (dev_nr < total_devs) { +		dev = bbio->stripes[dev_nr].dev; +		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { +			bbio_error(bbio, first_bio, logical); +			dev_nr++; +			continue; +		} + +		/* +		 * Check and see if we're ok with this bio based on its size +		 * and offset with the given device. 
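+		 * If it exceeds those limits, breakup_stripe_bio() splits it
+		 * into pieces the device can accept.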
+		 */ +		if (!bio_size_ok(dev->bdev, first_bio, +				 bbio->stripes[dev_nr].physical >> 9)) { +			ret = breakup_stripe_bio(root, bbio, first_bio, dev, +						 dev_nr, rw, async_submit); +			BUG_ON(ret); +			dev_nr++; +			continue; +		} +  		if (dev_nr < total_devs - 1) {  			bio = bio_clone(first_bio, GFP_NOFS);  			BUG_ON(!bio); /* -ENOMEM */  		} else {  			bio = first_bio;  		} -		bio->bi_private = bbio; -		bio->bi_private = merge_stripe_index_into_bio_private( -				bio->bi_private, (unsigned int)dev_nr); -		bio->bi_end_io = btrfs_end_bio; -		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; -		dev = bbio->stripes[dev_nr].dev; -		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { -#ifdef DEBUG -			struct rcu_string *name; - -			rcu_read_lock(); -			name = rcu_dereference(dev->name); -			pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " -				 "(%s id %llu), size=%u\n", rw, -				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, -				 name->str, dev->devid, bio->bi_size); -			rcu_read_unlock(); -#endif -			bio->bi_bdev = dev->bdev; -			if (async_submit) -				schedule_bio(root, dev, rw, bio); -			else -				btrfsic_submit_bio(rw, bio); -		} else { -			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; -			bio->bi_sector = logical >> 9; -			bio_endio(bio, -EIO); -		} + +		submit_stripe_bio(root, bbio, bio, +				  bbio->stripes[dev_nr].physical, dev_nr, rw, +				  async_submit);  		dev_nr++;  	}  	return 0;  } -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,  				       u8 *uuid, u8 *fsid)  {  	struct btrfs_device *device;  	struct btrfs_fs_devices *cur_devices; -	cur_devices = root->fs_info->fs_devices; +	cur_devices = fs_info->fs_devices;  	while (cur_devices) {  		if (!fsid ||  		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { @@ -4402,6 +4969,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,  	em->bdev = (struct block_device *)map;  	em->start = logical;  	em->len = length; +	em->orig_start = 0;  	em->block_start = 0;  	em->block_len = em->len; @@ -4419,8 +4987,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,  		read_extent_buffer(leaf, uuid, (unsigned long)  				   btrfs_stripe_dev_uuid_nr(chunk, i),  				   BTRFS_UUID_SIZE); -		map->stripes[i].dev = btrfs_find_device(root, devid, uuid, -							NULL); +		map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, +							uuid, NULL);  		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {  			kfree(map);  			free_extent_map(em); @@ -4461,6 +5029,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,  	device->io_align = btrfs_device_io_align(leaf, dev_item);  	device->io_width = btrfs_device_io_width(leaf, dev_item);  	device->sector_size = btrfs_device_sector_size(leaf, dev_item); +	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); +	device->is_tgtdev_for_dev_replace = 0;  	ptr = (unsigned long)btrfs_device_uuid(dev_item);  	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); @@ -4538,7 +5108,7 @@ static int read_one_dev(struct btrfs_root *root,  			return ret;  	} -	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); +	device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);  	if (!device || !device->bdev) {  		if (!btrfs_test_opt(root, DEGRADED))  			return -EIO; @@ -4571,7 +5141,7 @@ static int read_one_dev(struct btrfs_root *root,  	fill_device_from_item(leaf, dev_item, device);  	
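/*
 * Illustrative sketch, not from this patch: breakup_stripe_bio() above
 * splits an oversized bio by adding pages until bio_add_page() refuses
 * one, submitting the partial bio, and continuing with a fresh bio at
 * the advanced physical offset. In outline, with "capacity" standing in
 * for the device limit:
 *
 *	int i, in_bio = 0;
 *	unsigned long long physical = stripe_physical;
 *
 *	for (i = 0; i < total_pages; i++) {
 *		if (in_bio == capacity) {	// bio_add_page() failed
 *			submit(in_bio, physical);
 *			physical += in_bio * page_size;	// bytes sent
 *			in_bio = 0;			// the "goto again"
 *		}
 *		in_bio++;
 *	}
 *	submit(in_bio, physical);	// final, possibly short bio
 */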
device->dev_root = root->fs_info->dev_root;  	device->in_fs_metadata = 1; -	if (device->writeable) { +	if (device->writeable && !device->is_tgtdev_for_dev_replace) {  		device->fs_devices->total_rw_bytes += device->total_bytes;  		spin_lock(&root->fs_info->free_chunk_lock);  		root->fs_info->free_chunk_space += device->total_bytes - @@ -4930,7 +5500,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,  	int i;  	mutex_lock(&fs_devices->device_list_mutex); -	dev = btrfs_find_device(root, stats->devid, NULL, NULL); +	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);  	mutex_unlock(&fs_devices->device_list_mutex);  	if (!dev) { @@ -4958,3 +5528,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,  		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;  	return 0;  } + +int btrfs_scratch_superblock(struct btrfs_device *device) +{ +	struct buffer_head *bh; +	struct btrfs_super_block *disk_super; + +	bh = btrfs_read_dev_super(device->bdev); +	if (!bh) +		return -EINVAL; +	disk_super = (struct btrfs_super_block *)bh->b_data; + +	memset(&disk_super->magic, 0, sizeof(disk_super->magic)); +	set_buffer_dirty(bh); +	sync_dirty_buffer(bh); +	brelse(bh); + +	return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 53c06af92e8d..d3c3939ac751 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -50,6 +50,7 @@ struct btrfs_device {  	int in_fs_metadata;  	int missing;  	int can_discard; +	int is_tgtdev_for_dev_replace;  	spinlock_t io_lock; @@ -88,7 +89,7 @@ struct btrfs_device {  	u8 uuid[BTRFS_UUID_SIZE];  	/* per-device scrub information */ -	struct scrub_dev *scrub_device; +	struct scrub_ctx *scrub_device;  	struct btrfs_work work;  	struct rcu_head rcu; @@ -179,6 +180,15 @@ struct btrfs_device_info {  	u64 total_avail;  }; +struct btrfs_raid_attr { +	int sub_stripes;	/* sub_stripes info for map */ +	int dev_stripes;	/* stripes per dev */ +	int devs_max;		/* max devs to use */ +	int devs_min;		/* min devs needed */ +	int devs_increment;	/* ndevs has to be a multiple of this */ +	int ncopies;		/* how many copies the data has */ +}; +  struct map_lookup {  	u64 type;  	int io_align; @@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,  			   struct btrfs_device *device,  			   u64 chunk_tree, u64 chunk_objectid,  			   u64 chunk_offset, u64 start, u64 num_bytes); -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,  		    u64 logical, u64 *length,  		    struct btrfs_bio **bbio_ret, int mirror_num);  int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, @@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,  int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,  			  struct btrfs_fs_devices **fs_devices_ret);  int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, +			       struct btrfs_fs_devices *fs_devices, int step); +int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, +					 char *device_path, +					 struct btrfs_device **device); +int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, +			      struct btrfs_device **device);  int btrfs_add_device(struct btrfs_trans_handle *trans,  		     struct btrfs_root *root,  		     struct btrfs_device *device);  int btrfs_rm_device(struct btrfs_root *root, char *device_path);  void 
btrfs_cleanup_fs_uuids(void); -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);  int btrfs_grow_device(struct btrfs_trans_handle *trans,  		      struct btrfs_device *device, u64 new_size); -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, +struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,  				       u8 *uuid, u8 *fsid);  int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);  int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, +				  struct btrfs_device **device_out);  int btrfs_balance(struct btrfs_balance_control *bctl,  		  struct btrfs_ioctl_balance_args *bargs);  int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); @@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,  int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);  int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,  			struct btrfs_fs_info *fs_info); +void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, +				 struct btrfs_device *srcdev); +void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, +				      struct btrfs_device *tgtdev); +void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, +					      struct btrfs_device *tgtdev); +int btrfs_scratch_superblock(struct btrfs_device *device);  static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,  				      int index) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3f4e2d69e83a..446a6848c554 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,  		 */  		if (!value)  			goto out; +	} else { +		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), +					name, name_len, 0); +		if (IS_ERR(di)) { +			ret = PTR_ERR(di); +			goto out; +		} +		if (!di && !value) +			goto out; +		btrfs_release_path(path);  	}  again: @@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,  	inode_inc_iversion(inode);  	inode->i_ctime = CURRENT_TIME; +	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);  	ret = btrfs_update_inode(trans, root, inode);  	BUG_ON(ret);  out: @@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)  		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);  		if (verify_dir_item(root, leaf, di)) -			continue; +			goto next;  		name_len = btrfs_dir_name_len(leaf, di);  		total_size += name_len + 1;
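A hedged standalone sketch of the do_setxattr() probe added above: before taking the write path, the attribute is looked up once, so removing an xattr that does not exist returns early instead of failing deeper in the tree search. The toy single-slot store below mirrors only that early-out rule; none of the names are the kernel API.

#include <stdio.h>
#include <string.h>

static char stored_name[64];	/* toy store: at most one xattr */

static int xattr_exists(const char *name)
{
	return stored_name[0] && strcmp(stored_name, name) == 0;
}

static int toy_setxattr(const char *name, const char *value)
{
	if (!xattr_exists(name) && !value)
		return 0;	/* removing a missing xattr: early out */
	if (value)
		snprintf(stored_name, sizeof(stored_name), "%s", name);
	else
		stored_name[0] = '\0';	/* delete */
	return 0;
}

int main(void)
{
	toy_setxattr("user.test", NULL);	/* early out, nothing stored */
	toy_setxattr("user.test", "v");
	printf("exists: %d\n", xattr_exists("user.test"));
	return 0;
}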