diff options
Diffstat (limited to 'fs/btrfs')
| -rw-r--r-- | fs/btrfs/backref.c | 1 | ||||
| -rw-r--r-- | fs/btrfs/ctree.h | 5 | ||||
| -rw-r--r-- | fs/btrfs/delayed-ref.c | 7 | ||||
| -rw-r--r-- | fs/btrfs/disk-io.c | 56 | ||||
| -rw-r--r-- | fs/btrfs/disk-io.h | 2 | ||||
| -rw-r--r-- | fs/btrfs/extent-tree.c | 185 | ||||
| -rw-r--r-- | fs/btrfs/extent_io.h | 1 | ||||
| -rw-r--r-- | fs/btrfs/file.c | 28 | ||||
| -rw-r--r-- | fs/btrfs/inode-map.c | 3 | ||||
| -rw-r--r-- | fs/btrfs/inode.c | 37 | ||||
| -rw-r--r-- | fs/btrfs/ioctl.c | 2 | ||||
| -rw-r--r-- | fs/btrfs/qgroup.c | 62 | ||||
| -rw-r--r-- | fs/btrfs/qgroup.h | 36 | ||||
| -rw-r--r-- | fs/btrfs/relocation.c | 126 | ||||
| -rw-r--r-- | fs/btrfs/root-tree.c | 27 | ||||
| -rw-r--r-- | fs/btrfs/super.c | 16 | ||||
| -rw-r--r-- | fs/btrfs/transaction.c | 7 | ||||
| -rw-r--r-- | fs/btrfs/tree-log.c | 21 | ||||
| -rw-r--r-- | fs/btrfs/tree-log.h | 5 | ||||
| -rw-r--r-- | fs/btrfs/volumes.c | 27 | 
20 files changed, 473 insertions, 181 deletions
| diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2b88439c2ee8..455a6b2fd539 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -589,6 +589,7 @@ static void __merge_refs(struct list_head *head, int mode)  			list_del(&ref2->list);  			kmem_cache_free(btrfs_prelim_ref_cache, ref2); +			cond_resched();  		}  	} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2fe8f89091a3..eff3993c77b3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1028,6 +1028,7 @@ struct btrfs_fs_info {  	struct btrfs_workqueue *qgroup_rescan_workers;  	struct completion qgroup_rescan_completion;  	struct btrfs_work qgroup_rescan_work; +	bool qgroup_rescan_running;	/* protected by qgroup_rescan_lock */  	/* filesystem state */  	unsigned long fs_state; @@ -1079,6 +1080,8 @@ struct btrfs_fs_info {  	struct list_head pinned_chunks;  	int creating_free_space_tree; +	/* Used to record internally whether fs has been frozen */ +	int fs_frozen;  };  struct btrfs_subvolume_writers { @@ -2578,7 +2581,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,  				   struct btrfs_root *root,  				   u64 root_objectid, u64 owner, u64 offset,  				   struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,  			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,  			 struct btrfs_key *ins, int is_data, int delalloc);  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index d9ddcfc18c91..ac02e041464b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -541,7 +541,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,  	struct btrfs_delayed_ref_head *existing;  	struct btrfs_delayed_ref_head *head_ref = NULL;  	struct btrfs_delayed_ref_root *delayed_refs; -	struct btrfs_qgroup_extent_record *qexisting;  	int count_mod = 1;  	int must_insert_reserved = 0; @@ -606,10 +605,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,  		qrecord->num_bytes = num_bytes;  		qrecord->old_roots = NULL; -		qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, -							     delayed_refs, -							     qrecord); -		if (qexisting) +		if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info, +					delayed_refs, qrecord))  			kfree(qrecord);  	} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 59febfb8d04a..54bc8c7c6bcd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -559,8 +559,29 @@ static noinline int check_leaf(struct btrfs_root *root,  	u32 nritems = btrfs_header_nritems(leaf);  	int slot; -	if (nritems == 0) +	if (nritems == 0) { +		struct btrfs_root *check_root; + +		key.objectid = btrfs_header_owner(leaf); +		key.type = BTRFS_ROOT_ITEM_KEY; +		key.offset = (u64)-1; + +		check_root = btrfs_get_fs_root(root->fs_info, &key, false); +		/* +		 * The only reason we also check NULL here is that during +		 * open_ctree() some roots has not yet been set up. +		 */ +		if (!IS_ERR_OR_NULL(check_root)) { +			/* if leaf is the root, then it's fine */ +			if (leaf->start != +			    btrfs_root_bytenr(&check_root->root_item)) { +				CORRUPT("non-root leaf's nritems is 0", +					leaf, root, 0); +				return -EIO; +			} +		}  		return 0; +	}  	/* Check the 0 item */  	if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != @@ -612,6 +633,19 @@ static noinline int check_leaf(struct btrfs_root *root,  	return 0;  } +static int check_node(struct btrfs_root *root, struct extent_buffer *node) +{ +	unsigned long nr = btrfs_header_nritems(node); + +	if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) { +		btrfs_crit(root->fs_info, +			   "corrupt node: block %llu root %llu nritems %lu", +			   node->start, root->objectid, nr); +		return -EIO; +	} +	return 0; +} +  static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,  				      u64 phy_offset, struct page *page,  				      u64 start, u64 end, int mirror) @@ -682,6 +716,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,  		ret = -EIO;  	} +	if (found_level > 0 && check_node(root, eb)) +		ret = -EIO; +  	if (!ret)  		set_extent_buffer_uptodate(eb);  err: @@ -1618,8 +1655,8 @@ fail:  	return ret;  } -static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, -					       u64 root_id) +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, +					u64 root_id)  {  	struct btrfs_root *root; @@ -2298,6 +2335,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)  	fs_info->quota_enabled = 0;  	fs_info->pending_quota_state = 0;  	fs_info->qgroup_ulist = NULL; +	fs_info->qgroup_rescan_running = false;  	mutex_init(&fs_info->qgroup_rescan_lock);  } @@ -2624,6 +2662,7 @@ int open_ctree(struct super_block *sb,  	atomic_set(&fs_info->qgroup_op_seq, 0);  	atomic_set(&fs_info->reada_works_cnt, 0);  	atomic64_set(&fs_info->tree_mod_seq, 0); +	fs_info->fs_frozen = 0;  	fs_info->sb = sb;  	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;  	fs_info->metadata_ratio = 0; @@ -3739,8 +3778,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,  	if (btrfs_root_refs(&root->root_item) == 0)  		synchronize_srcu(&fs_info->subvol_srcu); -	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) +	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {  		btrfs_free_log(NULL, root); +		if (root->reloc_root) { +			free_extent_buffer(root->reloc_root->node); +			free_extent_buffer(root->reloc_root->commit_root); +			btrfs_put_fs_root(root->reloc_root); +			root->reloc_root = NULL; +		} +	}  	if (root->free_ino_pinned)  		__btrfs_remove_free_space_cache(root->free_ino_pinned); @@ -3851,7 +3897,7 @@ void close_ctree(struct btrfs_root *root)  	smp_mb();  	/* wait for the qgroup rescan worker to stop */ -	btrfs_qgroup_wait_for_completion(fs_info); +	btrfs_qgroup_wait_for_completion(fs_info, false);  	/* wait for the uuid_scan task to finish */  	down(&fs_info->uuid_tree_rescan_sem); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b3207a0e09f7..f19a982f5a4f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,  struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,  				      struct btrfs_key *location);  int btrfs_init_fs_root(struct btrfs_root *root); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, +					u64 root_id);  int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,  			 struct btrfs_root *root);  void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 61b494e8e604..0450dc410533 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -60,21 +60,6 @@ enum {  	CHUNK_ALLOC_FORCE = 2,  }; -/* - * Control how reservations are dealt with. - * - * RESERVE_FREE - freeing a reservation. - * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for - *   ENOSPC accounting - * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update - *   bytes_may_use as the ENOSPC accounting is done elsewhere - */ -enum { -	RESERVE_FREE = 0, -	RESERVE_ALLOC = 1, -	RESERVE_ALLOC_NO_ACCOUNT = 2, -}; -  static int update_block_group(struct btrfs_trans_handle *trans,  			      struct btrfs_root *root, u64 bytenr,  			      u64 num_bytes, int alloc); @@ -104,9 +89,10 @@ static int find_next_key(struct btrfs_path *path, int level,  			 struct btrfs_key *key);  static void dump_space_info(struct btrfs_space_info *info, u64 bytes,  			    int dump_block_groups); -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, -				       u64 num_bytes, int reserve, -				       int delalloc); +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, +				    u64 ram_bytes, u64 num_bytes, int delalloc); +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, +				     u64 num_bytes, int delalloc);  static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,  			       u64 num_bytes);  int btrfs_pin_extent(struct btrfs_root *root, @@ -3501,7 +3487,6 @@ again:  		dcs = BTRFS_DC_SETUP;  	else if (ret == -ENOSPC)  		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); -	btrfs_free_reserved_data_space(inode, 0, num_pages);  out_put:  	iput(inode); @@ -4472,6 +4457,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans,  	}  } +/* + * If force is CHUNK_ALLOC_FORCE: + *    - return 1 if it successfully allocates a chunk, + *    - return errors including -ENOSPC otherwise. + * If force is NOT CHUNK_ALLOC_FORCE: + *    - return 0 if it doesn't need to allocate a new chunk, + *    - return 1 if it successfully allocates a chunk, + *    - return errors including -ENOSPC otherwise. + */  static int do_chunk_alloc(struct btrfs_trans_handle *trans,  			  struct btrfs_root *extent_root, u64 flags, int force)  { @@ -4882,7 +4876,7 @@ static int flush_space(struct btrfs_root *root,  				     btrfs_get_alloc_profile(root, 0),  				     CHUNK_ALLOC_NO_FORCE);  		btrfs_end_transaction(trans, root); -		if (ret == -ENOSPC) +		if (ret > 0 || ret == -ENOSPC)  			ret = 0;  		break;  	case COMMIT_TRANS: @@ -6497,19 +6491,15 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)  }  /** - * btrfs_update_reserved_bytes - update the block_group and space info counters + * btrfs_add_reserved_bytes - update the block_group and space info counters   * @cache:	The cache we are manipulating + * @ram_bytes:  The number of bytes of file content, and will be same to + *              @num_bytes except for the compress path.   * @num_bytes:	The number of bytes in question - * @reserve:	One of the reservation enums   * @delalloc:   The blocks are allocated for the delalloc write   * - * This is called by the allocator when it reserves space, or by somebody who is - * freeing space that was never actually used on disk.  For example if you - * reserve some space for a new leaf in transaction A and before transaction A - * commits you free that leaf, you call this with reserve set to 0 in order to - * clear the reservation. - * - * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * This is called by the allocator when it reserves space. Metadata + * reservations should be called with RESERVE_ALLOC so we do the proper   * ENOSPC accounting.  For data we handle the reservation through clearing the   * delalloc bits in the io_tree.  We have to do this since we could end up   * allocating less disk space for the amount of data we have reserved in the @@ -6519,44 +6509,63 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)   * make the reservation and return -EAGAIN, otherwise this function always   * succeeds.   */ -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, -				       u64 num_bytes, int reserve, int delalloc) +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, +				    u64 ram_bytes, u64 num_bytes, int delalloc)  {  	struct btrfs_space_info *space_info = cache->space_info;  	int ret = 0;  	spin_lock(&space_info->lock);  	spin_lock(&cache->lock); -	if (reserve != RESERVE_FREE) { -		if (cache->ro) { -			ret = -EAGAIN; -		} else { -			cache->reserved += num_bytes; -			space_info->bytes_reserved += num_bytes; -			if (reserve == RESERVE_ALLOC) { -				trace_btrfs_space_reservation(cache->fs_info, -						"space_info", space_info->flags, -						num_bytes, 0); -				space_info->bytes_may_use -= num_bytes; -			} - -			if (delalloc) -				cache->delalloc_bytes += num_bytes; -		} +	if (cache->ro) { +		ret = -EAGAIN;  	} else { -		if (cache->ro) -			space_info->bytes_readonly += num_bytes; -		cache->reserved -= num_bytes; -		space_info->bytes_reserved -= num_bytes; +		cache->reserved += num_bytes; +		space_info->bytes_reserved += num_bytes; +		trace_btrfs_space_reservation(cache->fs_info, +				"space_info", space_info->flags, +				ram_bytes, 0); +		space_info->bytes_may_use -= ram_bytes;  		if (delalloc) -			cache->delalloc_bytes -= num_bytes; +			cache->delalloc_bytes += num_bytes;  	}  	spin_unlock(&cache->lock);  	spin_unlock(&space_info->lock);  	return ret;  } +/** + * btrfs_free_reserved_bytes - update the block_group and space info counters + * @cache:      The cache we are manipulating + * @num_bytes:  The number of bytes in question + * @delalloc:   The blocks are allocated for the delalloc write + * + * This is called by somebody who is freeing space that was never actually used + * on disk.  For example if you reserve some space for a new leaf in transaction + * A and before transaction A commits you free that leaf, you call this with + * reserve set to 0 in order to clear the reservation. + */ + +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, +				     u64 num_bytes, int delalloc) +{ +	struct btrfs_space_info *space_info = cache->space_info; +	int ret = 0; + +	spin_lock(&space_info->lock); +	spin_lock(&cache->lock); +	if (cache->ro) +		space_info->bytes_readonly += num_bytes; +	cache->reserved -= num_bytes; +	space_info->bytes_reserved -= num_bytes; + +	if (delalloc) +		cache->delalloc_bytes -= num_bytes; +	spin_unlock(&cache->lock); +	spin_unlock(&space_info->lock); +	return ret; +}  void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,  				struct btrfs_root *root)  { @@ -7191,7 +7200,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,  		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));  		btrfs_add_free_space(cache, buf->start, buf->len); -		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); +		btrfs_free_reserved_bytes(cache, buf->len, 0);  		btrfs_put_block_group(cache);  		trace_btrfs_reserved_extent_free(root, buf->start, buf->len);  		pin = 0; @@ -7416,9 +7425,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,   * the free space extent currently.   */  static noinline int find_free_extent(struct btrfs_root *orig_root, -				     u64 num_bytes, u64 empty_size, -				     u64 hint_byte, struct btrfs_key *ins, -				     u64 flags, int delalloc) +				u64 ram_bytes, u64 num_bytes, u64 empty_size, +				u64 hint_byte, struct btrfs_key *ins, +				u64 flags, int delalloc)  {  	int ret = 0;  	struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -7430,8 +7439,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,  	struct btrfs_space_info *space_info;  	int loop = 0;  	int index = __get_raid_index(flags); -	int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? -		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;  	bool failed_cluster_refill = false;  	bool failed_alloc = false;  	bool use_cluster = true; @@ -7763,8 +7770,8 @@ checks:  					     search_start - offset);  		BUG_ON(offset > search_start); -		ret = btrfs_update_reserved_bytes(block_group, num_bytes, -						  alloc_type, delalloc); +		ret = btrfs_add_reserved_bytes(block_group, ram_bytes, +				num_bytes, delalloc);  		if (ret == -EAGAIN) {  			btrfs_add_free_space(block_group, offset, num_bytes);  			goto loop; @@ -7936,7 +7943,7 @@ again:  	up_read(&info->groups_sem);  } -int btrfs_reserve_extent(struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,  			 u64 num_bytes, u64 min_alloc_size,  			 u64 empty_size, u64 hint_byte,  			 struct btrfs_key *ins, int is_data, int delalloc) @@ -7948,8 +7955,8 @@ int btrfs_reserve_extent(struct btrfs_root *root,  	flags = btrfs_get_alloc_profile(root, is_data);  again:  	WARN_ON(num_bytes < root->sectorsize); -	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, -			       flags, delalloc); +	ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, +			       hint_byte, ins, flags, delalloc);  	if (!ret && !is_data) {  		btrfs_dec_block_group_reservations(root->fs_info,  						   ins->objectid); @@ -7958,6 +7965,7 @@ again:  			num_bytes = min(num_bytes >> 1, ins->offset);  			num_bytes = round_down(num_bytes, root->sectorsize);  			num_bytes = max(num_bytes, min_alloc_size); +			ram_bytes = num_bytes;  			if (num_bytes == min_alloc_size)  				final_tried = true;  			goto again; @@ -7995,7 +8003,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,  		if (btrfs_test_opt(root->fs_info, DISCARD))  			ret = btrfs_discard_extent(root, start, len, NULL);  		btrfs_add_free_space(cache, start, len); -		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); +		btrfs_free_reserved_bytes(cache, len, delalloc);  		trace_btrfs_reserved_extent_free(root, start, len);  	} @@ -8223,8 +8231,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,  	if (!block_group)  		return -EINVAL; -	ret = btrfs_update_reserved_bytes(block_group, ins->offset, -					  RESERVE_ALLOC_NO_ACCOUNT, 0); +	ret = btrfs_add_reserved_bytes(block_group, ins->offset, +				       ins->offset, 0);  	BUG_ON(ret); /* logic error */  	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,  					 0, owner, offset, ins, 1); @@ -8368,7 +8376,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,  	if (IS_ERR(block_rsv))  		return ERR_CAST(block_rsv); -	ret = btrfs_reserve_extent(root, blocksize, blocksize, +	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,  				   empty_size, hint, &ins, 0, 0);  	if (ret)  		goto out_unuse; @@ -8521,35 +8529,6 @@ reada:  	wc->reada_slot = slot;  } -/* - * These may not be seen by the usual inc/dec ref code so we have to - * add them here. - */ -static int record_one_subtree_extent(struct btrfs_trans_handle *trans, -				     struct btrfs_root *root, u64 bytenr, -				     u64 num_bytes) -{ -	struct btrfs_qgroup_extent_record *qrecord; -	struct btrfs_delayed_ref_root *delayed_refs; - -	qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS); -	if (!qrecord) -		return -ENOMEM; - -	qrecord->bytenr = bytenr; -	qrecord->num_bytes = num_bytes; -	qrecord->old_roots = NULL; - -	delayed_refs = &trans->transaction->delayed_refs; -	spin_lock(&delayed_refs->lock); -	if (btrfs_qgroup_insert_dirty_extent(trans->fs_info, -					     delayed_refs, qrecord)) -		kfree(qrecord); -	spin_unlock(&delayed_refs->lock); - -	return 0; -} -  static int account_leaf_items(struct btrfs_trans_handle *trans,  			      struct btrfs_root *root,  			      struct extent_buffer *eb) @@ -8583,7 +8562,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,  		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); -		ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); +		ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, +				bytenr, num_bytes, GFP_NOFS);  		if (ret)  			return ret;  	} @@ -8732,8 +8712,9 @@ walk_down:  			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);  			path->locks[level] = BTRFS_READ_LOCK_BLOCKING; -			ret = record_one_subtree_extent(trans, root, child_bytenr, -							root->nodesize); +			ret = btrfs_qgroup_insert_dirty_extent(trans, +					root->fs_info, child_bytenr, +					root->nodesize, GFP_NOFS);  			if (ret)  				goto out;  		} @@ -9906,6 +9887,7 @@ static int find_first_block_group(struct btrfs_root *root,  			} else {  				ret = 0;  			} +			free_extent_map(em);  			goto out;  		}  		path->slots[0]++; @@ -9942,6 +9924,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)  		block_group->iref = 0;  		block_group->inode = NULL;  		spin_unlock(&block_group->lock); +		ASSERT(block_group->io_ctl.inode == NULL);  		iput(inode);  		last = block_group->key.objectid + block_group->key.offset;  		btrfs_put_block_group(block_group); @@ -9999,6 +9982,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)  			free_excluded_extents(info->extent_root, block_group);  		btrfs_remove_free_space_cache(block_group); +		ASSERT(list_empty(&block_group->dirty_list)); +		ASSERT(list_empty(&block_group->io_list)); +		ASSERT(list_empty(&block_group->bg_list)); +		ASSERT(atomic_read(&block_group->count) == 1);  		btrfs_put_block_group(block_group);  		spin_lock(&info->block_group_cache_lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bc2729a7612d..28cd88fccc7e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,6 +20,7 @@  #define EXTENT_DAMAGED		(1U << 14)  #define EXTENT_NORESERVE	(1U << 15)  #define EXTENT_QGROUP_RESERVED	(1U << 16) +#define EXTENT_CLEAR_DATA_RESV	(1U << 17)  #define EXTENT_IOBITS		(EXTENT_LOCKED | EXTENT_WRITEBACK)  #define EXTENT_CTLBITS		(EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5842423f8f47..fea31a4a6e36 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2070,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)  	}  	trans->sync = true; -	btrfs_init_log_ctx(&ctx); +	btrfs_init_log_ctx(&ctx, inode);  	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);  	if (ret < 0) { @@ -2675,6 +2675,7 @@ static long btrfs_fallocate(struct file *file, int mode,  	alloc_start = round_down(offset, blocksize);  	alloc_end = round_up(offset + len, blocksize); +	cur_offset = alloc_start;  	/* Make sure we aren't being give some crap mode */  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2767,7 +2768,6 @@ static long btrfs_fallocate(struct file *file, int mode,  	/* First, check if we exceed the qgroup limit */  	INIT_LIST_HEAD(&reserve_list); -	cur_offset = alloc_start;  	while (1) {  		em = btrfs_get_extent(inode, NULL, 0, cur_offset,  				      alloc_end - cur_offset, 0); @@ -2794,6 +2794,14 @@ static long btrfs_fallocate(struct file *file, int mode,  					last_byte - cur_offset);  			if (ret < 0)  				break; +		} else { +			/* +			 * Do not need to reserve unwritten extent for this +			 * range, free reserved data space first, otherwise +			 * it'll result in false ENOSPC error. +			 */ +			btrfs_free_reserved_data_space(inode, cur_offset, +				last_byte - cur_offset);  		}  		free_extent_map(em);  		cur_offset = last_byte; @@ -2811,6 +2819,9 @@ static long btrfs_fallocate(struct file *file, int mode,  					range->start,  					range->len, 1 << inode->i_blkbits,  					offset + len, &alloc_hint); +		else +			btrfs_free_reserved_data_space(inode, range->start, +						       range->len);  		list_del(&range->list);  		kfree(range);  	} @@ -2845,18 +2856,11 @@ out_unlock:  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,  			     &cached_state, GFP_KERNEL);  out: -	/* -	 * As we waited the extent range, the data_rsv_map must be empty -	 * in the range, as written data range will be released from it. -	 * And for prealloacted extent, it will also be released when -	 * its metadata is written. -	 * So this is completely used as cleanup. -	 */ -	btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);  	inode_unlock(inode);  	/* Let go of our reservation. */ -	btrfs_free_reserved_data_space(inode, alloc_start, -				       alloc_end - alloc_start); +	if (ret != 0) +		btrfs_free_reserved_data_space(inode, alloc_start, +				       alloc_end - cur_offset);  	return ret;  } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index aa6fabaee72e..359ee861b5a4 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -495,10 +495,9 @@ again:  	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,  					      prealloc, prealloc, &alloc_hint);  	if (ret) { -		btrfs_delalloc_release_space(inode, 0, prealloc); +		btrfs_delalloc_release_metadata(inode, prealloc);  		goto out_put;  	} -	btrfs_free_reserved_data_space(inode, 0, prealloc);  	ret = btrfs_write_out_ino_cache(root, trans, path, inode);  out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 08dfc57e2270..e6811c42e41e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -566,6 +566,8 @@ cont:  						     PAGE_SET_WRITEBACK |  						     page_error_op |  						     PAGE_END_WRITEBACK); +			btrfs_free_reserved_data_space_noquota(inode, start, +						end - start + 1);  			goto free_pages_out;  		}  	} @@ -742,7 +744,7 @@ retry:  		lock_extent(io_tree, async_extent->start,  			    async_extent->start + async_extent->ram_size - 1); -		ret = btrfs_reserve_extent(root, +		ret = btrfs_reserve_extent(root, async_extent->ram_size,  					   async_extent->compressed_size,  					   async_extent->compressed_size,  					   0, alloc_hint, &ins, 1, 1); @@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode,  				     EXTENT_DEFRAG, PAGE_UNLOCK |  				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |  				     PAGE_END_WRITEBACK); - +			btrfs_free_reserved_data_space_noquota(inode, start, +						end - start + 1);  			*nr_written = *nr_written +  			     (end - start + PAGE_SIZE) / PAGE_SIZE;  			*page_started = 1; @@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode,  		unsigned long op;  		cur_alloc_size = disk_num_bytes; -		ret = btrfs_reserve_extent(root, cur_alloc_size, +		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,  					   root->sectorsize, 0, alloc_hint,  					   &ins, 1, 1);  		if (ret < 0) @@ -1489,8 +1492,10 @@ out_check:  		extent_clear_unlock_delalloc(inode, cur_offset,  					     cur_offset + num_bytes - 1,  					     locked_page, EXTENT_LOCKED | -					     EXTENT_DELALLOC, PAGE_UNLOCK | -					     PAGE_SET_PRIVATE2); +					     EXTENT_DELALLOC | +					     EXTENT_CLEAR_DATA_RESV, +					     PAGE_UNLOCK | PAGE_SET_PRIVATE2); +  		if (!nolock && nocow)  			btrfs_end_write_no_snapshoting(root);  		cur_offset = extent_end; @@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode,  			return;  		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID -		    && do_list && !(state->state & EXTENT_NORESERVE)) +		    && do_list && !(state->state & EXTENT_NORESERVE) +		    && (*bits & (EXTENT_DO_ACCOUNTING | +		    EXTENT_CLEAR_DATA_RESV)))  			btrfs_free_reserved_data_space_noquota(inode,  					state->start, len); @@ -7251,7 +7258,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,  	int ret;  	alloc_hint = get_extent_allocation_hint(inode, start, len); -	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, +	ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,  				   alloc_hint, &ins, 1, 1);  	if (ret)  		return ERR_PTR(ret); @@ -7751,6 +7758,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  				ret = PTR_ERR(em2);  				goto unlock_err;  			} +			/* +			 * For inode marked NODATACOW or extent marked PREALLOC, +			 * use the existing or preallocated extent, so does not +			 * need to adjust btrfs_space_info's bytes_may_use. +			 */ +			btrfs_free_reserved_data_space_noquota(inode, +					start, len);  			goto unlock;  		}  	} @@ -7785,7 +7799,6 @@ unlock:  			i_size_write(inode, start + len);  		adjust_dio_outstanding_extents(inode, dio_data, len); -		btrfs_free_reserved_data_space(inode, start, len);  		WARN_ON(dio_data->reserve < len);  		dio_data->reserve -= len;  		dio_data->unsubmitted_oe_range_end = start + len; @@ -10306,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  	u64 last_alloc = (u64)-1;  	int ret = 0;  	bool own_trans = true; +	u64 end = start + num_bytes - 1;  	if (trans)  		own_trans = false; @@ -10327,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  		 * sized chunks.  		 */  		cur_bytes = min(cur_bytes, last_alloc); -		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, -					   *alloc_hint, &ins, 1, 0); +		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, +				min_size, 0, *alloc_hint, &ins, 1, 0);  		if (ret) {  			if (own_trans)  				btrfs_end_transaction(trans, root); @@ -10414,6 +10428,9 @@ next:  		if (own_trans)  			btrfs_end_transaction(trans, root);  	} +	if (cur_offset < end) +		btrfs_free_reserved_data_space(inode, cur_offset, +			end - cur_offset + 1);  	return ret;  } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14ed1e9e6bc8..b2a2da5893af 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5084,7 +5084,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	return btrfs_qgroup_wait_for_completion(root->fs_info); +	return btrfs_qgroup_wait_for_completion(root->fs_info, true);  }  static long _btrfs_ioctl_set_received_subvol(struct file *file, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 93ee1c18ef9d..8db2e29fdcf4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,  		goto out;  	fs_info->quota_enabled = 0;  	fs_info->pending_quota_state = 0; -	btrfs_qgroup_wait_for_completion(fs_info); +	btrfs_qgroup_wait_for_completion(fs_info, false);  	spin_lock(&fs_info->qgroup_lock);  	quota_root = fs_info->quota_root;  	fs_info->quota_root = NULL; @@ -1453,10 +1453,9 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,  	return ret;  } -struct btrfs_qgroup_extent_record * -btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, -				 struct btrfs_delayed_ref_root *delayed_refs, -				 struct btrfs_qgroup_extent_record *record) +int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info, +				struct btrfs_delayed_ref_root *delayed_refs, +				struct btrfs_qgroup_extent_record *record)  {  	struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;  	struct rb_node *parent_node = NULL; @@ -1475,12 +1474,42 @@ btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,  		else if (bytenr > entry->bytenr)  			p = &(*p)->rb_right;  		else -			return entry; +			return 1;  	}  	rb_link_node(&record->node, parent_node, p);  	rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); -	return NULL; +	return 0; +} + +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, +		struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, +		gfp_t gfp_flag) +{ +	struct btrfs_qgroup_extent_record *record; +	struct btrfs_delayed_ref_root *delayed_refs; +	int ret; + +	if (!fs_info->quota_enabled || bytenr == 0 || num_bytes == 0) +		return 0; +	if (WARN_ON(trans == NULL)) +		return -EINVAL; +	record = kmalloc(sizeof(*record), gfp_flag); +	if (!record) +		return -ENOMEM; + +	delayed_refs = &trans->transaction->delayed_refs; +	record->bytenr = bytenr; +	record->num_bytes = num_bytes; +	record->old_roots = NULL; + +	spin_lock(&delayed_refs->lock); +	ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs, +						      record); +	spin_unlock(&delayed_refs->lock); +	if (ret > 0) +		kfree(record); +	return 0;  }  #define UPDATE_NEW	0 @@ -2303,6 +2332,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)  	int err = -ENOMEM;  	int ret = 0; +	mutex_lock(&fs_info->qgroup_rescan_lock); +	fs_info->qgroup_rescan_running = true; +	mutex_unlock(&fs_info->qgroup_rescan_lock); +  	path = btrfs_alloc_path();  	if (!path)  		goto out; @@ -2369,6 +2402,9 @@ out:  	}  done: +	mutex_lock(&fs_info->qgroup_rescan_lock); +	fs_info->qgroup_rescan_running = false; +	mutex_unlock(&fs_info->qgroup_rescan_lock);  	complete_all(&fs_info->qgroup_rescan_completion);  } @@ -2487,20 +2523,26 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)  	return 0;  } -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, +				     bool interruptible)  {  	int running;  	int ret = 0;  	mutex_lock(&fs_info->qgroup_rescan_lock);  	spin_lock(&fs_info->qgroup_lock); -	running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; +	running = fs_info->qgroup_rescan_running;  	spin_unlock(&fs_info->qgroup_lock);  	mutex_unlock(&fs_info->qgroup_rescan_lock); -	if (running) +	if (!running) +		return 0; + +	if (interruptible)  		ret = wait_for_completion_interruptible(  					&fs_info->qgroup_rescan_completion); +	else +		wait_for_completion(&fs_info->qgroup_rescan_completion);  	return ret;  } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 710887c06aaf..1bc64c864b62 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,  			struct btrfs_fs_info *fs_info);  int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);  void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, +				     bool interruptible);  int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,  			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);  int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, @@ -63,10 +64,35 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);  struct btrfs_delayed_extent_op;  int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,  					 struct btrfs_fs_info *fs_info); -struct btrfs_qgroup_extent_record * -btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, -				 struct btrfs_delayed_ref_root *delayed_refs, -				 struct btrfs_qgroup_extent_record *record); +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * No lock version, caller must acquire delayed ref lock and allocate memory. + * + * Return 0 for success insert + * Return >0 for existing record, caller can free @record safely. + * Error is not possible + */ +int btrfs_qgroup_insert_dirty_extent_nolock( +		struct btrfs_fs_info *fs_info, +		struct btrfs_delayed_ref_root *delayed_refs, +		struct btrfs_qgroup_extent_record *record); + +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * Better encapsulated version. + * + * Return 0 if the operation is done. + * Return <0 for error, like memory allocation failure or invalid parameter + * (NULL trans) + */ +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, +		struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, +		gfp_t gfp_flag); +  int  btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,  			    struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b26a5aea41b4..8a2c2a07987b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -31,6 +31,7 @@  #include "async-thread.h"  #include "free-space-cache.h"  #include "inode-map.h" +#include "qgroup.h"  /*   * backref_node, mapping_node and tree_block start with this @@ -3037,15 +3038,19 @@ int prealloc_file_extent_cluster(struct inode *inode,  	u64 num_bytes;  	int nr = 0;  	int ret = 0; +	u64 prealloc_start = cluster->start - offset; +	u64 prealloc_end = cluster->end - offset; +	u64 cur_offset;  	BUG_ON(cluster->start != cluster->boundary[0]);  	inode_lock(inode); -	ret = btrfs_check_data_free_space(inode, cluster->start, -					  cluster->end + 1 - cluster->start); +	ret = btrfs_check_data_free_space(inode, prealloc_start, +					  prealloc_end + 1 - prealloc_start);  	if (ret)  		goto out; +	cur_offset = prealloc_start;  	while (nr < cluster->nr) {  		start = cluster->boundary[nr] - offset;  		if (nr + 1 < cluster->nr) @@ -3055,16 +3060,21 @@ int prealloc_file_extent_cluster(struct inode *inode,  		lock_extent(&BTRFS_I(inode)->io_tree, start, end);  		num_bytes = end + 1 - start; +		if (cur_offset < start) +			btrfs_free_reserved_data_space(inode, cur_offset, +					start - cur_offset);  		ret = btrfs_prealloc_file_range(inode, 0, start,  						num_bytes, num_bytes,  						end + 1, &alloc_hint); +		cur_offset = end + 1;  		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);  		if (ret)  			break;  		nr++;  	} -	btrfs_free_reserved_data_space(inode, cluster->start, -				       cluster->end + 1 - cluster->start); +	if (cur_offset < prealloc_end) +		btrfs_free_reserved_data_space(inode, cur_offset, +				       prealloc_end + 1 - cur_offset);  out:  	inode_unlock(inode);  	return ret; @@ -3916,6 +3926,90 @@ int prepare_to_relocate(struct reloc_control *rc)  	return 0;  } +/* + * Qgroup fixer for data chunk relocation. + * The data relocation is done in the following steps + * 1) Copy data extents into data reloc tree + * 2) Create tree reloc tree(special snapshot) for related subvolumes + * 3) Modify file extents in tree reloc tree + * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks + * + * The problem is, data and tree reloc tree are not accounted to qgroup, + * and 4) will only info qgroup to track tree blocks change, not file extents + * in the tree blocks. + * + * The good news is, related data extents are all in data reloc tree, so we + * only need to info qgroup to track all file extents in data reloc tree + * before commit trans. + */ +static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans, +					     struct reloc_control *rc) +{ +	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; +	struct inode *inode = rc->data_inode; +	struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root; +	struct btrfs_path *path; +	struct btrfs_key key; +	int ret = 0; + +	if (!fs_info->quota_enabled) +		return 0; + +	/* +	 * Only for stage where we update data pointers the qgroup fix is +	 * valid. +	 * For MOVING_DATA stage, we will miss the timing of swapping tree +	 * blocks, and won't fix it. +	 */ +	if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found)) +		return 0; + +	path = btrfs_alloc_path(); +	if (!path) +		return -ENOMEM; +	key.objectid = btrfs_ino(inode); +	key.type = BTRFS_EXTENT_DATA_KEY; +	key.offset = 0; + +	ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0); +	if (ret < 0) +		goto out; + +	lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1); +	while (1) { +		struct btrfs_file_extent_item *fi; + +		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); +		if (key.objectid > btrfs_ino(inode)) +			break; +		if (key.type != BTRFS_EXTENT_DATA_KEY) +			goto next; +		fi = btrfs_item_ptr(path->nodes[0], path->slots[0], +				    struct btrfs_file_extent_item); +		if (btrfs_file_extent_type(path->nodes[0], fi) != +				BTRFS_FILE_EXTENT_REG) +			goto next; +		ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info, +			btrfs_file_extent_disk_bytenr(path->nodes[0], fi), +			btrfs_file_extent_disk_num_bytes(path->nodes[0], fi), +			GFP_NOFS); +		if (ret < 0) +			break; +next: +		ret = btrfs_next_item(data_reloc_root, path); +		if (ret < 0) +			break; +		if (ret > 0) { +			ret = 0; +			break; +		} +	} +	unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1); +out: +	btrfs_free_path(path); +	return ret; +} +  static noinline_for_stack int relocate_block_group(struct reloc_control *rc)  {  	struct rb_root blocks = RB_ROOT; @@ -4102,10 +4196,16 @@ restart:  	/* get rid of pinned extents */  	trans = btrfs_join_transaction(rc->extent_root); -	if (IS_ERR(trans)) +	if (IS_ERR(trans)) {  		err = PTR_ERR(trans); -	else -		btrfs_commit_transaction(trans, rc->extent_root); +		goto out_free; +	} +	err = qgroup_fix_relocated_data_extents(trans, rc); +	if (err < 0) { +		btrfs_abort_transaction(trans, err); +		goto out_free; +	} +	btrfs_commit_transaction(trans, rc->extent_root);  out_free:  	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);  	btrfs_free_path(path); @@ -4468,10 +4568,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)  	unset_reloc_control(rc);  	trans = btrfs_join_transaction(rc->extent_root); -	if (IS_ERR(trans)) +	if (IS_ERR(trans)) {  		err = PTR_ERR(trans); -	else -		err = btrfs_commit_transaction(trans, rc->extent_root); +		goto out_free; +	} +	err = qgroup_fix_relocated_data_extents(trans, rc); +	if (err < 0) { +		btrfs_abort_transaction(trans, err); +		goto out_free; +	} +	err = btrfs_commit_transaction(trans, rc->extent_root);  out_free:  	kfree(rc);  out: diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7fd7e1830cfe..091296062456 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)  		root_key.objectid = key.offset;  		key.offset++; +		/* +		 * The root might have been inserted already, as before we look +		 * for orphan roots, log replay might have happened, which +		 * triggers a transaction commit and qgroup accounting, which +		 * in turn reads and inserts fs roots while doing backref +		 * walking. +		 */ +		root = btrfs_lookup_fs_root(tree_root->fs_info, +					    root_key.objectid); +		if (root) { +			WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, +					  &root->state)); +			if (btrfs_root_refs(&root->root_item) == 0) +				btrfs_add_dead_root(root); +			continue; +		} +  		root = btrfs_read_fs_root(tree_root, &root_key);  		err = PTR_ERR_OR_ZERO(root);  		if (err && err != -ENOENT) { @@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)  		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);  		err = btrfs_insert_fs_root(root->fs_info, root); -		/* -		 * The root might have been inserted already, as before we look -		 * for orphan roots, log replay might have happened, which -		 * triggers a transaction commit and qgroup accounting, which -		 * in turn reads and inserts fs roots while doing backref -		 * walking. -		 */ -		if (err == -EEXIST) -			err = 0;  		if (err) { +			BUG_ON(err == -EEXIST);  			btrfs_free_fs_root(root);  			break;  		} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 864ce334f696..4071fe2bd098 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2241,6 +2241,13 @@ static int btrfs_freeze(struct super_block *sb)  	struct btrfs_trans_handle *trans;  	struct btrfs_root *root = btrfs_sb(sb)->tree_root; +	root->fs_info->fs_frozen = 1; +	/* +	 * We don't need a barrier here, we'll wait for any transaction that +	 * could be in progress on other threads (and do delayed iputs that +	 * we want to avoid on a frozen filesystem), or do the commit +	 * ourselves. +	 */  	trans = btrfs_attach_transaction_barrier(root);  	if (IS_ERR(trans)) {  		/* no transaction, don't bother */ @@ -2251,6 +2258,14 @@ static int btrfs_freeze(struct super_block *sb)  	return btrfs_commit_transaction(trans, root);  } +static int btrfs_unfreeze(struct super_block *sb) +{ +	struct btrfs_root *root = btrfs_sb(sb)->tree_root; + +	root->fs_info->fs_frozen = 0; +	return 0; +} +  static int btrfs_show_devname(struct seq_file *m, struct dentry *root)  {  	struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); @@ -2299,6 +2314,7 @@ static const struct super_operations btrfs_super_ops = {  	.statfs		= btrfs_statfs,  	.remount_fs	= btrfs_remount,  	.freeze_fs	= btrfs_freeze, +	.unfreeze_fs	= btrfs_unfreeze,  };  static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9cca0a721961..95d41919d034 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2278,8 +2278,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,  	kmem_cache_free(btrfs_trans_handle_cachep, trans); +	/* +	 * If fs has been frozen, we can not handle delayed iputs, otherwise +	 * it'll result in deadlock about SB_FREEZE_FS. +	 */  	if (current != root->fs_info->transaction_kthread && -	    current != root->fs_info->cleaner_kthread) +	    current != root->fs_info->cleaner_kthread && +	    !root->fs_info->fs_frozen)  		btrfs_run_delayed_iputs(root);  	return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fff3f3efa436..e935035ac034 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@  #include "backref.h"  #include "hash.h"  #include "compression.h" +#include "qgroup.h"  /* magic values for the inode_only field in btrfs_log_inode:   * @@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,  		ins.type = BTRFS_EXTENT_ITEM_KEY;  		offset = key->offset - btrfs_file_extent_offset(eb, item); +		/* +		 * Manually record dirty extent, as here we did a shallow +		 * file extent item copy and skip normal backref update, +		 * but modifying extent tree all by ourselves. +		 * So need to manually record dirty extent for qgroup, +		 * as the owner of the file extent changed from log tree +		 * (doesn't affect qgroup) to fs/file tree(affects qgroup) +		 */ +		ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, +				btrfs_file_extent_disk_bytenr(eb, item), +				btrfs_file_extent_disk_num_bytes(eb, item), +				GFP_NOFS); +		if (ret < 0) +			goto out; +  		if (ins.objectid > 0) {  			u64 csum_start;  			u64 csum_end; @@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,  	 */  	mutex_unlock(&root->log_mutex); -	btrfs_init_log_ctx(&root_log_ctx); +	btrfs_init_log_ctx(&root_log_ctx, NULL);  	mutex_lock(&log_root_tree->log_mutex);  	atomic_inc(&log_root_tree->log_batch); @@ -4741,7 +4757,8 @@ again:  			if (ret < 0) {  				err = ret;  				goto out_unlock; -			} else if (ret > 0) { +			} else if (ret > 0 && ctx && +				   other_ino != btrfs_ino(ctx->inode)) {  				struct btrfs_key inode_key;  				struct inode *other_inode; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a9f1b75d080d..ab858e31ccbc 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -30,15 +30,18 @@ struct btrfs_log_ctx {  	int log_transid;  	int io_err;  	bool log_new_dentries; +	struct inode *inode;  	struct list_head list;  }; -static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, +				      struct inode *inode)  {  	ctx->log_ret = 0;  	ctx->log_transid = 0;  	ctx->io_err = 0;  	ctx->log_new_dentries = false; +	ctx->inode = inode;  	INIT_LIST_HEAD(&ctx->list);  } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 51f125508771..035efce603a9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -834,10 +834,6 @@ static void __free_device(struct work_struct *work)  	struct btrfs_device *device;  	device = container_of(work, struct btrfs_device, rcu_work); - -	if (device->bdev) -		blkdev_put(device->bdev, device->mode); -  	rcu_string_free(device->name);  	kfree(device);  } @@ -852,6 +848,17 @@ static void free_device(struct rcu_head *head)  	schedule_work(&device->rcu_work);  } +static void btrfs_close_bdev(struct btrfs_device *device) +{ +	if (device->bdev && device->writeable) { +		sync_blockdev(device->bdev); +		invalidate_bdev(device->bdev); +	} + +	if (device->bdev) +		blkdev_put(device->bdev, device->mode); +} +  static void btrfs_close_one_device(struct btrfs_device *device)  {  	struct btrfs_fs_devices *fs_devices = device->fs_devices; @@ -870,10 +877,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)  	if (device->missing)  		fs_devices->missing_devices--; -	if (device->bdev && device->writeable) { -		sync_blockdev(device->bdev); -		invalidate_bdev(device->bdev); -	} +	btrfs_close_bdev(device);  	new_device = btrfs_alloc_device(NULL, &device->devid,  					device->uuid); @@ -1932,6 +1936,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)  		btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);  	} +	btrfs_close_bdev(device); +  	call_rcu(&device->rcu, free_device);  	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; @@ -2025,6 +2031,9 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,  		/* zero out the old super if it is writable */  		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);  	} + +	btrfs_close_bdev(srcdev); +  	call_rcu(&srcdev->rcu, free_device);  	/* @@ -2080,6 +2089,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,  	 * the device_list_mutex lock.  	 */  	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + +	btrfs_close_bdev(tgtdev);  	call_rcu(&tgtdev->rcu, free_device);  } |