Diffstat (limited to 'fs/btrfs')
 fs/btrfs/backref.c          | 85
 fs/btrfs/bio.c              |  2
 fs/btrfs/block-group.c      | 86
 fs/btrfs/delayed-inode.c    |  2
 fs/btrfs/disk-io.c          | 14
 fs/btrfs/extent_map.c       |  7
 fs/btrfs/free-space-cache.c |  8
 fs/btrfs/fs.h               |  7
 fs/btrfs/inode.c            |  7
 fs/btrfs/ioctl.c            |  3
 fs/btrfs/qgroup.c           | 11
 fs/btrfs/space-info.c       | 42
 fs/btrfs/space-info.h       |  2
 fs/btrfs/super.c            |  4
 fs/btrfs/sysfs.c            | 42
 fs/btrfs/transaction.c      | 15
 fs/btrfs/volumes.c          | 23
 fs/btrfs/zoned.c            | 45
 18 files changed, 275 insertions(+), 130 deletions(-)
| diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 90e40d5ceccd..e54f0884802a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1921,8 +1921,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,  	level = -1;  	ULIST_ITER_INIT(&uiter);  	while (1) { -		bool is_shared; -		bool cached; +		const unsigned long prev_ref_count = ctx->refs.nnodes;  		walk_ctx.bytenr = bytenr;  		ret = find_parent_nodes(&walk_ctx, &shared); @@ -1940,21 +1939,36 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,  		ret = 0;  		/* -		 * If our data extent was not directly shared (without multiple -		 * reference items), than it might have a single reference item -		 * with a count > 1 for the same offset, which means there are 2 -		 * (or more) file extent items that point to the data extent - -		 * this happens when a file extent item needs to be split and -		 * then one item gets moved to another leaf due to a b+tree leaf -		 * split when inserting some item. In this case the file extent -		 * items may be located in different leaves and therefore some -		 * of the leaves may be referenced through shared subtrees while -		 * others are not. Since our extent buffer cache only works for -		 * a single path (by far the most common case and simpler to -		 * deal with), we can not use it if we have multiple leaves -		 * (which implies multiple paths). +		 * More than one extent buffer (bytenr) may have been added to +		 * the ctx->refs ulist, in which case we have to check multiple +		 * tree paths in case the first one is not shared, so we can not +		 * use the path cache which is made for a single path. Multiple +		 * extent buffers at the current level happen when: +		 * +		 * 1) level -1, the data extent: If our data extent was not +		 *    directly shared (without multiple reference items), then +		 *    it might have a single reference item with a count > 1 for +		 *    the same offset, which means there are 2 (or more) file +		 *    extent items that point to the data extent - this happens +		 *    when a file extent item needs to be split and then one +		 *    item gets moved to another leaf due to a b+tree leaf split +		 *    when inserting some item. In this case the file extent +		 *    items may be located in different leaves and therefore +		 *    some of the leaves may be referenced through shared +		 *    subtrees while others are not. Since our extent buffer +		 *    cache only works for a single path (by far the most common +		 *    case and simpler to deal with), we can not use it if we +		 *    have multiple leaves (which implies multiple paths). +		 * +		 * 2) level >= 0, a tree node/leaf: We can have a mix of direct +		 *    and indirect references on a b+tree node/leaf, so we have +		 *    to check multiple paths, and the extent buffer (the +		 *    current bytenr) may be shared or not. One example is +		 *    during relocation as we may get a shared tree block ref +		 *    (direct ref) and a non-shared tree block ref (indirect +		 *    ref) for the same node/leaf.  		 */ -		if (level == -1 && ctx->refs.nnodes > 1) +		if ((ctx->refs.nnodes - prev_ref_count) > 1)  			ctx->use_path_cache = false;  		if (level >= 0) @@ -1964,12 +1978,17 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,  		if (!node)  			break;  		bytenr = node->val; -		level++; -		cached = lookup_backref_shared_cache(ctx, root, bytenr, level, -						     &is_shared); -		if (cached) { -			ret = (is_shared ? 
1 : 0); -			break; +		if (ctx->use_path_cache) { +			bool is_shared; +			bool cached; + +			level++; +			cached = lookup_backref_shared_cache(ctx, root, bytenr, +							     level, &is_shared); +			if (cached) { +				ret = (is_shared ? 1 : 0); +				break; +			}  		}  		shared.share_count = 0;  		shared.have_delayed_delete_refs = false; @@ -1977,6 +1996,28 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,  	}  	/* +	 * If the path cache is disabled, then it means at some tree level we +	 * got multiple parents due to a mix of direct and indirect backrefs or +	 * multiple leaves with file extent items pointing to the same data +	 * extent. We have to invalidate the cache and cache only the sharedness +	 * result for the levels where we got only one node/reference. +	 */ +	if (!ctx->use_path_cache) { +		int i = 0; + +		level--; +		if (ret >= 0 && level >= 0) { +			bytenr = ctx->path_cache_entries[level].bytenr; +			ctx->use_path_cache = true; +			store_backref_shared_cache(ctx, root, bytenr, level, ret); +			i = level + 1; +		} + +		for ( ; i < BTRFS_MAX_LEVEL; i++) +			ctx->path_cache_entries[i].bytenr = 0; +	} + +	/*  	 * Cache the sharedness result for the data extent if we know our inode  	 * has more than 1 file extent item that refers to the data extent.  	 */ diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index d8b90f95b157..726592868e9c 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -287,7 +287,7 @@ static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)  	if (btrfs_op(bio) == BTRFS_MAP_WRITE)  		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); -	if (!(bio->bi_opf & REQ_RAHEAD)) +	else if (!(bio->bi_opf & REQ_RAHEAD))  		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);  	if (bio->bi_opf & REQ_PREFLUSH)  		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5b10401d803b..5fc670c27f86 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -558,14 +558,15 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end  static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,  					  struct btrfs_block_group *block_group,  					  int index, int max_index, -					  struct btrfs_key *key) +					  struct btrfs_key *found_key)  {  	struct btrfs_fs_info *fs_info = block_group->fs_info;  	struct btrfs_root *extent_root; -	int ret = 0;  	u64 search_offset;  	u64 search_end = block_group->start + block_group->length;  	struct btrfs_path *path; +	struct btrfs_key search_key; +	int ret = 0;  	ASSERT(index >= 0);  	ASSERT(index <= max_index); @@ -585,37 +586,24 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_  	path->reada = READA_FORWARD;  	search_offset = index * div_u64(block_group->length, max_index); -	key->objectid = block_group->start + search_offset; -	key->type = BTRFS_EXTENT_ITEM_KEY; -	key->offset = 0; +	search_key.objectid = block_group->start + search_offset; +	search_key.type = BTRFS_EXTENT_ITEM_KEY; +	search_key.offset = 0; -	while (1) { -		ret = btrfs_search_forward(extent_root, key, path, 0); -		if (ret != 0) -			goto out; +	btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {  		/* Success; sampled an extent item in the block group */ -		if (key->type == BTRFS_EXTENT_ITEM_KEY && -		    key->objectid >= block_group->start && -		    key->objectid + key->offset <= search_end) -			goto out; +		if (found_key->type == 
BTRFS_EXTENT_ITEM_KEY && +		    found_key->objectid >= block_group->start && +		    found_key->objectid + found_key->offset <= search_end) +			break;  		/* We can't possibly find a valid extent item anymore */ -		if (key->objectid >= search_end) { +		if (found_key->objectid >= search_end) {  			ret = 1;  			break;  		} -		if (key->type < BTRFS_EXTENT_ITEM_KEY) -			key->type = BTRFS_EXTENT_ITEM_KEY; -		else -			key->objectid++; -		btrfs_release_path(path); -		up_read(&fs_info->commit_root_sem); -		mutex_unlock(&caching_ctl->mutex); -		cond_resched(); -		mutex_lock(&caching_ctl->mutex); -		down_read(&fs_info->commit_root_sem);  	} -out: +  	lockdep_assert_held(&caching_ctl->mutex);  	lockdep_assert_held_read(&fs_info->commit_root_sem);  	btrfs_free_path(path); @@ -659,6 +647,7 @@ out:  static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,  				       struct btrfs_block_group *block_group)  { +	struct btrfs_fs_info *fs_info = block_group->fs_info;  	struct btrfs_key key;  	int i;  	u64 min_size = block_group->length; @@ -668,6 +657,8 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl  	if (!btrfs_block_group_should_use_size_class(block_group))  		return 0; +	lockdep_assert_held(&caching_ctl->mutex); +	lockdep_assert_held_read(&fs_info->commit_root_sem);  	for (i = 0; i < 5; ++i) {  		ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);  		if (ret < 0) @@ -682,7 +673,6 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl  		block_group->size_class = size_class;  		spin_unlock(&block_group->lock);  	} -  out:  	return ret;  } @@ -1185,14 +1175,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,  			< block_group->zone_unusable);  		WARN_ON(block_group->space_info->disk_total  			< block_group->length * factor); -		WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, -				 &block_group->runtime_flags) && -			block_group->space_info->active_total_bytes -			< block_group->length);  	}  	block_group->space_info->total_bytes -= block_group->length; -	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) -		block_group->space_info->active_total_bytes -= block_group->length;  	block_group->space_info->bytes_readonly -=  		(block_group->length - block_group->zone_unusable);  	block_group->space_info->bytes_zone_unusable -= @@ -1836,7 +1820,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)  		btrfs_info(fs_info,  			"reclaiming chunk %llu with %llu%% used %llu%% unusable", -				bg->start, div_u64(bg->used * 100, bg->length), +				bg->start, +				div64_u64(bg->used * 100, bg->length),  				div64_u64(zone_unusable * 100, bg->length));  		trace_btrfs_reclaim_block_group(bg);  		ret = btrfs_relocate_chunk(fs_info, bg->start); @@ -2493,18 +2478,29 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,  	struct btrfs_block_group_item bgi;  	struct btrfs_root *root = btrfs_block_group_root(fs_info);  	struct btrfs_key key; +	u64 old_commit_used; +	int ret;  	spin_lock(&block_group->lock);  	btrfs_set_stack_block_group_used(&bgi, block_group->used);  	btrfs_set_stack_block_group_chunk_objectid(&bgi,  						   block_group->global_root_id);  	btrfs_set_stack_block_group_flags(&bgi, block_group->flags); +	old_commit_used = block_group->commit_used; +	block_group->commit_used = block_group->used;  	key.objectid = block_group->start;  	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;  	key.offset = block_group->length;  	spin_unlock(&block_group->lock); 
-	return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); +	ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); +	if (ret < 0) { +		spin_lock(&block_group->lock); +		block_group->commit_used = old_commit_used; +		spin_unlock(&block_group->lock); +	} + +	return ret;  }  static int insert_dev_extent(struct btrfs_trans_handle *trans, @@ -3474,6 +3470,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,  	spin_unlock(&info->delalloc_root_lock);  	while (total) { +		struct btrfs_space_info *space_info;  		bool reclaim = false;  		cache = btrfs_lookup_block_group(info, bytenr); @@ -3481,6 +3478,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,  			ret = -ENOENT;  			break;  		} +		space_info = cache->space_info;  		factor = btrfs_bg_type_to_factor(cache->flags);  		/* @@ -3495,7 +3493,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,  		byte_in_group = bytenr - cache->start;  		WARN_ON(byte_in_group > cache->length); -		spin_lock(&cache->space_info->lock); +		spin_lock(&space_info->lock);  		spin_lock(&cache->lock);  		if (btrfs_test_opt(info, SPACE_CACHE) && @@ -3508,24 +3506,24 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,  			old_val += num_bytes;  			cache->used = old_val;  			cache->reserved -= num_bytes; -			cache->space_info->bytes_reserved -= num_bytes; -			cache->space_info->bytes_used += num_bytes; -			cache->space_info->disk_used += num_bytes * factor; +			space_info->bytes_reserved -= num_bytes; +			space_info->bytes_used += num_bytes; +			space_info->disk_used += num_bytes * factor;  			spin_unlock(&cache->lock); -			spin_unlock(&cache->space_info->lock); +			spin_unlock(&space_info->lock);  		} else {  			old_val -= num_bytes;  			cache->used = old_val;  			cache->pinned += num_bytes; -			btrfs_space_info_update_bytes_pinned(info, -					cache->space_info, num_bytes); -			cache->space_info->bytes_used -= num_bytes; -			cache->space_info->disk_used -= num_bytes * factor; +			btrfs_space_info_update_bytes_pinned(info, space_info, +							     num_bytes); +			space_info->bytes_used -= num_bytes; +			space_info->disk_used -= num_bytes * factor;  			reclaim = should_reclaim_block_group(cache, num_bytes);  			spin_unlock(&cache->lock); -			spin_unlock(&cache->space_info->lock); +			spin_unlock(&space_info->lock);  			set_extent_dirty(&trans->transaction->pinned_extents,  					 bytenr, bytenr + num_bytes - 1, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0095c6e4c3d1..6b457b010cbc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1048,7 +1048,7 @@ again:  	 * so there is only one iref. The case that several irefs are  	 * in the same item doesn't exist.  	 */ -	btrfs_del_item(trans, root, path); +	ret = btrfs_del_item(trans, root, path);  out:  	btrfs_release_delayed_iref(node);  	btrfs_release_path(path); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b53f0e30ce2b..9e1596bb208d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2250,6 +2250,20 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)  	fs_info->csum_shash = csum_shash; +	/* +	 * Check if the checksum implementation is a fast accelerated one. +	 * As-is this is a bit of a hack and should be replaced once the csum +	 * implementations provide that information themselves. 
+	 */ +	switch (csum_type) { +	case BTRFS_CSUM_TYPE_CRC32: +		if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) +			set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); +		break; +	default: +		break; +	} +  	btrfs_info(fs_info, "using %s (%s) checksum algorithm",  			btrfs_super_csum_name(csum_type),  			crypto_shash_driver_name(csum_shash)); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index be94030e1dfb..138afa955370 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -763,7 +763,13 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,  			goto next;  		} +		flags = em->flags;  		clear_bit(EXTENT_FLAG_PINNED, &em->flags); +		/* +		 * In case we split the extent map, we want to preserve the +		 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want +		 * it on the new extent maps. +		 */  		clear_bit(EXTENT_FLAG_LOGGING, &flags);  		modified = !list_empty(&em->list); @@ -774,7 +780,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,  		if (em->start >= start && em_end <= end)  			goto remove_em; -		flags = em->flags;  		gen = em->generation;  		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0d250d052487..d84cef89cdff 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2693,8 +2693,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,  		bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);  	spin_lock(&ctl->tree_lock); +	/* Count initial region as zone_unusable until it gets activated. */  	if (!used)  		to_free = size; +	else if (initial && +		 test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) && +		 (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) +		to_free = 0;  	else if (initial)  		to_free = block_group->zone_capacity;  	else if (offset >= block_group->alloc_offset) @@ -2722,7 +2727,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,  	reclaimable_unusable = block_group->zone_unusable -  			       (block_group->length - block_group->zone_capacity);  	/* All the region is now unusable. Mark it as unused and reclaim */ -	if (block_group->zone_unusable == block_group->length) { +	if (block_group->zone_unusable == block_group->length && +	    block_group->alloc_offset) {  		btrfs_mark_bg_unused(block_group);  	} else if (bg_reclaim_threshold &&  		   reclaimable_unusable >= diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 4c477eae6891..24cd49229408 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -120,11 +120,8 @@ enum {  	/* Indicate that we want to commit the transaction. */  	BTRFS_FS_NEED_TRANS_COMMIT, -	/* -	 * Indicate metadata over-commit is disabled. This is set when active -	 * zone tracking is needed. -	 */ -	BTRFS_FS_NO_OVERCOMMIT, +	/* This is set when active zone tracking is needed. 
*/ +	BTRFS_FS_ACTIVE_ZONE_TRACKING,  	/*  	 * Indicate if we have some features changed, this is mostly for diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6c18dc9a1831..957e4d76a7b6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5421,8 +5421,13 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,  		return -ENOMEM;  	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); -	if (ret) +	if (ret < 0)  		goto out; +	/* +	 * fscrypt_setup_filename() should never return a positive value, but +	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen. +	 */ +	ASSERT(ret == 0);  	/* This needs to handle no-key deletions later on */ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 84626c8ad5bf..ba769a1eb87a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2859,6 +2859,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,  	di_args->bytes_used = btrfs_device_get_bytes_used(dev);  	di_args->total_bytes = btrfs_device_get_total_bytes(dev);  	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); +	memcpy(di_args->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);  	if (dev->name)  		strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path));  	else @@ -3731,7 +3732,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)  	}  	/* update qgroup status and info */ +	mutex_lock(&fs_info->qgroup_ioctl_lock);  	err = btrfs_run_qgroups(trans); +	mutex_unlock(&fs_info->qgroup_ioctl_lock);  	if (err < 0)  		btrfs_handle_fs_error(fs_info, err,  				      "failed to update qgroup status and info"); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 52a7d2fa2284..f41da7ac360d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2828,13 +2828,22 @@ cleanup:  }  /* - * called from commit_transaction. Writes all changed qgroups to disk. + * Writes all changed qgroups to disk. + * Called by the transaction commit path and the qgroup assign ioctl.   */  int btrfs_run_qgroups(struct btrfs_trans_handle *trans)  {  	struct btrfs_fs_info *fs_info = trans->fs_info;  	int ret = 0; +	/* +	 * In case we are called from the qgroup assign ioctl, assert that we +	 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota +	 * disable operation (ioctl) and access a freed quota root. +	 */ +	if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) +		lockdep_assert_held(&fs_info->qgroup_ioctl_lock); +  	if (!fs_info->quota_root)  		return ret; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 69c09508afb5..3eecce86f63f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -308,8 +308,6 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,  	ASSERT(found);  	spin_lock(&found->lock);  	found->total_bytes += block_group->length; -	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) -		found->active_total_bytes += block_group->length;  	found->disk_total += block_group->length * factor;  	found->bytes_used += block_group->used;  	found->disk_used += block_group->used * factor; @@ -379,22 +377,6 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,  	return avail;  } -static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, -				       struct btrfs_space_info *space_info) -{ -	/* -	 * On regular filesystem, all total_bytes are always writable. On zoned -	 * filesystem, there may be a limitation imposed by max_active_zones. 
-	 * For metadata allocation, we cannot finish an existing active block -	 * group to avoid a deadlock. Thus, we need to consider only the active -	 * groups to be writable for metadata space. -	 */ -	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) -		return space_info->total_bytes; - -	return space_info->active_total_bytes; -} -  int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,  			 struct btrfs_space_info *space_info, u64 bytes,  			 enum btrfs_reserve_flush_enum flush) @@ -407,13 +389,13 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,  		return 0;  	used = btrfs_space_info_used(space_info, true); -	if (test_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags) && +	if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) &&  	    (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))  		avail = 0;  	else  		avail = calc_available_free_space(fs_info, space_info, flush); -	if (used + bytes < writable_total_bytes(fs_info, space_info) + avail) +	if (used + bytes < space_info->total_bytes + avail)  		return 1;  	return 0;  } @@ -449,7 +431,7 @@ again:  		ticket = list_first_entry(head, struct reserve_ticket, list);  		/* Check and see if our ticket can be satisfied now. */ -		if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) || +		if ((used + ticket->bytes <= space_info->total_bytes) ||  		    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,  					 flush)) {  			btrfs_space_info_update_bytes_may_use(fs_info, @@ -829,7 +811,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,  {  	u64 used;  	u64 avail; -	u64 total;  	u64 to_reclaim = space_info->reclaim_size;  	lockdep_assert_held(&space_info->lock); @@ -844,9 +825,8 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,  	 * space.  If that's the case add in our overage so we make sure to put  	 * appropriate pressure on the flushing state machine.  	 */ -	total = writable_total_bytes(fs_info, space_info); -	if (total + avail < used) -		to_reclaim += used - (total + avail); +	if (space_info->total_bytes + avail < used) +		to_reclaim += used - (space_info->total_bytes + avail);  	return to_reclaim;  } @@ -856,11 +836,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,  {  	u64 global_rsv_size = fs_info->global_block_rsv.reserved;  	u64 ordered, delalloc; -	u64 total = writable_total_bytes(fs_info, space_info);  	u64 thresh;  	u64 used; -	thresh = mult_perc(total, 90); +	thresh = mult_perc(space_info->total_bytes, 90);  	lockdep_assert_held(&space_info->lock); @@ -923,8 +902,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,  					   BTRFS_RESERVE_FLUSH_ALL);  	used = space_info->bytes_used + space_info->bytes_reserved +  	       space_info->bytes_readonly + global_rsv_size; -	if (used < total) -		thresh += total - used; +	if (used < space_info->total_bytes) +		thresh += space_info->total_bytes - used;  	thresh >>= space_info->clamp;  	used = space_info->bytes_pinned; @@ -1651,7 +1630,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,  	 * can_overcommit() to ensure we can overcommit to continue.  	 
*/  	if (!pending_tickets && -	    ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) || +	    ((used + orig_bytes <= space_info->total_bytes) ||  	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {  		btrfs_space_info_update_bytes_may_use(fs_info, space_info,  						      orig_bytes); @@ -1665,8 +1644,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,  	 */  	if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {  		used = btrfs_space_info_used(space_info, false); -		if (used + orig_bytes <= -		    writable_total_bytes(fs_info, space_info)) { +		if (used + orig_bytes <= space_info->total_bytes) {  			btrfs_space_info_update_bytes_may_use(fs_info, space_info,  							      orig_bytes);  			ret = 0; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index fc99ea2b0c34..2033b71b18ce 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -96,8 +96,6 @@ struct btrfs_space_info {  	u64 bytes_may_use;	/* number of bytes that may be used for  				   delalloc/allocations */  	u64 bytes_readonly;	/* total bytes that are read only */ -	/* Total bytes in the space, but only accounts active block groups. */ -	u64 active_total_bytes;  	u64 bytes_zone_unusable;	/* total bytes that are unusable until  					   resetting the device zone */ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 581845bc206a..366fb4cde145 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1516,8 +1516,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,  		shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,  					s->s_id);  		btrfs_sb(s)->bdev_holder = fs_type; -		if (!strstr(crc32c_impl(), "generic")) -			set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);  		error = btrfs_fill_super(s, fs_devices, data);  	}  	if (!error) @@ -1631,6 +1629,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,  	btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);  	btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);  	btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); +	workqueue_set_max_active(fs_info->endio_workers, new_pool_size); +	workqueue_set_max_active(fs_info->endio_meta_workers, new_pool_size);  	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);  	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);  	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8c5efa5813b3..37fc58a7f27e 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -9,6 +9,7 @@  #include <linux/spinlock.h>  #include <linux/completion.h>  #include <linux/bug.h> +#include <linux/list.h>  #include <crypto/hash.h>  #include "messages.h"  #include "ctree.h" @@ -778,6 +779,45 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,  	return len;  } +static ssize_t btrfs_size_classes_show(struct kobject *kobj, +				       struct kobj_attribute *a, char *buf) +{ +	struct btrfs_space_info *sinfo = to_space_info(kobj); +	struct btrfs_block_group *bg; +	u32 none = 0; +	u32 small = 0; +	u32 medium = 0; +	u32 large = 0; + +	for (int i = 0; i < BTRFS_NR_RAID_TYPES; ++i) { +		down_read(&sinfo->groups_sem); +		list_for_each_entry(bg, &sinfo->block_groups[i], list) { +			if (!btrfs_block_group_should_use_size_class(bg)) +				continue; +			switch (bg->size_class) { +			case BTRFS_BG_SZ_NONE: +				none++; +				break; +			case BTRFS_BG_SZ_SMALL: +				small++; +				break; +			case 
BTRFS_BG_SZ_MEDIUM: +				medium++; +				break; +			case BTRFS_BG_SZ_LARGE: +				large++; +				break; +			} +		} +		up_read(&sinfo->groups_sem); +	} +	return sysfs_emit(buf, "none %u\n" +			       "small %u\n" +			       "medium %u\n" +			       "large %u\n", +			       none, small, medium, large); +} +  #ifdef CONFIG_BTRFS_DEBUG  /*   * Request chunk allocation with current chunk size. @@ -835,6 +875,7 @@ SPACE_INFO_ATTR(bytes_zone_unusable);  SPACE_INFO_ATTR(disk_used);  SPACE_INFO_ATTR(disk_total);  BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); +BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);  static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,  						     struct kobj_attribute *a, @@ -887,6 +928,7 @@ static struct attribute *space_info_attrs[] = {  	BTRFS_ATTR_PTR(space_info, disk_total),  	BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),  	BTRFS_ATTR_PTR(space_info, chunk_size), +	BTRFS_ATTR_PTR(space_info, size_classes),  #ifdef CONFIG_BTRFS_DEBUG  	BTRFS_ATTR_PTR(space_info, force_chunk_alloc),  #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 18329ebcb1cb..b8d5b1fa9a03 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2035,7 +2035,20 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)  	if (current->journal_info == trans)  		current->journal_info = NULL; -	btrfs_scrub_cancel(fs_info); + +	/* +	 * If relocation is running, we can't cancel scrub because that will +	 * result in a deadlock. Before relocating a block group, relocation +	 * pauses scrub, then starts and commits a transaction before unpausing +	 * scrub. If the transaction commit is being done by the relocation +	 * task or triggered by another task and the relocation task is waiting +	 * for the commit, and we end up here due to an error in the commit +	 * path, then calling btrfs_scrub_cancel() will deadlock, as we are +	 * asking for scrub to stop while having it asked to be paused higher +	 * above in relocation code. +	 */ +	if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) +		btrfs_scrub_cancel(fs_info);  	kmem_cache_free(btrfs_trans_handle_cachep, trans);  } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7823168c08a6..c6d592870400 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1366,8 +1366,17 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,  	 * So, we need to add a special mount option to scan for  	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead  	 */ -	flags |= FMODE_EXCL; +	/* +	 * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may +	 * initiate the device scan which may race with the user's mount +	 * or mkfs command, resulting in failure. +	 * Since the device scan is solely for reading purposes, there is +	 * no need for FMODE_EXCL. Additionally, the devices are read again +	 * during the mount process. It is ok to get some inconsistent +	 * values temporarily, as the device paths of the fsid are the only +	 * required information for assembling the volume. +	 */  	bdev = blkdev_get_by_path(path, flags, holder);  	if (IS_ERR(bdev))  		return ERR_CAST(bdev); @@ -3266,8 +3275,15 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)  	btrfs_scrub_pause(fs_info);  	ret = btrfs_relocate_block_group(fs_info, chunk_offset);  	btrfs_scrub_continue(fs_info); -	if (ret) +	if (ret) { +		/* +		 * If we had a transaction abort, stop all running scrubs. 
+		 * See transaction.c:cleanup_transaction() why we do it here. +		 */ +		if (BTRFS_FS_ERROR(fs_info)) +			btrfs_scrub_cancel(fs_info);  		return ret; +	}  	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);  	if (!block_group) @@ -6363,7 +6379,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,  	ASSERT(op != BTRFS_MAP_DISCARD);  	em = btrfs_get_chunk_map(fs_info, logical, *length); -	ASSERT(!IS_ERR(em)); +	if (IS_ERR(em)) +		return PTR_ERR(em);  	map = em->map_lookup;  	data_stripes = nr_data_stripes(map); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f95b2c94d619..45d04092f2f8 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -524,8 +524,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)  		}  		atomic_set(&zone_info->active_zones_left,  			   max_active_zones - nactive); -		/* Overcommit does not work well with active zone tacking. */ -		set_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags); +		set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);  	}  	/* Validate superblock log */ @@ -1581,9 +1580,19 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)  		return;  	WARN_ON(cache->bytes_super != 0); -	unusable = (cache->alloc_offset - cache->used) + -		   (cache->length - cache->zone_capacity); -	free = cache->zone_capacity - cache->alloc_offset; + +	/* Check for block groups never get activated */ +	if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) && +	    cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) && +	    !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) && +	    cache->alloc_offset == 0) { +		unusable = cache->length; +		free = 0; +	} else { +		unusable = (cache->alloc_offset - cache->used) + +			   (cache->length - cache->zone_capacity); +		free = cache->zone_capacity - cache->alloc_offset; +	}  	/* We only need ->free_space in ALLOC_SEQ block groups */  	cache->cached = BTRFS_CACHE_FINISHED; @@ -1902,7 +1911,11 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)  	/* Successfully activated all the zones */  	set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); -	space_info->active_total_bytes += block_group->length; +	WARN_ON(block_group->alloc_offset != 0); +	if (block_group->zone_unusable == block_group->length) { +		block_group->zone_unusable = block_group->length - block_group->zone_capacity; +		space_info->bytes_zone_unusable -= block_group->zone_capacity; +	}  	spin_unlock(&block_group->lock);  	btrfs_try_granting_tickets(fs_info, space_info);  	spin_unlock(&space_info->lock); @@ -2086,11 +2099,21 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)  		if (!device->bdev)  			continue; -		if (!zinfo->max_active_zones || -		    atomic_read(&zinfo->active_zones_left)) { +		if (!zinfo->max_active_zones) {  			ret = true;  			break;  		} + +		switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) { +		case 0: /* single */ +			ret = (atomic_read(&zinfo->active_zones_left) >= 1); +			break; +		case BTRFS_BLOCK_GROUP_DUP: +			ret = (atomic_read(&zinfo->active_zones_left) >= 2); +			break; +		} +		if (ret) +			break;  	}  	mutex_unlock(&fs_info->chunk_mutex); @@ -2256,7 +2279,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)  		u64 avail;  		spin_lock(&block_group->lock); -		if (block_group->reserved || +		if (block_group->reserved || block_group->alloc_offset == 0 ||  		    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {  			
spin_unlock(&block_group->lock);  			continue; @@ -2293,10 +2316,6 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,  	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))  		return 0; -	/* No more block groups to activate */ -	if (space_info->active_total_bytes == space_info->total_bytes) -		return 0; -  	for (;;) {  		int ret;  		bool need_finish = false; |
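
For reference, the new read-only "size_classes" attribute added in sysfs.c above reports how many block groups of each size class exist for a space info. Below is a minimal userspace sketch, not part of the patch: the sysfs path layout (/sys/fs/btrfs/<fsid>/allocation/<type>/) and the sample values are assumptions; only the four-line "none/small/medium/large" output format comes from the sysfs_emit() call in the patch.

/*
 * Hedged example: dump the size_classes attribute for one space info.
 * Pass the full sysfs path as argv[1]; the default below is a placeholder
 * and must be adjusted to the filesystem's UUID and space info type.
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = (argc > 1) ? argv[1] :
		"/sys/fs/btrfs/<fsid>/allocation/metadata/size_classes";
	char line[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	/* Expected contents (counts are examples): "none 0", "small 3",
	 * "medium 1", "large 7", one per line. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}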