Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--  fs/btrfs/extent-tree.c | 211
1 file changed, 159 insertions, 52 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47c1ba141082..222d6aea4a8a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache)
 	struct btrfs_caching_control *ctl;

 	spin_lock(&cache->lock);
-	if (cache->cached != BTRFS_CACHE_STARTED) {
-		spin_unlock(&cache->lock);
-		return NULL;
-	}
-
-	/* We're loading it the fast way, so we don't have a caching_ctl. */
 	if (!cache->caching_ctl) {
 		spin_unlock(&cache->lock);
 		return NULL;
@@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	spin_unlock(&cache->lock);

 	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+		mutex_lock(&caching_ctl->mutex);
 		ret = load_free_space_cache(fs_info, cache);

 		spin_lock(&cache->lock);
@@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 			cache->caching_ctl = NULL;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			cache->last_byte_to_unpin = (u64)-1;
+			caching_ctl->progress = (u64)-1;
 		} else {
 			if (load_cache_only) {
 				cache->caching_ctl = NULL;
 				cache->cached = BTRFS_CACHE_NO;
 			} else {
 				cache->cached = BTRFS_CACHE_STARTED;
+				cache->has_caching_ctl = 1;
 			}
 		}
 		spin_unlock(&cache->lock);
+		mutex_unlock(&caching_ctl->mutex);
+
 		wake_up(&caching_ctl->wait);
 		if (ret == 1) {
 			put_caching_control(caching_ctl);
@@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 			cache->cached = BTRFS_CACHE_NO;
 		} else {
 			cache->cached = BTRFS_CACHE_STARTED;
+			cache->has_caching_ctl = 1;
 		}
 		spin_unlock(&cache->lock);
 		wake_up(&caching_ctl->wait);
@@ -3162,7 +3162,19 @@ next_block_group(struct btrfs_root *root,
 		 struct btrfs_block_group_cache *cache)
 {
 	struct rb_node *node;
+
 	spin_lock(&root->fs_info->block_group_cache_lock);
+
+	/* If our block group was removed, we need a full search. */
+	if (RB_EMPTY_NODE(&cache->cache_node)) {
+		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+		spin_unlock(&root->fs_info->block_group_cache_lock);
+		btrfs_put_block_group(cache);
+		cache = btrfs_lookup_first_block_group(root->fs_info,
+						       next_bytenr);
+		return cache;
+	}
 	node = rb_next(&cache->cache_node);
 	btrfs_put_block_group(cache);
 	if (node) {
@@ -3504,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->chunk_alloc = 0;
 	found->flush = 0;
 	init_waitqueue_head(&found->wait);
+	INIT_LIST_HEAD(&found->ro_bgs);

 	ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
 				    info->space_info_kobj, "%s",
@@ -5425,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root,
 			spin_unlock(&cache->space_info->lock);
 		} else {
 			old_val -= num_bytes;
+			btrfs_set_block_group_used(&cache->item, old_val);
+			cache->pinned += num_bytes;
+			cache->space_info->bytes_pinned += num_bytes;
+			cache->space_info->bytes_used -= num_bytes;
+			cache->space_info->disk_used -= num_bytes * factor;
+			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);

+			set_extent_dirty(info->pinned_extents,
+					 bytenr, bytenr + num_bytes - 1,
+					 GFP_NOFS | __GFP_NOFAIL);
 			/*
 			 * No longer have used bytes in this block group, queue
 			 * it for deletion.
@@ -5439,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root,
 				}
 				spin_unlock(&info->unused_bgs_lock);
 			}
-			btrfs_set_block_group_used(&cache->item, old_val);
-			cache->pinned += num_bytes;
-			cache->space_info->bytes_pinned += num_bytes;
-			cache->space_info->bytes_used -= num_bytes;
-			cache->space_info->disk_used -= num_bytes * factor;
-			spin_unlock(&cache->lock);
-			spin_unlock(&cache->space_info->lock);
-
-			set_extent_dirty(info->pinned_extents,
-					 bytenr, bytenr + num_bytes - 1,
-					 GFP_NOFS | __GFP_NOFAIL);
 		}
 		btrfs_put_block_group(cache);
 		total -= num_bytes;
@@ -8511,6 +8523,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 	    min_allocable_bytes <= sinfo->total_bytes) {
 		sinfo->bytes_readonly += num_bytes;
 		cache->ro = 1;
+		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
 		ret = 0;
 	}
 out:
@@ -8565,15 +8578,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,

 /*
  * helper to account the unused space of all the readonly block group in the
- * list. takes mirrors into account.
+ * space_info. takes mirrors into account.
  */
-static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 {
 	struct btrfs_block_group_cache *block_group;
 	u64 free_bytes = 0;
 	int factor;

-	list_for_each_entry(block_group, groups_list, list) {
+	/* It's df, we don't care if it's racey */
+	if (list_empty(&sinfo->ro_bgs))
+		return 0;
+
+	spin_lock(&sinfo->lock);
+	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
 		spin_lock(&block_group->lock);

 		if (!block_group->ro) {
@@ -8594,26 +8612,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)

 		spin_unlock(&block_group->lock);
 	}
-
-	return free_bytes;
-}
-
-/*
- * helper to account the unused space of all the readonly block group in the
- * space_info. takes mirrors into account.
- */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
-	int i;
-	u64 free_bytes = 0;
-
-	spin_lock(&sinfo->lock);
-
-	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
-		if (!list_empty(&sinfo->block_groups[i]))
-			free_bytes += __btrfs_get_ro_block_group_free_space(
-						&sinfo->block_groups[i]);
-
 	spin_unlock(&sinfo->lock);

 	return free_bytes;
@@ -8633,6 +8631,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
 	sinfo->bytes_readonly -= num_bytes;
 	cache->ro = 0;
+	list_del_init(&cache->ro_list);
 	spin_unlock(&cache->lock);
 	spin_unlock(&sinfo->lock);
 }
@@ -9002,7 +9001,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
 	INIT_LIST_HEAD(&cache->bg_list);
+	INIT_LIST_HEAD(&cache->ro_list);
 	btrfs_init_free_space_ctl(cache);
+	atomic_set(&cache->trimming, 0);

 	return cache;
 }
@@ -9195,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 	int ret = 0;

 	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
-		list_del_init(&block_group->bg_list);
 		if (ret)
-			continue;
+			goto next;

 		spin_lock(&block_group->lock);
 		memcpy(&item, &block_group->item, sizeof(item));
@@ -9212,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 					       key.objectid, key.offset);
 		if (ret)
 			btrfs_abort_transaction(trans, extent_root, ret);
+next:
+		list_del_init(&block_group->bg_list);
 	}
 }

@@ -9304,7 +9306,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 }

 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 group_start)
+			     struct btrfs_root *root, u64 group_start,
+			     struct extent_map *em)
 {
 	struct btrfs_path *path;
 	struct btrfs_block_group_cache *block_group;
@@ -9316,6 +9319,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	int ret;
 	int index;
 	int factor;
+	struct btrfs_caching_control *caching_ctl = NULL;
+	bool remove_em;

 	root = root->fs_info->extent_root;

@@ -9400,6 +9405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_lock(&root->fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
+	RB_CLEAR_NODE(&block_group->cache_node);

 	if (root->fs_info->first_logical_byte == block_group->key.objectid)
 		root->fs_info->first_logical_byte = (u64)-1;
@@ -9411,6 +9417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	 * are still on the list after taking the semaphore
 	 */
 	list_del_init(&block_group->list);
+	list_del_init(&block_group->ro_list);
 	if (list_empty(&block_group->space_info->block_groups[index])) {
 		kobj = block_group->space_info->block_group_kobjs[index];
 		block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9422,8 +9429,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		kobject_put(kobj);
 	}

+	if (block_group->has_caching_ctl)
+		caching_ctl = get_caching_control(block_group);
 	if (block_group->cached == BTRFS_CACHE_STARTED)
 		wait_block_group_cache_done(block_group);
+	if (block_group->has_caching_ctl) {
+		down_write(&root->fs_info->commit_root_sem);
+		if (!caching_ctl) {
+			struct btrfs_caching_control *ctl;
+
+			list_for_each_entry(ctl,
+				    &root->fs_info->caching_block_groups, list)
+				if (ctl->block_group == block_group) {
+					caching_ctl = ctl;
+					atomic_inc(&caching_ctl->count);
+					break;
+				}
+		}
+		if (caching_ctl)
+			list_del_init(&caching_ctl->list);
+		up_write(&root->fs_info->commit_root_sem);
+		if (caching_ctl) {
+			/* Once for the caching bgs list and once for us. */
+			put_caching_control(caching_ctl);
+			put_caching_control(caching_ctl);
+		}
+	}

 	btrfs_remove_free_space_cache(block_group);

@@ -9435,6 +9466,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,

 	memcpy(&key, &block_group->key, sizeof(key));

+	lock_chunks(root);
+	if (!list_empty(&em->list)) {
+		/* We're in the transaction->pending_chunks list. */
+		free_extent_map(em);
+	}
+	spin_lock(&block_group->lock);
+	block_group->removed = 1;
+	/*
+	 * At this point trimming can't start on this block group, because we
+	 * removed the block group from the tree fs_info->block_group_cache_tree
+	 * so no one can't find it anymore and even if someone already got this
+	 * block group before we removed it from the rbtree, they have already
+	 * incremented block_group->trimming - if they didn't, they won't find
+	 * any free space entries because we already removed them all when we
+	 * called btrfs_remove_free_space_cache().
+	 *
+	 * And we must not remove the extent map from the fs_info->mapping_tree
+	 * to prevent the same logical address range and physical device space
+	 * ranges from being reused for a new block group. This is because our
+	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+	 * completely transactionless, so while it is trimming a range the
+	 * currently running transaction might finish and a new one start,
+	 * allowing for new block groups to be created that can reuse the same
+	 * physical device locations unless we take this special care.
+	 */
+	remove_em = (atomic_read(&block_group->trimming) == 0);
+	/*
+	 * Make sure a trimmer task always sees the em in the pinned_chunks list
+	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
+	 * before checking block_group->removed).
+	 */
+	if (!remove_em) {
+		/*
+		 * Our em might be in trans->transaction->pending_chunks which
+		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+		 * and so is the fs_info->pinned_chunks list.
+		 *
+		 * So at this point we must be holding the chunk_mutex to avoid
+		 * any races with chunk allocation (more specifically at
+		 * volumes.c:contains_pending_extent()), to ensure it always
+		 * sees the em, either in the pending_chunks list or in the
+		 * pinned_chunks list.
+		 */
+		list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+	}
+	spin_unlock(&block_group->lock);
+
+	if (remove_em) {
+		struct extent_map_tree *em_tree;
+
+		em_tree = &root->fs_info->mapping_tree.map_tree;
+		write_lock(&em_tree->lock);
+		/*
+		 * The em might be in the pending_chunks list, so make sure the
+		 * chunk mutex is locked, since remove_extent_mapping() will
+		 * delete us from that list.
+		 */
+		remove_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		/* once for the tree */
+		free_extent_map(em);
+	}
+
+	unlock_chunks(root);
+
 	btrfs_put_block_group(block_group);
 	btrfs_put_block_group(block_group);

@@ -9523,10 +9619,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		 */
 		start = block_group->key.objectid;
 		end = start + block_group->key.offset - 1;
-		clear_extent_bits(&fs_info->freed_extents[0], start, end,
+		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
 				  EXTENT_DIRTY, GFP_NOFS);
-		clear_extent_bits(&fs_info->freed_extents[1], start, end,
+		if (ret) {
+			btrfs_set_block_group_rw(root, block_group);
+			goto end_trans;
+		}
+		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
 				  EXTENT_DIRTY, GFP_NOFS);
+		if (ret) {
+			btrfs_set_block_group_rw(root, block_group);
+			goto end_trans;
+		}

 		/* Reset pinned so btrfs_put_block_group doesn't complain */
 		block_group->pinned = 0;
@@ -9537,6 +9641,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		 */
 		ret = btrfs_remove_chunk(trans, root,
 					 block_group->key.objectid);
+end_trans:
 		btrfs_end_transaction(trans, root);
 next:
 		btrfs_put_block_group(block_group);
@@ -9657,12 +9762,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 }

 /*
- * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
- * they are used to prevent the some tasks writing data into the page cache
- * by nocow before the subvolume is snapshoted, but flush the data into
- * the disk after the snapshot creation.
+ * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
+ * data into the page cache through nocow before the subvolume is snapshoted,
+ * but flush the data into disk after the snapshot creation, or to prevent
+ * operations while snapshoting is ongoing and that cause the snapshot to be
+ * inconsistent (writes followed by expanding truncates for example).
  */
-void btrfs_end_nocow_write(struct btrfs_root *root)
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 {
 	percpu_counter_dec(&root->subv_writers->counter);
 	/*
@@ -9674,7 +9781,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
 		wake_up(&root->subv_writers->wait);
 }

-int btrfs_start_nocow_write(struct btrfs_root *root)
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
 {
 	if (atomic_read(&root->will_be_snapshoted))
 		return 0;
@@ -9685,7 +9792,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
 	 */
 	smp_mb();
 	if (atomic_read(&root->will_be_snapshoted)) {
-		btrfs_end_nocow_write(root);
+		btrfs_end_write_no_snapshoting(root);
 		return 0;
 	}
 	return 1;
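Note on the remove_em logic added to btrfs_remove_block_group() above: it is a "last user cleans up" handshake between block group removal and fs trimming. The remover marks the group removed under block_group->lock and frees the chunk's extent map only if no trimmer currently holds a trimming reference; otherwise the trimmer that drops the last reference performs the deferred cleanup. The following is a minimal userspace sketch of that pattern only, not kernel code: the struct and the bg_start_trim()/bg_end_trim()/bg_remove() names are hypothetical, and the pinned_chunks/chunk_mutex handling from the real patch is deliberately omitted.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct mapping { long start, len; };          /* stand-in for the extent map */

struct block_group {
	pthread_mutex_t lock;                 /* like block_group->lock      */
	int trimming;                         /* like atomic_t trimming      */
	bool removed;                         /* like block_group->removed   */
	struct mapping *em;                   /* mapping owned by the group  */
};

/* Trimmer side: take a reference before touching the group's space. */
static void bg_start_trim(struct block_group *bg)
{
	pthread_mutex_lock(&bg->lock);
	bg->trimming++;
	pthread_mutex_unlock(&bg->lock);
}

/* Trimmer side: drop the reference; clean up if removal already happened. */
static void bg_end_trim(struct block_group *bg)
{
	struct mapping *to_free = NULL;

	pthread_mutex_lock(&bg->lock);
	if (--bg->trimming == 0 && bg->removed) {
		to_free = bg->em;             /* we are the last user */
		bg->em = NULL;
	}
	pthread_mutex_unlock(&bg->lock);
	free(to_free);
}

/* Remover side: mirrors the remove_em decision in the patch. */
static void bg_remove(struct block_group *bg)
{
	struct mapping *to_free = NULL;

	pthread_mutex_lock(&bg->lock);
	bg->removed = true;
	if (bg->trimming == 0) {              /* no trimmer: free right away */
		to_free = bg->em;
		bg->em = NULL;
	}                                     /* else the last trimmer frees */
	pthread_mutex_unlock(&bg->lock);
	free(to_free);
}

int main(void)
{
	struct block_group bg = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.em = malloc(sizeof(struct mapping)),
	};

	bg_start_trim(&bg);   /* a trim is in flight...                 */
	bg_remove(&bg);       /* ...so removal defers freeing the map   */
	bg_end_trim(&bg);     /* last trimmer performs the deferred free */
	printf("mapping freed by last user: %s\n", bg.em ? "no" : "yes");
	return 0;
}

Because both sides make the free/defer decision while holding the same lock that protects the counter and the removed flag, exactly one of them frees the mapping, which is the same invariant the patch establishes with block_group->lock, block_group->trimming and remove_em.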