Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--	fs/btrfs/extent-tree.c	248
1 files changed, 188 insertions, 60 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 84e060eb0de8..82b912a293ab 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -231,9 +231,9 @@ static int add_excluded_extent(struct btrfs_root *root,
 {
 	u64 end = start + num_bytes - 1;
 	set_extent_bits(&root->fs_info->freed_extents[0],
-			start, end, EXTENT_UPTODATE, GFP_NOFS);
+			start, end, EXTENT_UPTODATE);
 	set_extent_bits(&root->fs_info->freed_extents[1],
-			start, end, EXTENT_UPTODATE, GFP_NOFS);
+			start, end, EXTENT_UPTODATE);
 	return 0;
 }
@@ -246,9 +246,9 @@ static void free_excluded_extents(struct btrfs_root *root,
 	end = start + cache->key.offset - 1;
 	clear_extent_bits(&root->fs_info->freed_extents[0],
-			  start, end, EXTENT_UPTODATE, GFP_NOFS);
+			  start, end, EXTENT_UPTODATE);
 	clear_extent_bits(&root->fs_info->freed_extents[1],
-			  start, end, EXTENT_UPTODATE, GFP_NOFS);
+			  start, end, EXTENT_UPTODATE);
 }
 static int exclude_super_stripes(struct btrfs_root *root,
@@ -980,7 +980,7 @@ out_free:
  * event that tree block loses its owner tree's reference and do the
  * back refs conversion.
  *
- * When a tree block is COW'd through a tree, there are four cases:
+ * When a tree block is COWed through a tree, there are four cases:
  *
  * The reference count of the block is one and the tree is the block's
  * owner tree. Nothing to do in this case.
@@ -2042,6 +2042,11 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 	struct btrfs_bio *bbio = NULL;
+	/*
+	 * Avoid races with device replace and make sure our bbio has devices
+	 * associated to its stripes that don't go away while we are discarding.
+	 */
+	btrfs_bio_counter_inc_blocked(root->fs_info);
 	/* Tell the block device(s) that the sectors can be discarded */
 	ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
 			      bytenr, &num_bytes, &bbio, 0);
@@ -2074,6 +2079,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 		}
 		btrfs_put_bbio(bbio);
 	}
+	btrfs_bio_counter_dec(root->fs_info);
 	if (actual_bytes)
 		*actual_bytes = discarded_bytes;
@@ -2595,7 +2601,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			}
 			/*
-			 * Need to drop our head ref lock and re-aqcuire the
+			 * Need to drop our head ref lock and re-acquire the
 			 * delayed ref lock and then re-check to make sure
 			 * nobody got added.
 			 */
@@ -2747,7 +2753,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 	/*
 	 * We don't ever fill up leaves all the way so multiply by 2 just to be
-	 * closer to what we're really going to want to ouse.
+	 * closer to what we're really going to want to use.
 	 */
 	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
@@ -2829,6 +2835,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
 struct async_delayed_refs {
 	struct btrfs_root *root;
+	u64 transid;
 	int count;
 	int error;
 	int sync;
@@ -2844,6 +2851,10 @@ static void delayed_ref_async_start(struct btrfs_work *work)
 	async = container_of(work, struct async_delayed_refs, work);
+	/* if the commit is already started, we don't need to wait here */
+	if (btrfs_transaction_blocked(async->root->fs_info))
+		goto done;
+
 	trans = btrfs_join_transaction(async->root);
 	if (IS_ERR(trans)) {
 		async->error = PTR_ERR(trans);
@@ -2851,14 +2862,19 @@ static void delayed_ref_async_start(struct btrfs_work *work)
 	}
 	/*
-	 * trans->sync means that when we call end_transaciton, we won't
+	 * trans->sync means that when we call end_transaction, we won't
 	 * wait on delayed refs
 	 */
 	trans->sync = true;
+
+	/* Don't bother flushing if we got into a different transaction */
+	if (trans->transid > async->transid)
+		goto end;
+
 	ret = btrfs_run_delayed_refs(trans, async->root, async->count);
 	if (ret)
 		async->error = ret;
-
+end:
 	ret = btrfs_end_transaction(trans, async->root);
 	if (ret && !async->error)
 		async->error = ret;
@@ -2870,7 +2886,7 @@ done:
 }
 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
-				 unsigned long count, int wait)
+				 unsigned long count, u64 transid, int wait)
 {
 	struct async_delayed_refs *async;
 	int ret;
@@ -2882,6 +2898,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 	async->root = root->fs_info->tree_root;
 	async->count = count;
 	async->error = 0;
+	async->transid = transid;
 	if (wait)
 		async->sync = 1;
 	else
@@ -3824,6 +3841,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
 	return readonly;
 }
+bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_block_group_cache *bg;
+	bool ret = true;
+
+	bg = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg)
+		return false;
+
+	spin_lock(&bg->lock);
+	if (bg->ro)
+		ret = false;
+	else
+		atomic_inc(&bg->nocow_writers);
+	spin_unlock(&bg->lock);
+
+	/* no put on block group, done by btrfs_dec_nocow_writers */
+	if (!ret)
+		btrfs_put_block_group(bg);
+
+	return ret;
+
+}
+
+void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_block_group_cache *bg;
+
+	bg = btrfs_lookup_block_group(fs_info, bytenr);
+	ASSERT(bg);
+	if (atomic_dec_and_test(&bg->nocow_writers))
+		wake_up_atomic_t(&bg->nocow_writers);
+	/*
+	 * Once for our lookup and once for the lookup done by a previous call
+	 * to btrfs_inc_nocow_writers()
+	 */
+	btrfs_put_block_group(bg);
+	btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
+{
+	schedule();
+	return 0;
+}
+
+void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
+{
+	wait_on_atomic_t(&bg->nocow_writers,
+			 btrfs_wait_nocow_writers_atomic_t,
+			 TASK_UNINTERRUPTIBLE);
+}
+
 static const char *alloc_name(u64 flags)
 {
 	switch (flags) {
@@ -4141,7 +4211,7 @@ commit_trans:
 			if (need_commit > 0) {
 				btrfs_start_delalloc_roots(fs_info, 0, -1);
-				btrfs_wait_ordered_roots(fs_info, -1);
+				btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
 			}
 			trans = btrfs_join_transaction(root);
@@ -4243,7 +4313,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
  * Called if we need to clear a data reservation for this inode
  * Normally in a error case.
  *
- * This one will handle the per-indoe data rsv map for accurate reserved
+ * This one will handle the per-inode data rsv map for accurate reserved
  * space framework.
  */
 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
@@ -4583,7 +4653,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 		 */
 		btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
 		if (!current->journal_info)
-			btrfs_wait_ordered_roots(root->fs_info, nr_items);
+			btrfs_wait_ordered_roots(root->fs_info, nr_items,
+						 0, (u64)-1);
 	}
 }
@@ -4620,7 +4691,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	/* Calc the number of the pages we need flush for space reservation */
 	items = calc_reclaim_items_nr(root, to_reclaim);
-	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+	to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -4632,7 +4703,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 		if (trans)
 			return;
 		if (wait_ordered)
-			btrfs_wait_ordered_roots(root->fs_info, items);
+			btrfs_wait_ordered_roots(root->fs_info, items,
+						 0, (u64)-1);
 		return;
 	}
@@ -4671,7 +4743,8 @@ skip_async:
 		loops++;
 		if (wait_ordered && !trans) {
-			btrfs_wait_ordered_roots(root->fs_info, items);
+			btrfs_wait_ordered_roots(root->fs_info, items,
+						 0, (u64)-1);
 		} else {
 			time_left = schedule_timeout_killable(1);
 			if (time_left)
@@ -4911,7 +4984,7 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
  * @orig_bytes - the number of bytes we want
  * @flush - whether or not we can flush to make our reservation
 *
- * This will reserve orgi_bytes number of bytes from the space info associated
+ * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is 0 then no attempts to
@@ -5516,7 +5589,7 @@ void btrfs_orphan_release_metadata(struct inode *inode)
 * common file/directory operations, they change two fs/file trees
 * and root tree, the number of items that the qgroup reserves is
 * different with the free space reservation. So we can not use
- * the space reseravtion mechanism in start_transaction().
+ * the space reservation mechanism in start_transaction().
 */
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 				     struct btrfs_block_rsv *rsv,
@@ -5565,7 +5638,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 /**
 * drop_outstanding_extent - drop an outstanding extent
 * @inode: the inode we're dropping the extent for
- * @num_bytes: the number of bytes we're relaseing.
+ * @num_bytes: the number of bytes we're releasing.
 *
 * This is called when we are freeing up an outstanding extent, either called
 * after an error or after an extent is written.  This will return the number of
@@ -5591,7 +5664,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
 		drop_inode_space = 1;
 	/*
-	 * If we have more or the same amount of outsanding extents than we have
+	 * If we have more or the same amount of outstanding extents than we have
 	 * reserved then we need to leave the reserved extents count alone.
 	 */
 	if (BTRFS_I(inode)->outstanding_extents >=
@@ -5605,8 +5678,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
 }
 /**
- * calc_csum_metadata_size - return the amount of metada space that must be
- *	reserved/free'd for the given bytes.
+ * calc_csum_metadata_size - return the amount of metadata space that must be
+ *	reserved/freed for the given bytes.
 * @inode: the inode we're manipulating
 * @num_bytes: the number of bytes in question
 * @reserve: 1 if we are reserving space, 0 if we are freeing space
@@ -5758,7 +5831,7 @@ out_fail:
 		/*
 		 * This is tricky, but first we need to figure out how much we
-		 * free'd from any free-ers that occurred during this
+		 * freed from any free-ers that occurred during this
 		 * reservation, so we reset ->csum_bytes to the csum_bytes
 		 * before we dropped our lock, and then call the free for the
 		 * number of bytes that were freed while we were trying our
@@ -5780,7 +5853,7 @@ out_fail:
 		/*
 		 * Now reset ->csum_bytes to what it should be.  If bytes is
-		 * more than to_free then we would have free'd more space had we
+		 * more than to_free then we would have freed more space had we
 		 * not had an artificially high ->csum_bytes, so we need to free
 		 * the remainder.  If bytes is the same or less then we don't
 		 * need to do anything, the other free-ers did the correct
@@ -6172,6 +6245,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
 	return 0;
 }
+static void
+btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+	atomic_inc(&bg->reservations);
+}
+
+void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+					const u64 start)
+{
+	struct btrfs_block_group_cache *bg;
+
+	bg = btrfs_lookup_block_group(fs_info, start);
+	ASSERT(bg);
+	if (atomic_dec_and_test(&bg->reservations))
+		wake_up_atomic_t(&bg->reservations);
+	btrfs_put_block_group(bg);
+}
+
+static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
+{
+	schedule();
+	return 0;
+}
+
+void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
+{
+	struct btrfs_space_info *space_info = bg->space_info;
+
+	ASSERT(bg->ro);
+
+	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
+		return;
+
+	/*
+	 * Our block group is read only but before we set it to read only,
+	 * some task might have had allocated an extent from it already, but it
+	 * has not yet created a respective ordered extent (and added it to a
+	 * root's list of ordered extents).
+	 * Therefore wait for any task currently allocating extents, since the
+	 * block group's reservations counter is incremented while a read lock
+	 * on the groups' semaphore is held and decremented after releasing
+	 * the read access on that semaphore and creating the ordered extent.
+	 */
+	down_write(&space_info->groups_sem);
+	up_write(&space_info->groups_sem);
+
+	wait_on_atomic_t(&bg->reservations,
+			 btrfs_wait_bg_reservations_atomic_t,
+			 TASK_UNINTERRUPTIBLE);
+}
+
 /**
 * btrfs_update_reserved_bytes - update the block_group and space info counters
 * @cache:	The cache we are manipulating
@@ -6408,7 +6532,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			ret = btrfs_discard_extent(root, start,
 						   end + 1 - start, NULL);
-		clear_extent_dirty(unpin, start, end, GFP_NOFS);
+		clear_extent_dirty(unpin, start, end);
 		unpin_extent_range(root, start, end, true);
 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 		cond_resched();
@@ -7025,36 +7149,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
 		   int delalloc)
 {
 	struct btrfs_block_group_cache *used_bg = NULL;
-	bool locked = false;
-again:
+
 	spin_lock(&cluster->refill_lock);
-	if (locked) {
-		if (used_bg == cluster->block_group)
+	while (1) {
+		used_bg = cluster->block_group;
+		if (!used_bg)
+			return NULL;
+
+		if (used_bg == block_group)
 			return used_bg;
-		up_read(&used_bg->data_rwsem);
-		btrfs_put_block_group(used_bg);
-	}
+		btrfs_get_block_group(used_bg);
-	used_bg = cluster->block_group;
-	if (!used_bg)
-		return NULL;
+		if (!delalloc)
+			return used_bg;
-	if (used_bg == block_group)
-		return used_bg;
+		if (down_read_trylock(&used_bg->data_rwsem))
+			return used_bg;
-	btrfs_get_block_group(used_bg);
+		spin_unlock(&cluster->refill_lock);
-	if (!delalloc)
-		return used_bg;
+		down_read(&used_bg->data_rwsem);
-	if (down_read_trylock(&used_bg->data_rwsem))
-		return used_bg;
+		spin_lock(&cluster->refill_lock);
+		if (used_bg == cluster->block_group)
+			return used_bg;
-	spin_unlock(&cluster->refill_lock);
-	down_read(&used_bg->data_rwsem);
-	locked = true;
-	goto again;
+		up_read(&used_bg->data_rwsem);
+		btrfs_put_block_group(used_bg);
+	}
 }
 static inline void
@@ -7431,6 +7554,7 @@ checks:
 			btrfs_add_free_space(block_group, offset, num_bytes);
 			goto loop;
 		}
+		btrfs_inc_block_group_reservations(block_group);
 		/* we are all good, lets return */
 		ins->objectid = search_start;
@@ -7471,7 +7595,7 @@ loop:
 		if (loop == LOOP_CACHING_NOWAIT) {
 			/*
 			 * We want to skip the LOOP_CACHING_WAIT step if we
-			 * don't have any unached bgs and we've alrelady done a
+			 * don't have any uncached bgs and we've already done a
 			 * full search through.
 			 */
 			if (orig_have_caching_bg || !full_search)
@@ -7612,8 +7736,10 @@ again:
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
 			       flags, delalloc);
-
-	if (ret == -ENOSPC) {
+	if (!ret && !is_data) {
+		btrfs_dec_block_group_reservations(root->fs_info,
+						   ins->objectid);
+	} else if (ret == -ENOSPC) {
 		if (!final_tried && ins->offset) {
 			num_bytes = min(num_bytes >> 1, ins->offset);
 			num_bytes = round_down(num_bytes, root->sectorsize);
@@ -7873,7 +7999,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	/*
 	 * Mixed block groups will exclude before processing the log so we only
-	 * need to do the exlude dance if this fs isn't mixed.
+	 * need to do the exclude dance if this fs isn't mixed.
 	 */
 	if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
 		ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
@@ -7901,8 +8027,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	struct extent_buffer *buf;
 	buf = btrfs_find_create_tree_block(root, bytenr);
-	if (!buf)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(buf))
+		return buf;
+
 	btrfs_set_header_generation(buf, trans->transid);
 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
 	btrfs_tree_lock(buf);
@@ -7923,13 +8050,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					buf->start + buf->len - 1, GFP_NOFS);
 		else
 			set_extent_new(&root->dirty_log_pages, buf->start,
-					buf->start + buf->len - 1, GFP_NOFS);
+					buf->start + buf->len - 1);
 	} else {
 		buf->log_index = -1;
 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
 	}
-	trans->blocks_used++;
+	trans->dirty = true;
 	/* this returns a buffer locked for blocking */
 	return buf;
 }
@@ -8544,8 +8671,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	next = btrfs_find_tree_block(root->fs_info, bytenr);
 	if (!next) {
 		next = btrfs_find_create_tree_block(root, bytenr);
-		if (!next)
-			return -ENOMEM;
+		if (IS_ERR(next))
+			return PTR_ERR(next);
+
 		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
 					       level - 1);
 		reada = 1;
@@ -9058,7 +9186,7 @@ out:
 	if (!for_reloc && root_dropped == false)
 		btrfs_add_dead_root(root);
 	if (err && err != -EAGAIN)
-		btrfs_std_error(root->fs_info, err, NULL);
+		btrfs_handle_fs_error(root->fs_info, err, NULL);
 	return err;
 }
@@ -9317,7 +9445,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 	u64 free_bytes = 0;
 	int factor;
-	/* It's df, we don't care if it's racey */
+	/* It's df, we don't care if it's racy */
 	if (list_empty(&sinfo->ro_bgs))
 		return 0;
@@ -10526,14 +10654,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		 */
 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
 		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
-				  EXTENT_DIRTY, GFP_NOFS);
+				  EXTENT_DIRTY);
 		if (ret) {
 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 			btrfs_dec_block_group_ro(root, block_group);
 			goto end_trans;
 		}
 		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
-				  EXTENT_DIRTY, GFP_NOFS);
+				  EXTENT_DIRTY);
 		if (ret) {
 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 			btrfs_dec_block_group_ro(root, block_group);
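
Note on the nocow_writers counter introduced above: btrfs_inc_nocow_writers() refuses to count a writer once the block group is read-only, btrfs_dec_nocow_writers() drops the count and wakes waiters, and btrfs_wait_nocow_writers() sleeps until the count reaches zero. The fragment below is only an illustrative caller sketch, not code from this commit (the real callers live outside extent-tree.c and are not shown in this diff); example_nocow_write() and its -EAGAIN fallback are hypothetical names, assuming a context that attempts a NOCOW write against a known byte number.

static int example_nocow_write(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	int ret;

	/*
	 * Returns false if the block group is read-only (or not found), in
	 * which case the caller must fall back to an ordinary COW write.
	 */
	if (!btrfs_inc_nocow_writers(fs_info, bytenr))
		return -EAGAIN;

	/* ... perform the NOCOW write and create its ordered extent ... */
	ret = 0;

	/*
	 * Drop the counter only after the ordered extent exists; a task in
	 * btrfs_wait_nocow_writers() may be sleeping until it reaches zero,
	 * and this also releases the block group reference taken by the
	 * earlier btrfs_inc_nocow_writers() lookup.
	 */
	btrfs_dec_nocow_writers(fs_info, bytenr);
	return ret;
}

The inc/dec pairing is what makes the bg->ro check in btrfs_inc_nocow_writers() safe: once a block group is flipped read-only under its spinlock, no new NOCOW writers can be counted, so btrfs_wait_nocow_writers() only has to wait out the writers that were already accounted for.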