| author | Liu Bo <[email protected]> | 2015-07-17 16:49:19 +0800 | 
|---|---|---|
| committer | David Sterba <[email protected]> | 2016-02-23 13:10:10 +0100 | 
| commit | 73beece9ca07c003e0e4f4825b12be167334d4ad (patch) | |
| tree | b0575466fc5e0c7a493826963e7a4ffa49e31210 /fs/btrfs/dev-replace.c | |
| parent | 18558cae0272f8fd9647e69d3fec1565a7949865 (diff) | |
Btrfs: fix lockdep deadlock warning due to dev_replace
Xfstests btrfs/011 complains about a deadlock warning,
[ 1226.649039] =========================================================
[ 1226.649039] [ INFO: possible irq lock inversion dependency detected ]
[ 1226.649039] 4.1.0+ #270 Not tainted
[ 1226.649039] ---------------------------------------------------------
[ 1226.652955] kswapd0/46 just changed the state of lock:
[ 1226.652955]  (&delayed_node->mutex){+.+.-.}, at: [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0
[ 1226.652955] but this lock took another, RECLAIM_FS-unsafe lock in the past:
[ 1226.652955]  (&fs_info->dev_replace.lock){+.+.+.}
and interrupts could create inverse lock ordering between them.
[ 1226.652955]
other info that might help us debug this:
[ 1226.652955] Chain exists of:
  &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock
[ 1226.652955]  Possible interrupt unsafe locking scenario:
[ 1226.652955]        CPU0                    CPU1
[ 1226.652955]        ----                    ----
[ 1226.652955]   lock(&fs_info->dev_replace.lock);
[ 1226.652955]                                local_irq_disable();
[ 1226.652955]                                lock(&delayed_node->mutex);
[ 1226.652955]                                lock(&found->groups_sem);
[ 1226.652955]   <Interrupt>
[ 1226.652955]     lock(&delayed_node->mutex);
[ 1226.652955]
 *** DEADLOCK ***
Commit 084b6e7c7607 ("btrfs: Fix a lockdep warning when running xfstest.") tried
to fix a similar problem with exactly the same warning, but even with that fix we
still run into this one.
The above lock chain comes from
btrfs_commit_transaction
  ->btrfs_run_delayed_items
    ...
    ->__btrfs_update_delayed_inode
      ...
      ->__btrfs_cow_block
         ...
         ->find_free_extent
            ->cache_block_group
              ->load_free_space_cache
                ->btrfs_readpages
                  ->submit_one_bio
                    ...
                    ->__btrfs_map_block
                      ->btrfs_dev_replace_lock
However, under high memory pressure, a task that holds dev_replace.lock can be
interrupted by kswapd, and when kswapd then tries to release memory occupied by
the superblock, inodes and dentries, it may call evict_inode, which ends up in
[ 1226.652955]  [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0
[ 1226.652955]  [<ffffffff81459e74>] btrfs_remove_delayed_node+0x24/0x30
[ 1226.652955]  [<ffffffff8140c5fe>] btrfs_evict_inode+0x34e/0x700
delayed_node->mutex may be acquired in __btrfs_release_delayed_node(), which
leads to an ABBA deadlock.
To fix this, we can use the "blocking rwlock" scheme already used for
extent_buffer, though things are simpler here since we only need to turn the
read side's spinlock into a blocking lock.
With this, btrfs/011 no longer produces warnings in dmesg.
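
For illustration, the intended reader-side usage of the new API looks roughly like
the sketch below. This is only a sketch: the surrounding function is made up, and
the real call-site changes (e.g. in __btrfs_map_block) are outside the
dev-replace.c diff shown here.

/* hypothetical caller, for illustration only */
static void example_reader(struct btrfs_fs_info *fs_info)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	/* take the read side as a plain spinning rwlock */
	btrfs_dev_replace_lock(dev_replace, 0);

	/* ... read dev_replace state under the spinlock ... */

	/*
	 * Before doing anything that may sleep (allocation, I/O), switch to
	 * "blocking" mode: bump blocking_readers and drop the rwlock, so a
	 * reclaim path that preempts us can no longer deadlock on it.
	 */
	btrfs_dev_replace_set_lock_blocking(dev_replace);

	/* ... work that may sleep ... */

	/* reacquire the spinning read lock and drop the blocking count */
	btrfs_dev_replace_clear_lock_blocking(dev_replace);

	/* release the read side */
	btrfs_dev_replace_unlock(dev_replace, 0);
}

Writers simply wait in btrfs_dev_replace_lock(dev_replace, 1) until
blocking_readers drops back to zero, so they never spin against a sleeping reader.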
Signed-off-by: Liu Bo <[email protected]>
Signed-off-by: David Sterba <[email protected]>
Diffstat (limited to 'fs/btrfs/dev-replace.c')
| -rw-r--r-- | fs/btrfs/dev-replace.c | 130 |
|---|---|---|

1 file changed, 70 insertions, 60 deletions
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cbb7dbfb3fff..8c8b48971bc7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -202,13 +202,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 	struct btrfs_dev_replace_item *ptr;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 0);
 	if (!dev_replace->is_valid ||
 	    !dev_replace->item_needs_writeback) {
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 0);
 		return 0;
 	}
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 0);
 
 	key.objectid = 0;
 	key.type = BTRFS_DEV_REPLACE_KEY;
@@ -264,7 +264,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 	ptr = btrfs_item_ptr(eb, path->slots[0],
 			     struct btrfs_dev_replace_item);
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	if (dev_replace->srcdev)
 		btrfs_set_dev_replace_src_devid(eb, ptr,
 			dev_replace->srcdev->devid);
@@ -287,7 +287,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 	btrfs_set_dev_replace_cursor_right(eb, ptr,
 		dev_replace->cursor_right);
 	dev_replace->item_needs_writeback = 0;
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 
 	btrfs_mark_buffer_dirty(eb);
 
@@ -356,7 +356,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 		return PTR_ERR(trans);
 	}
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	dev_replace->is_valid = 1;
 	dev_replace->item_needs_writeback = 1;
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 
 	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
 	if (ret)
@@ -407,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
-		btrfs_dev_replace_lock(dev_replace);
+		btrfs_dev_replace_lock(dev_replace, 1);
 		goto leave;
 	}
 
@@ -433,7 +433,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 leave:
 	dev_replace->srcdev = NULL;
 	dev_replace->tgtdev = NULL;
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 	btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 	return ret;
 }
@@ -471,18 +471,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	/* don't allow cancel or unmount to disturb the finishing procedure */
 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 0);
 	/* was the operation canceled, or is it finished? */
 	if (dev_replace->replace_state !=
 	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 0);
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return 0;
 	}
 
 	tgt_device = dev_replace->tgtdev;
 	src_device = dev_replace->srcdev;
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 0);
 
 	/*
 	 * flush all outstanding I/O and inode extent mappings before the
@@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	/* keep away write_all_supers() during the finishing procedure */
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	mutex_lock(&root->fs_info->chunk_mutex);
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	dev_replace->replace_state =
 		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
 			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -528,7 +528,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 			        rcu_str_deref(src_device->name),
 			      src_device->devid,
 			      rcu_str_deref(tgt_device->name), scrub_ret);
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 1);
 		mutex_unlock(&root->fs_info->chunk_mutex);
 		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 		mutex_unlock(&uuid_mutex);
@@ -565,7 +565,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 	fs_info->fs_devices->rw_devices++;
 
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 
 	btrfs_rm_dev_replace_blocked(fs_info);
 
@@ -649,7 +649,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 	struct btrfs_device *srcdev;
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 0);
 	/* even if !dev_replace_is_valid, the values are good enough for
 	 * the replace_status ioctl */
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -675,7 +675,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 			div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
 		break;
 	}
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 0);
 }
 
 int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
@@ -698,13 +698,13 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 		return -EROFS;
 
 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 1);
 		goto leave;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
@@ -717,7 +717,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
 	dev_replace->time_stopped = get_seconds();
 	dev_replace->item_needs_writeback = 1;
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 	btrfs_scrub_cancel(fs_info);
 
 	trans = btrfs_start_transaction(root, 0);
@@ -740,7 +740,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -756,7 +756,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 		break;
 	}
 
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 }
 
@@ -766,12 +766,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 	struct task_struct *task;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 1);
 		return 0;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 		break;
@@ -784,10 +784,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 		btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
 		btrfs_info(fs_info,
 			"you may cancel the operation after 'mount -o degraded'");
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 1);
 		return 0;
 	}
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 
 	WARN_ON(atomic_xchg(
 		&fs_info->mutually_exclusive_operation_running, 1));
@@ -865,48 +865,58 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
 	return 1;
 }
 
-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
 {
-	/* the beginning is just an optimization for the typical case */
-	if (atomic_read(&dev_replace->nesting_level) == 0) {
-acquire_lock:
-		/* this is not a nested case where the same thread
-		 * is trying to acqurire the same lock twice */
-		mutex_lock(&dev_replace->lock);
-		mutex_lock(&dev_replace->lock_management_lock);
-		dev_replace->lock_owner = current->pid;
-		atomic_inc(&dev_replace->nesting_level);
-		mutex_unlock(&dev_replace->lock_management_lock);
-		return;
+	if (rw == 1) {
+		/* write */
+again:
+		wait_event(dev_replace->read_lock_wq,
+			   atomic_read(&dev_replace->blocking_readers) == 0);
+		write_lock(&dev_replace->lock);
+		if (atomic_read(&dev_replace->blocking_readers)) {
+			write_unlock(&dev_replace->lock);
+			goto again;
+		}
+	} else {
+		read_lock(&dev_replace->lock);
+		atomic_inc(&dev_replace->read_locks);
 	}
+}
 
-	mutex_lock(&dev_replace->lock_management_lock);
-	if (atomic_read(&dev_replace->nesting_level) > 0 &&
-	    dev_replace->lock_owner == current->pid) {
-		WARN_ON(!mutex_is_locked(&dev_replace->lock));
-		atomic_inc(&dev_replace->nesting_level);
-		mutex_unlock(&dev_replace->lock_management_lock);
-		return;
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
+{
+	if (rw == 1) {
+		/* write */
+		ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
+		write_unlock(&dev_replace->lock);
+	} else {
+		ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+		atomic_dec(&dev_replace->read_locks);
+		read_unlock(&dev_replace->lock);
 	}
+}
 
-	mutex_unlock(&dev_replace->lock_management_lock);
-	goto acquire_lock;
+/* inc blocking cnt and release read lock */
+void btrfs_dev_replace_set_lock_blocking(
+					struct btrfs_dev_replace *dev_replace)
+{
+	/* only set blocking for read lock */
+	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+	atomic_inc(&dev_replace->blocking_readers);
+	read_unlock(&dev_replace->lock);
 }
 
-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
+/* acquire read lock and dec blocking cnt */
+void btrfs_dev_replace_clear_lock_blocking(
+					struct btrfs_dev_replace *dev_replace)
 {
-	WARN_ON(!mutex_is_locked(&dev_replace->lock));
-	mutex_lock(&dev_replace->lock_management_lock);
-	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
-	WARN_ON(dev_replace->lock_owner != current->pid);
-	atomic_dec(&dev_replace->nesting_level);
-	if (atomic_read(&dev_replace->nesting_level) == 0) {
-		dev_replace->lock_owner = 0;
-		mutex_unlock(&dev_replace->lock_management_lock);
-		mutex_unlock(&dev_replace->lock);
-	} else {
-		mutex_unlock(&dev_replace->lock_management_lock);
-	}
+	/* only set blocking for read lock */
+	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+	ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
+	read_lock(&dev_replace->lock);
+	if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
+	    waitqueue_active(&dev_replace->read_lock_wq))
+		wake_up(&dev_replace->read_lock_wq);
 }
 
 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
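
The diffstat above is limited to fs/btrfs/dev-replace.c, so the new synchronization
fields themselves are not visible here. Inferred from the calls in the diff (a
sketch, not part of this patch view), struct btrfs_dev_replace in ctree.h would
carry something like:

struct btrfs_dev_replace {
	/* ... existing fields such as replace_state, srcdev, tgtdev ... */

	rwlock_t lock;			/* replaces the old nesting mutex */
	atomic_t read_locks;		/* readers currently holding the read side */
	atomic_t blocking_readers;	/* readers that went blocking and dropped ->lock */
	wait_queue_head_t read_lock_wq;	/* writers wait here for blocking_readers == 0 */

	struct mutex lock_finishing_cancel_unmount;
	/* ... */
};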