Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 149
1 file changed, 126 insertions, 23 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index be7aff2d4a57..ab8f970b2849 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2459,19 +2459,21 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 	 */
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
 		if (amdgpu_xgmi_add_device(adev) == 0) {
-			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+			if (!amdgpu_sriov_vf(adev)) {
+				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
-			if (!hive->reset_domain ||
-			    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
-				r = -ENOENT;
+				if (!hive->reset_domain ||
+				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+					r = -ENOENT;
+					amdgpu_put_xgmi_hive(hive);
+					goto init_failed;
+				}
+
+				/* Drop the early temporary reset domain we created for device */
+				amdgpu_reset_put_reset_domain(adev->reset_domain);
+				adev->reset_domain = hive->reset_domain;
 				amdgpu_put_xgmi_hive(hive);
-				goto init_failed;
 			}
-
-			/* Drop the early temporary reset domain we created for device */
-			amdgpu_reset_put_reset_domain(adev->reset_domain);
-			adev->reset_domain = hive->reset_domain;
-			amdgpu_put_xgmi_hive(hive);
 		}
 	}
 
@@ -3152,7 +3154,8 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
 			continue;
 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
-		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
+		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
 			r = adev->ip_blocks[i].version->funcs->resume(adev);
 			if (r) {
@@ -3509,6 +3512,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	adev->gmc.gart_size = 512 * 1024 * 1024;
 	adev->accel_working = false;
 	adev->num_rings = 0;
+	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
 	adev->mman.buffer_funcs = NULL;
 	adev->mman.buffer_funcs_ring = NULL;
 	adev->vm_manager.vm_pte_funcs = NULL;
@@ -3587,6 +3591,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
 
 	adev->gfx.gfx_off_req_count = 1;
+	adev->gfx.gfx_off_residency = 0;
+	adev->gfx.gfx_off_entrycount = 0;
 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
 
 	atomic_set(&adev->throttling_logging_enabled, 1);
@@ -3975,8 +3981,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 
 	amdgpu_gart_dummy_page_fini(adev);
 
-	if (drm_dev_is_unplugged(adev_to_drm(adev)))
-		amdgpu_device_unmap_mmio(adev);
+	amdgpu_device_unmap_mmio(adev);
 
 }
 
@@ -3989,6 +3994,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
 	release_firmware(adev->firmware.gpu_info_fw);
 	adev->firmware.gpu_info_fw = NULL;
 	adev->accel_working = false;
+	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
 
 	amdgpu_reset_fini(adev);
 
@@ -4064,12 +4070,20 @@ static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 {
 	struct amdgpu_device *adev = drm_to_adev(dev);
+	int r = 0;
 
 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
 		return 0;
 
 	adev->in_suspend = true;
 
+	if (amdgpu_sriov_vf(adev)) {
+		amdgpu_virt_fini_data_exchange(adev);
+		r = amdgpu_virt_request_full_gpu(adev, false);
+		if (r)
+			return r;
+	}
+
 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
 		DRM_WARN("smart shift update failed\n");
 
@@ -4093,6 +4107,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 
 	amdgpu_device_ip_suspend_phase2(adev);
 
+	if (amdgpu_sriov_vf(adev))
+		amdgpu_virt_release_full_gpu(adev, false);
+
 	return 0;
 }
 
@@ -4111,6 +4128,12 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 	struct amdgpu_device *adev = drm_to_adev(dev);
 	int r = 0;
 
+	if (amdgpu_sriov_vf(adev)) {
+		r = amdgpu_virt_request_full_gpu(adev, true);
+		if (r)
+			return r;
+	}
+
 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
 		return 0;
 
@@ -4125,6 +4148,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 	}
 
 	r = amdgpu_device_ip_resume(adev);
+
+	/* no matter what r is, always need to properly release full GPU */
+	if (amdgpu_sriov_vf(adev)) {
+		amdgpu_virt_init_data_exchange(adev);
+		amdgpu_virt_release_full_gpu(adev, true);
+	}
+
 	if (r) {
 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
 		return r;
@@ -4517,14 +4547,15 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
  */
 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
 {
-	if (!amdgpu_device_ip_check_soft_reset(adev)) {
-		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
-		return false;
-	}
 
 	if (amdgpu_gpu_recovery == 0)
 		goto disabled;
 
+	if (!amdgpu_device_ip_check_soft_reset(adev)) {
+		dev_info(adev->dev,"Timeout, but no hardware hang detected.\n");
+		return false;
+	}
+
 	if (amdgpu_sriov_vf(adev))
 		return true;
 
@@ -4649,7 +4680,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		if (!need_full_reset)
 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
 
-		if (!need_full_reset) {
+		if (!need_full_reset && amdgpu_gpu_recovery) {
 			amdgpu_device_ip_pre_soft_reset(adev);
 			r = amdgpu_device_ip_soft_reset(adev);
 			amdgpu_device_ip_post_soft_reset(adev);
@@ -4745,6 +4776,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset, skip_hw_reset, vram_lost = false;
 	int r = 0;
+	bool gpu_reset_for_dev_remove = 0;
 
 	/* Try reset handler method first */
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -4764,6 +4796,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
 
+	gpu_reset_for_dev_remove =
+		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
+
 	/*
 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
 	 * to allow proper links negotiation in FW (within 1 sec)
@@ -4808,6 +4844,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 		amdgpu_ras_intr_cleared();
 	}
 
+	/* Since the mode1 reset affects base ip blocks, the
+	 * phase1 ip blocks need to be resumed. Otherwise there
+	 * will be a BIOS signature error and the psp bootloader
+	 * can't load kdb on the next amdgpu install.
+	 */
+	if (gpu_reset_for_dev_remove) {
+		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+			amdgpu_device_ip_resume_phase1(tmp_adev);
+
+		goto end;
+	}
+
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 		if (need_full_reset) {
 			/* post card */
@@ -5047,6 +5095,7 @@ static void amdgpu_device_recheck_guilty_jobs(
 
 			/* set guilty */
 			drm_sched_increase_karma(s_job);
+			amdgpu_reset_prepare_hwcontext(adev, reset_context);
 retry:
 			/* do hw reset */
 			if (amdgpu_sriov_vf(adev)) {
@@ -5129,6 +5178,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
 	int tmp_vram_lost_counter;
+	bool gpu_reset_for_dev_remove = false;
+
+	gpu_reset_for_dev_remove =
+			test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+				test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 
 	/*
 	 * Special case: RAS triggered and full reset isn't supported
@@ -5156,6 +5210,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	reset_context->job = job;
 	reset_context->hive = hive;
+
 	/*
 	 * Build list of devices to reset.
 	 * In case we are in XGMI hive mode, resort the device list
@@ -5163,8 +5218,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 */
 	INIT_LIST_HEAD(&device_list);
 	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
-		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			list_add_tail(&tmp_adev->reset_list, &device_list);
+			if (gpu_reset_for_dev_remove && adev->shutdown)
+				tmp_adev->shutdown = true;
+		}
 		if (!list_is_first(&adev->reset_list, &device_list))
 			list_rotate_to_front(&adev->reset_list, &device_list);
 		device_list_handle = &device_list;
@@ -5247,6 +5305,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+		if (gpu_reset_for_dev_remove) {
+			/* Workaroud for ASICs need to disable SMC first */
+			amdgpu_device_smu_fini_early(tmp_adev);
+		}
 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
 		/*TODO Should we stop ?*/
 		if (r) {
@@ -5275,8 +5337,14 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 			amdgpu_ras_resume(adev);
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
-		if (r && r == -EAGAIN)
+		if (r && r == -EAGAIN) {
+			set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
+			adev->asic_reset_res = 0;
 			goto retry;
+		}
+
+		if (!r && gpu_reset_for_dev_remove)
+			goto recover_end;
 	}
 
skip_hw_reset:
@@ -5350,6 +5418,7 @@ skip_sched_resume:
 		amdgpu_device_unset_mp1_state(tmp_adev);
 	}
 
+recover_end:
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
 					    reset_list);
 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
@@ -5532,9 +5601,9 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
 		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
 	resource_size_t aper_limit =
 		adev->gmc.aper_base + adev->gmc.aper_size - 1;
-	bool p2p_access = !adev->gmc.xgmi.connected_to_cpu &&
-			  !(pci_p2pdma_distance_many(adev->pdev,
-					&peer_adev->dev, 1, true) < 0);
+	bool p2p_access =
+		!adev->gmc.xgmi.connected_to_cpu &&
+		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
 
 	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
 		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
@@ -5708,6 +5777,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 	reset_context.reset_req_dev = adev;
 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
+	set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 	adev->no_hw_access = true;
 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
@@ -5917,3 +5987,36 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
 	(void)RREG32(data);
 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 }
+
+/**
+ * amdgpu_device_switch_gang - switch to a new gang
+ * @adev: amdgpu_device pointer
+ * @gang: the gang to switch to
+ *
+ * Try to switch to a new gang.
+ * Returns: NULL if we switched to the new gang or a reference to the current
+ * gang leader.
+ */
+struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
+					    struct dma_fence *gang)
+{
+	struct dma_fence *old = NULL;
+
+	do {
+		dma_fence_put(old);
+		rcu_read_lock();
+		old = dma_fence_get_rcu_safe(&adev->gang_submit);
+		rcu_read_unlock();
+
+		if (old == gang)
+			break;
+
+		if (!dma_fence_is_signaled(old))
+			return old;
+
+	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
+			 old, gang) != old);
+
+	dma_fence_put(old);
+	return NULL;
+}
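
Note on the new gang_submit plumbing above (the RCU_INIT_POINTER() in amdgpu_device_init(), the dma_fence_put() in amdgpu_device_fini_sw(), and amdgpu_device_switch_gang() itself): the sketch below shows one way a submission path could consume the function's return value. It is illustrative only; the my_prepare_gang_submit() helper and the my_gang_job layout are invented for the example and are not part of this patch.

/* Illustrative sketch only -- not code from this patch. */
#include <linux/dma-fence.h>
#include "amdgpu.h"

struct my_gang_job {
	struct amdgpu_device *adev;
	struct dma_fence *gang_submit;	/* leader fence of the gang this job belongs to */
};

static struct dma_fence *my_prepare_gang_submit(struct my_gang_job *job)
{
	if (!job->gang_submit)
		return NULL;

	/*
	 * Try to make this job's gang the device-wide gang. A NULL return
	 * means the switch happened (or the gang was already current) and
	 * the job may run. A non-NULL return is a reference to the previous,
	 * still-unsignaled gang leader; the caller should treat it as a
	 * dependency, dma_fence_put() it once waited on, and then retry.
	 */
	return amdgpu_device_switch_gang(job->adev, job->gang_submit);
}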