Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 296
 1 file changed, 144 insertions(+), 152 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ab8f970b2849..afe6af9c0138 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -37,6 +37,7 @@
 #include <linux/pci-p2pdma.h>
 
 #include <drm/drm_atomic_helper.h>
+#include <drm/drm_fb_helper.h>
 #include <drm/drm_probe_helper.h>
 #include <drm/amdgpu_drm.h>
 #include <linux/vgaarb.h>
@@ -1568,7 +1569,7 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
  * @pdev: pci dev pointer
  * @state: vga_switcheroo state
  *
- * Callback for the switcheroo driver.  Suspends or resumes the
+ * Callback for the switcheroo driver.  Suspends or resumes
  * the asics before or after it is powered up using ACPI methods.
  */
 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
@@ -1915,6 +1916,16 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
 	}
 }
 
+void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
+{
+	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
+		adev->mode_info.num_crtc = 1;
+		adev->enable_virtual_display = true;
+		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
+			 adev->enable_virtual_display, adev->mode_info.num_crtc);
+	}
+}
+
 /**
  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
  *
@@ -2397,7 +2408,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 			adev->ip_blocks[i].status.hw = true;
 
 			/* right after GMC hw init, we create CSA */
-			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
+			if (amdgpu_mcbp) {
 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
 								AMDGPU_GEM_DOMAIN_VRAM,
 								AMDGPU_CSA_SIZE);
@@ -2462,6 +2473,11 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 			if (!amdgpu_sriov_vf(adev)) {
 				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
+				if (WARN_ON(!hive)) {
+					r = -ENOENT;
+					goto init_failed;
+				}
+
 				if (!hive->reset_domain ||
 				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
 					r = -ENOENT;
@@ -2928,6 +2944,14 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
 
+	/*
+	 * Per PMFW team's suggestion, driver needs to handle gfxoff
+	 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
+	 * scenario. Add the missing df cstate disablement here.
+	 */
+	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
+		dev_warn(adev->dev, "Failed to disallow df cstate");
+
 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
 		if (!adev->ip_blocks[i].status.valid)
 			continue;
@@ -2992,14 +3016,15 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
 			continue;
 		}
 
-		/* skip suspend of gfx and psp for S0ix
+		/* skip suspend of gfx/mes and psp for S0ix
 		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
 		 * like at runtime. PSP is also part of the always on hardware
 		 * so no need to suspend it.
 		 */
 		if (adev->in_s0ix &&
 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
-		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
+		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
+		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
 			continue;
 
 		/* XXX handle errors */
@@ -3202,6 +3227,15 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
 			return r;
 		}
 		adev->ip_blocks[i].status.hw = true;
+
+		if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
+			/* disable gfxoff for IP resume. The gfxoff will be re-enabled in
+			 * amdgpu_device_resume() after IP resume.
+			 */
+			amdgpu_gfx_off_ctrl(adev, false);
+			DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n");
+		}
+
 	}
 
 	return 0;
@@ -3330,8 +3364,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
  */
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
 {
-	if (amdgpu_sriov_vf(adev) ||
-	    adev->enable_virtual_display ||
+	if (adev->enable_virtual_display ||
 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
 		return false;
 
@@ -4043,15 +4076,18 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
  * at suspend time.
  *
  */
-static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
+static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
 {
+	int ret;
+
 	/* No need to evict vram on APUs for suspend to ram or s2idle */
 	if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
-		return;
+		return 0;
 
-	if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
+	ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
+	if (ret)
 		DRM_WARN("evicting device resources failed\n");
-
+	return ret;
 }
 
 /*
@@ -4077,6 +4113,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 
 	adev->in_suspend = true;
 
+	/* Evict the majority of BOs before grabbing the full access */
+	r = amdgpu_device_evict_resources(adev);
+	if (r)
+		return r;
+
 	if (amdgpu_sriov_vf(adev)) {
 		amdgpu_virt_fini_data_exchange(adev);
 		r = amdgpu_virt_request_full_gpu(adev, false);
@@ -4101,7 +4142,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 	if (!adev->in_s0ix)
 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
 
-	amdgpu_device_evict_resources(adev);
+	r = amdgpu_device_evict_resources(adev);
+	if (r)
+		return r;
 
 	amdgpu_fence_driver_hw_fini(adev);
 
@@ -4149,21 +4192,15 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 
 	r = amdgpu_device_ip_resume(adev);
 
-	/* no matter what r is, always need to properly release full GPU */
-	if (amdgpu_sriov_vf(adev)) {
-		amdgpu_virt_init_data_exchange(adev);
-		amdgpu_virt_release_full_gpu(adev, true);
-	}
-
 	if (r) {
 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
-		return r;
+		goto exit;
 	}
 	amdgpu_fence_driver_hw_init(adev);
 
 	r = amdgpu_device_ip_late_init(adev);
 	if (r)
-		return r;
+		goto exit;
 
 	queue_delayed_work(system_wq, &adev->delayed_init_work,
 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
@@ -4171,12 +4208,28 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 	if (!adev->in_s0ix) {
 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
 		if (r)
-			return r;
+			goto exit;
 	}
 
+exit:
+	if (amdgpu_sriov_vf(adev)) {
+		amdgpu_virt_init_data_exchange(adev);
+		amdgpu_virt_release_full_gpu(adev, true);
+	}
+
+	if (r)
+		return r;
+
 	/* Make sure IB tests flushed */
 	flush_delayed_work(&adev->delayed_init_work);
 
+	if (adev->in_s0ix) {
+		/* re-enable gfxoff after IP resume. This re-enables gfxoff after
+		 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2().
+		 */
+		amdgpu_gfx_off_ctrl(adev, true);
+		DRM_DEBUG("will enable gfxoff for the mission mode\n");
+	}
 	if (fbcon)
 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
 
@@ -4184,25 +4237,27 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 
 	amdgpu_ras_resume(adev);
 
-	/*
-	 * Most of the connector probing functions try to acquire runtime pm
-	 * refs to ensure that the GPU is powered on when connector polling is
-	 * performed. Since we're calling this from a runtime PM callback,
-	 * trying to acquire rpm refs will cause us to deadlock.
-	 *
-	 * Since we're guaranteed to be holding the rpm lock, it's safe to
-	 * temporarily disable the rpm helpers so this doesn't deadlock us.
-	 */
+	if (adev->mode_info.num_crtc) {
+		/*
+		 * Most of the connector probing functions try to acquire runtime pm
+		 * refs to ensure that the GPU is powered on when connector polling is
+		 * performed. Since we're calling this from a runtime PM callback,
+		 * trying to acquire rpm refs will cause us to deadlock.
+		 *
+		 * Since we're guaranteed to be holding the rpm lock, it's safe to
+		 * temporarily disable the rpm helpers so this doesn't deadlock us.
+		 */
 #ifdef CONFIG_PM
-	dev->dev->power.disable_depth++;
+		dev->dev->power.disable_depth++;
 #endif
-	if (!amdgpu_device_has_dc_support(adev))
-		drm_helper_hpd_irq_event(dev);
-	else
-		drm_kms_helper_hotplug_event(dev);
+		if (!adev->dc_enabled)
+			drm_helper_hpd_irq_event(dev);
+		else
+			drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
-	dev->dev->power.disable_depth--;
+		dev->dev->power.disable_depth--;
 #endif
+	}
 
 	adev->in_suspend = false;
 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
@@ -4551,6 +4606,10 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
 	if (amdgpu_gpu_recovery == 0)
 		goto disabled;
 
+	/* Skip soft reset check in fatal error mode */
+	if (!amdgpu_ras_is_poison_mode_supported(adev))
+		return true;
+
 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
 		dev_info(adev->dev,"Timeout, but no hardware hang detected.\n");
 		return false;
@@ -4998,6 +5057,8 @@ static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 		pm_runtime_enable(&(p->dev));
 		pm_runtime_resume(&(p->dev));
 	}
+
+	pci_dev_put(p);
 }
 
 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
@@ -5036,6 +5097,7 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 		if (expires < ktime_get_mono_fast_ns()) {
 			dev_warn(adev->dev, "failed to suspend display audio\n");
+			pci_dev_put(p);
 			/* TODO: abort the succeeding gpu reset? */
 			return -ETIMEDOUT;
 		}
@@ -5043,97 +5105,10 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 
 	pm_runtime_disable(&(p->dev));
 
+	pci_dev_put(p);
 	return 0;
 }
 
-static void amdgpu_device_recheck_guilty_jobs(
-	struct amdgpu_device *adev, struct list_head *device_list_handle,
-	struct amdgpu_reset_context *reset_context)
-{
-	int i, r = 0;
-
-	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-		struct amdgpu_ring *ring = adev->rings[i];
-		int ret = 0;
-		struct drm_sched_job *s_job;
-
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		s_job = list_first_entry_or_null(&ring->sched.pending_list,
-				struct drm_sched_job, list);
-		if (s_job == NULL)
-			continue;
-
-		/* clear job's guilty and depend the folowing step to decide the real one */
-		drm_sched_reset_karma(s_job);
-		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
-
-		if (!s_job->s_fence->parent) {
-			DRM_WARN("Failed to get a HW fence for job!");
-			continue;
-		}
-
-		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
-		if (ret == 0) { /* timeout */
-			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
-						ring->sched.name, s_job->id);
-
-
-			amdgpu_fence_driver_isr_toggle(adev, true);
-
-			/* Clear this failed job from fence array */
-			amdgpu_fence_driver_clear_job_fences(ring);
-
-			amdgpu_fence_driver_isr_toggle(adev, false);
-
-			/* Since the job won't signal and we go for
-			 * another resubmit drop this parent pointer
-			 */
-			dma_fence_put(s_job->s_fence->parent);
-			s_job->s_fence->parent = NULL;
-
-			/* set guilty */
-			drm_sched_increase_karma(s_job);
-			amdgpu_reset_prepare_hwcontext(adev, reset_context);
-retry:
-			/* do hw reset */
-			if (amdgpu_sriov_vf(adev)) {
-				amdgpu_virt_fini_data_exchange(adev);
-				r = amdgpu_device_reset_sriov(adev, false);
-				if (r)
-					adev->asic_reset_res = r;
-			} else {
-				clear_bit(AMDGPU_SKIP_HW_RESET,
-					  &reset_context->flags);
-				r = amdgpu_do_asic_reset(device_list_handle,
-							 reset_context);
-				if (r && r == -EAGAIN)
-					goto retry;
-			}
-
-			/*
-			 * add reset counter so that the following
-			 * resubmitted job could flush vmid
-			 */
-			atomic_inc(&adev->gpu_reset_counter);
-			continue;
-		}
-
-		/* got the hw fence, signal finished fence */
-		atomic_dec(ring->sched.score);
-		dma_fence_get(&s_job->s_fence->finished);
-		dma_fence_signal(&s_job->s_fence->finished);
-		dma_fence_put(&s_job->s_fence->finished);
-
-		/* remove node from list and free the job */
-		spin_lock(&ring->sched.job_list_lock);
-		list_del_init(&s_job->list);
-		spin_unlock(&ring->sched.job_list_lock);
-		ring->sched.ops->free_job(s_job);
-	}
-}
-
 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -5154,7 +5129,6 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 }
 
-
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -5177,7 +5151,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
-	int tmp_vram_lost_counter;
 	bool gpu_reset_for_dev_remove = false;
 
 	gpu_reset_for_dev_remove =
@@ -5210,7 +5183,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	reset_context->job = job;
 	reset_context->hive = hive;
-
 
 	/*
 	 * Build list of devices to reset.
 	 * In case we are in XGMI hive mode, resort the device list
@@ -5324,7 +5296,6 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 		amdgpu_device_stop_pending_resets(tmp_adev);
 	}
 
-	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
 	/* Actual ASIC resets if needed.*/
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
@@ -5337,11 +5308,8 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 			amdgpu_ras_resume(adev);
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
-		if (r && r == -EAGAIN) {
-			set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
-			adev->asic_reset_res = 0;
+		if (r && r == -EAGAIN)
 			goto retry;
-		}
 
 		if (!r && gpu_reset_for_dev_remove)
 			goto recover_end;
@@ -5352,32 +5320,16 @@ skip_hw_reset:
 
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-		/*
-		 * Sometimes a later bad compute job can block a good gfx job as gfx
-		 * and compute ring share internal GC HW mutually. We add an additional
-		 * guilty jobs recheck step to find the real guilty job, it synchronously
-		 * submits and pends for the first job being signaled. If it gets timeout,
-		 * we identify it as a real guilty job.
-		 */
-		if (amdgpu_gpu_recovery == 2 &&
-			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
-			amdgpu_device_recheck_guilty_jobs(
-				tmp_adev, device_list_handle, reset_context);
-
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
 			if (!ring || !ring->sched.thread)
 				continue;
 
-			/* No point to resubmit jobs if we didn't HW reset*/
-			if (!tmp_adev->asic_reset_res && !job_signaled)
-				drm_sched_resubmit_jobs(&ring->sched);
-
-			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
+			drm_sched_start(&ring->sched, true);
 		}
 
-		if (adev->enable_mes)
+		if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
 			amdgpu_mes_self_test(tmp_adev);
 
 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
@@ -5416,6 +5368,8 @@ skip_sched_resume:
 			amdgpu_device_resume_display_audio(tmp_adev);
 
 		amdgpu_device_unset_mp1_state(tmp_adev);
+
+		amdgpu_ras_set_error_query_ready(tmp_adev, true);
 	}
 
 recover_end:
@@ -5777,7 +5731,6 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 	reset_context.reset_req_dev = adev;
 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
-	set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 	adev->no_hw_access = true;
 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
@@ -5828,8 +5781,6 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
 		if (!ring || !ring->sched.thread)
 			continue;
 
-
-		drm_sched_resubmit_jobs(&ring->sched);
 		drm_sched_start(&ring->sched, true);
 	}
 
@@ -6020,3 +5971,44 @@ struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
 	dma_fence_put(old);
 	return NULL;
 }
+
+bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
+{
+	switch (adev->asic_type) {
+#ifdef CONFIG_DRM_AMDGPU_SI
+	case CHIP_HAINAN:
+#endif
+	case CHIP_TOPAZ:
+		/* chips with no display hardware */
+		return false;
+#ifdef CONFIG_DRM_AMDGPU_SI
+	case CHIP_TAHITI:
+	case CHIP_PITCAIRN:
+	case CHIP_VERDE:
+	case CHIP_OLAND:
+#endif
+#ifdef CONFIG_DRM_AMDGPU_CIK
+	case CHIP_BONAIRE:
+	case CHIP_HAWAII:
+	case CHIP_KAVERI:
+	case CHIP_KABINI:
+	case CHIP_MULLINS:
+#endif
+	case CHIP_TONGA:
+	case CHIP_FIJI:
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
+	case CHIP_POLARIS12:
+	case CHIP_VEGAM:
+	case CHIP_CARRIZO:
+	case CHIP_STONEY:
+		/* chips with display hardware */
+		return true;
+	default:
+		/* IP discovery */
+		if (!adev->ip_versions[DCE_HWIP][0] ||
+		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
+			return false;
+		return true;
+	}
+}