Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c'):
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 338
1 file changed, 137 insertions(+), 201 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f1e9663b4051..c4a4e2fe6681 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -36,7 +36,10 @@
 #include <generated/utsrelease.h>
 #include <linux/pci-p2pdma.h>
 
+#include <drm/drm_aperture.h>
 #include <drm/drm_atomic_helper.h>
+#include <drm/drm_crtc_helper.h>
+#include <drm/drm_fb_helper.h>
 #include <drm/drm_probe_helper.h>
 #include <drm/amdgpu_drm.h>
 #include <linux/vgaarb.h>
@@ -89,6 +92,8 @@ MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
 #define AMDGPU_MAX_RETRY_LIMIT		2
 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
 
+static const struct drm_driver amdgpu_kms_driver;
+
 const char *amdgpu_asic_name[] = {
 	"TAHITI",
 	"PITCAIRN",
@@ -159,7 +164,7 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
  *
  * The amdgpu driver provides a sysfs API for reporting the product name
  * for the device
- * The file serial_number is used for this and returns the product name
+ * The file product_name is used for this and returns the product name
  * as returned from the FRU.
  * NOTE: This is only available for certain server cards
  */
@@ -181,7 +186,7 @@ static DEVICE_ATTR(product_name, S_IRUGO,
  *
  * The amdgpu driver provides a sysfs API for reporting the part number
  * for the device
- * The file serial_number is used for this and returns the part number
+ * The file product_number is used for this and returns the part number
  * as returned from the FRU.
  * NOTE: This is only available for certain server cards
  */
@@ -923,32 +928,33 @@ static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 }
 
 /**
- * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
+ * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
  *
  * @adev: amdgpu_device pointer
  *
  * Allocates a scratch page of VRAM for use by various things in the
  * driver.
  */
-static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
+static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
 {
-	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
-				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
-				       &adev->vram_scratch.robj,
-				       &adev->vram_scratch.gpu_addr,
-				       (void **)&adev->vram_scratch.ptr);
+	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
+				       AMDGPU_GEM_DOMAIN_VRAM |
+				       AMDGPU_GEM_DOMAIN_GTT,
+				       &adev->mem_scratch.robj,
+				       &adev->mem_scratch.gpu_addr,
+				       (void **)&adev->mem_scratch.ptr);
 }
 
 /**
- * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
+ * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
  *
  * @adev: amdgpu_device pointer
  *
  * Frees the VRAM scratch page.
  */
-static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
+static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
 {
-	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
+	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
 }
 
 /**
@@ -1568,7 +1574,7 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
  * @pdev: pci dev pointer
  * @state: vga_switcheroo state
  *
- * Callback for the switcheroo driver.  Suspends or resumes the
+ * Callback for the switcheroo driver.  Suspends or resumes
  * the asics before or after it is powered up using ACPI methods.
 */
 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
@@ -1915,6 +1921,16 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
 	}
 }
 
+void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
+{
+	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
+		adev->mode_info.num_crtc = 1;
+		adev->enable_virtual_display = true;
+		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
+			 adev->enable_virtual_display, adev->mode_info.num_crtc);
+	}
+}
+
 /**
  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
  *
@@ -1970,17 +1986,10 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
 	}
 
 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
-	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
+	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
 	if (err) {
 		dev_err(adev->dev,
-			"Failed to load gpu_info firmware \"%s\"\n",
-			fw_name);
-		goto out;
-	}
-	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
-	if (err) {
-		dev_err(adev->dev,
-			"Failed to validate gpu_info firmware \"%s\"\n",
+			"Failed to get gpu_info firmware \"%s\"\n",
 			fw_name);
 		goto out;
 	}
@@ -2067,6 +2076,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 	struct drm_device *dev = adev_to_drm(adev);
 	struct pci_dev *parent;
 	int i, r;
+	bool total;
 
 	amdgpu_device_enable_virtual_display(adev);
 
@@ -2150,6 +2160,7 @@
 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
 
+	total = true;
 	for (i = 0; i < adev->num_ip_blocks; i++) {
 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
 			DRM_ERROR("disabled ip block: %d <%s>\n",
@@ -2163,7 +2174,7 @@
 				} else if (r) {
 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
 						  adev->ip_blocks[i].version->funcs->name, r);
-					return r;
+					total = false;
 				} else {
 					adev->ip_blocks[i].status.valid = true;
 				}
@@ -2194,6 +2205,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 		}
 	}
+	if (!total)
+		return -ENODEV;
 
 	adev->cg_flags &= amdgpu_cg_mask;
 	adev->pg_flags &= amdgpu_pg_mask;
 
@@ -2379,9 +2392,9 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 			if (amdgpu_sriov_vf(adev))
 				amdgpu_virt_exchange_data(adev);
 
-			r = amdgpu_device_vram_scratch_init(adev);
+			r = amdgpu_device_mem_scratch_init(adev);
 			if (r) {
-				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
+				DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
 				goto init_failed;
 			}
 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
@@ -2397,10 +2410,11 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 			adev->ip_blocks[i].status.hw = true;
 
 			/* right after GMC hw init, we create CSA */
-			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
+			if (amdgpu_mcbp) {
 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
-								AMDGPU_GEM_DOMAIN_VRAM,
-								AMDGPU_CSA_SIZE);
+							       AMDGPU_GEM_DOMAIN_VRAM |
+							       AMDGPU_GEM_DOMAIN_GTT,
+							       AMDGPU_CSA_SIZE);
 				if (r) {
 					DRM_ERROR("allocate CSA failed %d\n", r);
 					goto init_failed;
@@ -2462,6 +2476,11 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 			if (!amdgpu_sriov_vf(adev)) {
 				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
+				if (WARN_ON(!hive)) {
+					r = -ENOENT;
+					goto init_failed;
+				}
+
 				if (!hive->reset_domain ||
 				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
 					r = -ENOENT;
@@ -2565,9 +2584,10 @@ int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
 		if (!adev->ip_blocks[i].status.late_initialized)
 			continue;
-		/* skip CG for GFX on S0ix */
+		/* skip CG for GFX, SDMA on S0ix */
 		if (adev->in_s0ix &&
-		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
+		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
+		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
 			continue;
 		/* skip CG for VCE/UVD, it's handled specially */
 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
@@ -2601,9 +2621,10 @@ int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
 		if (!adev->ip_blocks[i].status.late_initialized)
 			continue;
-		/* skip PG for GFX on S0ix */
+		/* skip PG for GFX, SDMA on S0ix */
 		if (adev->in_s0ix &&
-		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
+		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
+		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
 			continue;
 		/* skip CG for VCE/UVD, it's handled specially */
 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
@@ -2855,7 +2876,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
 			amdgpu_ucode_free_bo(adev);
 			amdgpu_free_static_csa(&adev->virt.csa_obj);
 			amdgpu_device_wb_fini(adev);
-			amdgpu_device_vram_scratch_fini(adev);
+			amdgpu_device_mem_scratch_fini(adev);
 			amdgpu_ib_pool_fini(adev);
 		}
 
@@ -3000,14 +3021,33 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
 			continue;
 		}
 
-		/* skip suspend of gfx and psp for S0ix
+		/* skip suspend of gfx/mes and psp for S0ix
 		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
 		 * like at runtime. PSP is also part of the always on hardware
 		 * so no need to suspend it.
 		 */
 		if (adev->in_s0ix &&
 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
-		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
+		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
+		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
+			continue;
+
+		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
+		if (adev->in_s0ix &&
+		    (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
+		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
+			continue;
+
+		/* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
+		 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
+		 * from this location and RLC Autoload automatically also gets loaded
+		 * from here based on PMFW -> PSP message during re-init sequence.
+		 * Therefore, the psp suspend & resume should be skipped to avoid destroy
+		 * the TMR and reload FWs again for IMU enabled APU ASICs.
+		 */
+		if (amdgpu_in_reset(adev) &&
+		    (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
+		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
 			continue;
 
 		/* XXX handle errors */
@@ -3210,15 +3250,6 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
 			return r;
 		}
 		adev->ip_blocks[i].status.hw = true;
-
-		if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
-			/* disable gfxoff for IP resume. The gfxoff will be re-enabled in
-			 * amdgpu_device_resume() after IP resume.
-			 */
-			amdgpu_gfx_off_ctrl(adev, false);
-			DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n");
-		}
-
 	}
 
 	return 0;
@@ -3347,8 +3378,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
  */
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
 {
-	if (amdgpu_sriov_vf(adev) ||
-	    adev->enable_virtual_display ||
+	if (adev->enable_virtual_display ||
 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
 		return false;
 
@@ -3671,6 +3701,11 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (r)
 		return r;
 
+	/* Get rid of things like offb */
+	r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
+	if (r)
+		return r;
+
 	/* Enable TMZ based on IP_VERSION */
 	amdgpu_gmc_tmz_set(adev);
 
@@ -3973,10 +4008,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 	}
 	amdgpu_fence_driver_hw_fini(adev);
 
-	if (adev->mman.initialized) {
-		flush_delayed_work(&adev->mman.bdev.wq);
-		ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
-	}
+	if (adev->mman.initialized)
+		drain_workqueue(adev->mman.bdev.wq);
 
 	if (adev->pm_sysfs_en)
 		amdgpu_pm_sysfs_fini(adev);
@@ -3998,7 +4031,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 
 	amdgpu_gart_dummy_page_fini(adev);
 
-	amdgpu_device_unmap_mmio(adev);
+	if (drm_dev_is_unplugged(adev_to_drm(adev)))
+		amdgpu_device_unmap_mmio(adev);
 
 }
 
@@ -4008,8 +4042,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
 	amdgpu_fence_driver_sw_fini(adev);
 	amdgpu_device_ip_fini(adev);
-	release_firmware(adev->firmware.gpu_info_fw);
-	adev->firmware.gpu_info_fw = NULL;
+	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
 	adev->accel_working = false;
 	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
@@ -4097,6 +4130,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 
 	adev->in_suspend = true;
 
+	/* Evict the majority of BOs before grabbing the full access */
+	r = amdgpu_device_evict_resources(adev);
+	if (r)
+		return r;
+
 	if (amdgpu_sriov_vf(adev)) {
 		amdgpu_virt_fini_data_exchange(adev);
 		r = amdgpu_virt_request_full_gpu(adev, false);
@@ -4171,21 +4209,15 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 
 	r = amdgpu_device_ip_resume(adev);
 
-	/* no matter what r is, always need to properly release full GPU */
-	if (amdgpu_sriov_vf(adev)) {
-		amdgpu_virt_init_data_exchange(adev);
-		amdgpu_virt_release_full_gpu(adev, true);
-	}
-
 	if (r) {
 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
-		return r;
+		goto exit;
 	}
 	amdgpu_fence_driver_hw_init(adev);
 
 	r = amdgpu_device_ip_late_init(adev);
 	if (r)
-		return r;
+		goto exit;
 
 	queue_delayed_work(system_wq, &adev->delayed_init_work,
 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
@@ -4193,19 +4225,21 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 	if (!adev->in_s0ix) {
 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
 		if (r)
-			return r;
+			goto exit;
+	}
+
+exit:
+	if (amdgpu_sriov_vf(adev)) {
+		amdgpu_virt_init_data_exchange(adev);
+		amdgpu_virt_release_full_gpu(adev, true);
 	}
+
+	if (r)
+		return r;
+
 	/* Make sure IB tests flushed */
 	flush_delayed_work(&adev->delayed_init_work);
 
-	if (adev->in_s0ix) {
-		/* re-enable gfxoff after IP resume. This re-enables gfxoff after
-		 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2().
-		 */
-		amdgpu_gfx_off_ctrl(adev, true);
-		DRM_DEBUG("will enable gfxoff for the mission mode\n");
-	}
 	if (fbcon)
 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
@@ -4213,27 +4247,32 @@
 
 	amdgpu_ras_resume(adev);
 
-	/*
-	 * Most of the connector probing functions try to acquire runtime pm
-	 * refs to ensure that the GPU is powered on when connector polling is
-	 * performed. Since we're calling this from a runtime PM callback,
-	 * trying to acquire rpm refs will cause us to deadlock.
-	 *
-	 * Since we're guaranteed to be holding the rpm lock, it's safe to
-	 * temporarily disable the rpm helpers so this doesn't deadlock us.
-	 */
+	if (adev->mode_info.num_crtc) {
+		/*
+		 * Most of the connector probing functions try to acquire runtime pm
+		 * refs to ensure that the GPU is powered on when connector polling is
+		 * performed. Since we're calling this from a runtime PM callback,
+		 * trying to acquire rpm refs will cause us to deadlock.
+		 *
+		 * Since we're guaranteed to be holding the rpm lock, it's safe to
+		 * temporarily disable the rpm helpers so this doesn't deadlock us.
+		 */
 #ifdef CONFIG_PM
-	dev->dev->power.disable_depth++;
+		dev->dev->power.disable_depth++;
 #endif
-	if (!amdgpu_device_has_dc_support(adev))
-		drm_helper_hpd_irq_event(dev);
-	else
-		drm_kms_helper_hotplug_event(dev);
+		if (!adev->dc_enabled)
+			drm_helper_hpd_irq_event(dev);
+		else
+			drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
-	dev->dev->power.disable_depth--;
+		dev->dev->power.disable_depth--;
 #endif
+	}
 	adev->in_suspend = false;
 
+	if (adev->enable_mes)
+		amdgpu_mes_self_test(adev);
+
 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
 		DRM_WARN("smart shift update failed\n");
 
@@ -4580,10 +4619,9 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
 	if (amdgpu_gpu_recovery == 0)
 		goto disabled;
 
-	if (!amdgpu_device_ip_check_soft_reset(adev)) {
-		dev_info(adev->dev,"Timeout, but no hardware hang detected.\n");
-		return false;
-	}
+	/* Skip soft reset check in fatal error mode */
+	if (!amdgpu_ras_is_poison_mode_supported(adev))
+		return true;
 
 	if (amdgpu_sriov_vf(adev))
 		return true;
@@ -4709,7 +4747,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		if (!need_full_reset)
 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
 
-		if (!need_full_reset && amdgpu_gpu_recovery) {
+		if (!need_full_reset && amdgpu_gpu_recovery &&
+		    amdgpu_device_ip_check_soft_reset(adev)) {
 			amdgpu_device_ip_pre_soft_reset(adev);
 			r = amdgpu_device_ip_soft_reset(adev);
 			amdgpu_device_ip_post_soft_reset(adev);
@@ -5027,6 +5066,8 @@ static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 		pm_runtime_enable(&(p->dev));
 		pm_runtime_resume(&(p->dev));
 	}
+
+	pci_dev_put(p);
 }
 
 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
@@ -5065,6 +5106,7 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
 		if (expires < ktime_get_mono_fast_ns()) {
 			dev_warn(adev->dev,
"failed to suspend display audio\n"); +			pci_dev_put(p);  			/* TODO: abort the succeeding gpu reset? */  			return -ETIMEDOUT;  		} @@ -5072,97 +5114,10 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)  	pm_runtime_disable(&(p->dev)); +	pci_dev_put(p);  	return 0;  } -static void amdgpu_device_recheck_guilty_jobs( -	struct amdgpu_device *adev, struct list_head *device_list_handle, -	struct amdgpu_reset_context *reset_context) -{ -	int i, r = 0; - -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { -		struct amdgpu_ring *ring = adev->rings[i]; -		int ret = 0; -		struct drm_sched_job *s_job; - -		if (!ring || !ring->sched.thread) -			continue; - -		s_job = list_first_entry_or_null(&ring->sched.pending_list, -				struct drm_sched_job, list); -		if (s_job == NULL) -			continue; - -		/* clear job's guilty and depend the folowing step to decide the real one */ -		drm_sched_reset_karma(s_job); -		drm_sched_resubmit_jobs_ext(&ring->sched, 1); - -		if (!s_job->s_fence->parent) { -			DRM_WARN("Failed to get a HW fence for job!"); -			continue; -		} - -		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); -		if (ret == 0) { /* timeout */ -			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", -						ring->sched.name, s_job->id); - - -			amdgpu_fence_driver_isr_toggle(adev, true); - -			/* Clear this failed job from fence array */ -			amdgpu_fence_driver_clear_job_fences(ring); - -			amdgpu_fence_driver_isr_toggle(adev, false); - -			/* Since the job won't signal and we go for -			 * another resubmit drop this parent pointer -			 */ -			dma_fence_put(s_job->s_fence->parent); -			s_job->s_fence->parent = NULL; - -			/* set guilty */ -			drm_sched_increase_karma(s_job); -			amdgpu_reset_prepare_hwcontext(adev, reset_context); -retry: -			/* do hw reset */ -			if (amdgpu_sriov_vf(adev)) { -				amdgpu_virt_fini_data_exchange(adev); -				r = amdgpu_device_reset_sriov(adev, false); -				if (r) -					adev->asic_reset_res = r; -			} else { -				clear_bit(AMDGPU_SKIP_HW_RESET, -					  &reset_context->flags); -				r = amdgpu_do_asic_reset(device_list_handle, -							 reset_context); -				if (r && r == -EAGAIN) -					goto retry; -			} - -			/* -			 * add reset counter so that the following -			 * resubmitted job could flush vmid -			 */ -			atomic_inc(&adev->gpu_reset_counter); -			continue; -		} - -		/* got the hw fence, signal finished fence */ -		atomic_dec(ring->sched.score); -		dma_fence_get(&s_job->s_fence->finished); -		dma_fence_signal(&s_job->s_fence->finished); -		dma_fence_put(&s_job->s_fence->finished); - -		/* remove node from list and free the job */ -		spin_lock(&ring->sched.job_list_lock); -		list_del_init(&s_job->list); -		spin_unlock(&ring->sched.job_list_lock); -		ring->sched.ops->free_job(s_job); -	} -} -  static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -5183,7 +5138,6 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)  } -  /**   * amdgpu_device_gpu_recover - reset the asic and recover scheduler   * @@ -5206,7 +5160,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,  	int i, r = 0;  	bool need_emergency_restart = false;  	bool audio_suspended = false; -	int tmp_vram_lost_counter;  	bool gpu_reset_for_dev_remove = false;  	gpu_reset_for_dev_remove = @@ -5352,7 +5305,6 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. 
@@ -5352,7 +5305,6 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 		amdgpu_device_stop_pending_resets(tmp_adev);
 	}
 
-	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
 	/* Actual ASIC resets if needed.*/
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
@@ -5377,29 +5329,13 @@ skip_hw_reset:
 
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 
-		/*
-		 * Sometimes a later bad compute job can block a good gfx job as gfx
-		 * and compute ring share internal GC HW mutually. We add an additional
-		 * guilty jobs recheck step to find the real guilty job, it synchronously
-		 * submits and pends for the first job being signaled. If it gets timeout,
-		 * we identify it as a real guilty job.
-		 */
-		if (amdgpu_gpu_recovery == 2 &&
-			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
-			amdgpu_device_recheck_guilty_jobs(
-				tmp_adev, device_list_handle, reset_context);
-
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
 			if (!ring || !ring->sched.thread)
 				continue;
 
-			/* No point to resubmit jobs if we didn't HW reset*/
-			if (!tmp_adev->asic_reset_res && !job_signaled)
-				drm_sched_resubmit_jobs(&ring->sched);
-
-			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
+			drm_sched_start(&ring->sched, true);
 		}
 
 		if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
@@ -5441,6 +5377,8 @@ skip_sched_resume:
 			amdgpu_device_resume_display_audio(tmp_adev);
 
 		amdgpu_device_unset_mp1_state(tmp_adev);
+
+		amdgpu_ras_set_error_query_ready(tmp_adev, true);
 	}
 
 recover_end:
@@ -5852,8 +5790,6 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
 		if (!ring || !ring->sched.thread)
 			continue;
 
-
-		drm_sched_resubmit_jobs(&ring->sched);
 		drm_sched_start(&ring->sched, true);
 	}
 
@@ -5938,8 +5874,8 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
 int amdgpu_in_reset(struct amdgpu_device *adev)
 {
 	return atomic_read(&adev->reset_domain->in_gpu_reset);
-	}
-	
+}
+
 /**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
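
Notes on selected hunks:

The gpu_info hunks fold the open-coded request_firmware() + amdgpu_ucode_validate() pair into a single amdgpu_ucode_request() call, and amdgpu_device_fini_sw() trades release_firmware() plus manual NULLing for amdgpu_ucode_release(). A minimal sketch of what such a helper pair looks like; only the signatures are taken from the calls visible in the hunks, the bodies are a hedged reconstruction of the sequence the old code spelled out inline and may differ from the real amdgpu_ucode.c:

	#include <linux/firmware.h>

	int amdgpu_ucode_request(struct amdgpu_device *adev,
				 const struct firmware **fw, const char *fw_name)
	{
		int err;

		err = request_firmware(fw, fw_name, adev->dev);
		if (err)
			return err;

		err = amdgpu_ucode_validate(*fw);
		if (err) {
			release_firmware(*fw);
			*fw = NULL;	/* callers never see a stale pointer */
		}
		return err;
	}

	void amdgpu_ucode_release(const struct firmware **fw)
	{
		release_firmware(*fw);	/* release_firmware(NULL) is a no-op */
		*fw = NULL;		/* makes a double release harmless */
	}

With this shape every load site gets one call and one error message ("Failed to get ..."), and teardown no longer has to remember to clear the pointer by hand.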
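The display-audio hunks add pci_dev_put() on every exit path of amdgpu_device_suspend_display_audio() and amdgpu_device_resume_display_audio(). The rule being enforced: a struct pci_dev returned by a pci_get_*() lookup carries a reference the caller owns and must drop. A sketch of the pattern, assuming the audio function was found with pci_get_domain_bus_and_slot() (the lookup itself is outside the hunks shown, so that call and the helper name are illustrative):

	#include <linux/pci.h>
	#include <linux/pm_runtime.h>

	static int suspend_audio_of(struct pci_dev *gpu)
	{
		struct pci_dev *p;

		/* Assumption: function 1 of the GPU's slot is the HDA controller. */
		p = pci_get_domain_bus_and_slot(pci_domain_nr(gpu->bus),
						gpu->bus->number,
						PCI_DEVFN(PCI_SLOT(gpu->devfn), 1));
		if (!p)
			return -ENODEV;

		if (!pm_runtime_suspended(&p->dev)) {
			pci_dev_put(p);		/* the early exit must drop the ref too */
			return -ETIMEDOUT;
		}

		pm_runtime_disable(&p->dev);
		pci_dev_put(p);			/* balanced on the success path */
		return 0;
	}

Before this diff, both the timeout path and the normal return leaked that reference.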
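The early-init hunks change amdgpu_device_ip_early_init() from fail-fast to scan-everything: an early_init error is logged and remembered in the local `total` flag, iteration continues, and only at the end does a single -ENODEV report that some IP block failed, so every broken block shows up in the log of one probe. A standalone sketch of that scan-then-fail pattern (names here are illustrative, not driver API):

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Stand-in for an IP block's early_init hook; pretend block 2 is broken. */
	static int early_init_block(int i)
	{
		return (i == 2) ? -EINVAL : 0;
	}

	static int early_init_all(int nblocks)
	{
		bool total = true;
		int i;

		for (i = 0; i < nblocks; i++) {
			int r = early_init_block(i);

			if (r) {
				fprintf(stderr, "early_init of block %d failed %d\n", i, r);
				total = false;	/* remember the failure, keep scanning */
			}
		}

		return total ? 0 : -ENODEV;	/* one combined verdict at the end */
	}

	int main(void)
	{
		return early_init_all(5) ? 1 : 0;
	}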