aboutsummaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c338
1 files changed, 137 insertions, 201 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f1e9663b4051..c4a4e2fe6681 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -36,7 +36,10 @@
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
+#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
+#include <drm/drm_crtc_helper.h>
+#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
@@ -89,6 +92,8 @@ MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
#define AMDGPU_MAX_RETRY_LIMIT 2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
+static const struct drm_driver amdgpu_kms_driver;
+
const char *amdgpu_asic_name[] = {
"TAHITI",
"PITCAIRN",
@@ -159,7 +164,7 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
*
* The amdgpu driver provides a sysfs API for reporting the product name
* for the device
- * The file serial_number is used for this and returns the product name
+ * The file product_name is used for this and returns the product name
* as returned from the FRU.
* NOTE: This is only available for certain server cards
*/
@@ -181,7 +186,7 @@ static DEVICE_ATTR(product_name, S_IRUGO,
*
* The amdgpu driver provides a sysfs API for reporting the part number
* for the device
- * The file serial_number is used for this and returns the part number
+ * The file product_number is used for this and returns the part number
* as returned from the FRU.
* NOTE: This is only available for certain server cards
*/
@@ -923,32 +928,33 @@ static int amdgpu_device_asic_init(struct amdgpu_device *adev)
}
/**
- * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
+ * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
*
* @adev: amdgpu_device pointer
*
* Allocates a scratch page of VRAM for use by various things in the
* driver.
*/
-static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
+static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
- return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
- PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
- &adev->vram_scratch.robj,
- &adev->vram_scratch.gpu_addr,
- (void **)&adev->vram_scratch.ptr);
+ return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
+ AMDGPU_GEM_DOMAIN_VRAM |
+ AMDGPU_GEM_DOMAIN_GTT,
+ &adev->mem_scratch.robj,
+ &adev->mem_scratch.gpu_addr,
+ (void **)&adev->mem_scratch.ptr);
}
/**
- * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
+ * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
*
* @adev: amdgpu_device pointer
*
* Frees the VRAM scratch page.
*/
-static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
+static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
- amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
+ amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}
/**
@@ -1568,7 +1574,7 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
* @pdev: pci dev pointer
* @state: vga_switcheroo state
*
- * Callback for the switcheroo driver. Suspends or resumes the
+ * Callback for the switcheroo driver. Suspends or resumes
* the asics before or after it is powered up using ACPI methods.
*/
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
@@ -1915,6 +1921,16 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
}
}
+void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
+{
+ if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
+ adev->mode_info.num_crtc = 1;
+ adev->enable_virtual_display = true;
+ DRM_INFO("virtual_display:%d, num_crtc:%d\n",
+ adev->enable_virtual_display, adev->mode_info.num_crtc);
+ }
+}
+
/**
* amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
*
@@ -1970,17 +1986,10 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
}
snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
- err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
+ err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
if (err) {
dev_err(adev->dev,
- "Failed to load gpu_info firmware \"%s\"\n",
- fw_name);
- goto out;
- }
- err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
- if (err) {
- dev_err(adev->dev,
- "Failed to validate gpu_info firmware \"%s\"\n",
+ "Failed to get gpu_info firmware \"%s\"\n",
fw_name);
goto out;
}
@@ -2067,6 +2076,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
struct drm_device *dev = adev_to_drm(adev);
struct pci_dev *parent;
int i, r;
+ bool total;
amdgpu_device_enable_virtual_display(adev);
@@ -2150,6 +2160,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
+ total = true;
for (i = 0; i < adev->num_ip_blocks; i++) {
if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
DRM_ERROR("disabled ip block: %d <%s>\n",
@@ -2163,7 +2174,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
} else if (r) {
DRM_ERROR("early_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
- return r;
+ total = false;
} else {
adev->ip_blocks[i].status.valid = true;
}
@@ -2194,6 +2205,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
}
}
+ if (!total)
+ return -ENODEV;
adev->cg_flags &= amdgpu_cg_mask;
adev->pg_flags &= amdgpu_pg_mask;
@@ -2379,9 +2392,9 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
amdgpu_virt_exchange_data(adev);
- r = amdgpu_device_vram_scratch_init(adev);
+ r = amdgpu_device_mem_scratch_init(adev);
if (r) {
- DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
+ DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
goto init_failed;
}
r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
@@ -2397,10 +2410,11 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
adev->ip_blocks[i].status.hw = true;
/* right after GMC hw init, we create CSA */
- if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
+ if (amdgpu_mcbp) {
r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
- AMDGPU_GEM_DOMAIN_VRAM,
- AMDGPU_CSA_SIZE);
+ AMDGPU_GEM_DOMAIN_VRAM |
+ AMDGPU_GEM_DOMAIN_GTT,
+ AMDGPU_CSA_SIZE);
if (r) {
DRM_ERROR("allocate CSA failed %d\n", r);
goto init_failed;
@@ -2462,6 +2476,11 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
if (!amdgpu_sriov_vf(adev)) {
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ if (WARN_ON(!hive)) {
+ r = -ENOENT;
+ goto init_failed;
+ }
+
if (!hive->reset_domain ||
!amdgpu_reset_get_reset_domain(hive->reset_domain)) {
r = -ENOENT;
@@ -2565,9 +2584,10 @@ int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
if (!adev->ip_blocks[i].status.late_initialized)
continue;
- /* skip CG for GFX on S0ix */
+ /* skip CG for GFX, SDMA on S0ix */
if (adev->in_s0ix &&
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
+ (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
continue;
/* skip CG for VCE/UVD, it's handled specially */
if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
@@ -2601,9 +2621,10 @@ int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
if (!adev->ip_blocks[i].status.late_initialized)
continue;
- /* skip PG for GFX on S0ix */
+ /* skip PG for GFX, SDMA on S0ix */
if (adev->in_s0ix &&
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
+ (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
continue;
/* skip CG for VCE/UVD, it's handled specially */
if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
@@ -2855,7 +2876,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
amdgpu_ucode_free_bo(adev);
amdgpu_free_static_csa(&adev->virt.csa_obj);
amdgpu_device_wb_fini(adev);
- amdgpu_device_vram_scratch_fini(adev);
+ amdgpu_device_mem_scratch_fini(adev);
amdgpu_ib_pool_fini(adev);
}
@@ -3000,14 +3021,33 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
continue;
}
- /* skip suspend of gfx and psp for S0ix
+ /* skip suspend of gfx/mes and psp for S0ix
* gfx is in gfxoff state, so on resume it will exit gfxoff just
* like at runtime. PSP is also part of the always on hardware
* so no need to suspend it.
*/
if (adev->in_s0ix &&
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
+ continue;
+
+ /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
+ if (adev->in_s0ix &&
+ (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
+ (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
+ continue;
+
+ /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
+ * These are in TMR, hence are expected to be reused by PSP-TOS to reload
+ * from this location and RLC Autoload automatically also gets loaded
+ * from here based on PMFW -> PSP message during re-init sequence.
+ * Therefore, the psp suspend & resume should be skipped to avoid destroy
+ * the TMR and reload FWs again for IMU enabled APU ASICs.
+ */
+ if (amdgpu_in_reset(adev) &&
+ (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
continue;
/* XXX handle errors */
@@ -3210,15 +3250,6 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
return r;
}
adev->ip_blocks[i].status.hw = true;
-
- if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
- /* disable gfxoff for IP resume. The gfxoff will be re-enabled in
- * amdgpu_device_resume() after IP resume.
- */
- amdgpu_gfx_off_ctrl(adev, false);
- DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n");
- }
-
}
return 0;
@@ -3347,8 +3378,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
*/
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
{
- if (amdgpu_sriov_vf(adev) ||
- adev->enable_virtual_display ||
+ if (adev->enable_virtual_display ||
(adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
return false;
@@ -3671,6 +3701,11 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (r)
return r;
+ /* Get rid of things like offb */
+ r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
+ if (r)
+ return r;
+
/* Enable TMZ based on IP_VERSION */
amdgpu_gmc_tmz_set(adev);
@@ -3973,10 +4008,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
}
amdgpu_fence_driver_hw_fini(adev);
- if (adev->mman.initialized) {
- flush_delayed_work(&adev->mman.bdev.wq);
- ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
- }
+ if (adev->mman.initialized)
+ drain_workqueue(adev->mman.bdev.wq);
if (adev->pm_sysfs_en)
amdgpu_pm_sysfs_fini(adev);
@@ -3998,7 +4031,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
amdgpu_gart_dummy_page_fini(adev);
- amdgpu_device_unmap_mmio(adev);
+ if (drm_dev_is_unplugged(adev_to_drm(adev)))
+ amdgpu_device_unmap_mmio(adev);
}
@@ -4008,8 +4042,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
amdgpu_fence_driver_sw_fini(adev);
amdgpu_device_ip_fini(adev);
- release_firmware(adev->firmware.gpu_info_fw);
- adev->firmware.gpu_info_fw = NULL;
+ amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
adev->accel_working = false;
dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
@@ -4097,6 +4130,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
adev->in_suspend = true;
+ /* Evict the majority of BOs before grabbing the full access */
+ r = amdgpu_device_evict_resources(adev);
+ if (r)
+ return r;
+
if (amdgpu_sriov_vf(adev)) {
amdgpu_virt_fini_data_exchange(adev);
r = amdgpu_virt_request_full_gpu(adev, false);
@@ -4171,21 +4209,15 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
r = amdgpu_device_ip_resume(adev);
- /* no matter what r is, always need to properly release full GPU */
- if (amdgpu_sriov_vf(adev)) {
- amdgpu_virt_init_data_exchange(adev);
- amdgpu_virt_release_full_gpu(adev, true);
- }
-
if (r) {
dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
- return r;
+ goto exit;
}
amdgpu_fence_driver_hw_init(adev);
r = amdgpu_device_ip_late_init(adev);
if (r)
- return r;
+ goto exit;
queue_delayed_work(system_wq, &adev->delayed_init_work,
msecs_to_jiffies(AMDGPU_RESUME_MS));
@@ -4193,19 +4225,21 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
if (!adev->in_s0ix) {
r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
if (r)
- return r;
+ goto exit;
+ }
+
+exit:
+ if (amdgpu_sriov_vf(adev)) {
+ amdgpu_virt_init_data_exchange(adev);
+ amdgpu_virt_release_full_gpu(adev, true);
}
+ if (r)
+ return r;
+
/* Make sure IB tests flushed */
flush_delayed_work(&adev->delayed_init_work);
- if (adev->in_s0ix) {
- /* re-enable gfxoff after IP resume. This re-enables gfxoff after
- * it was disabled for IP resume in amdgpu_device_ip_resume_phase2().
- */
- amdgpu_gfx_off_ctrl(adev, true);
- DRM_DEBUG("will enable gfxoff for the mission mode\n");
- }
if (fbcon)
drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
@@ -4213,27 +4247,32 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
amdgpu_ras_resume(adev);
- /*
- * Most of the connector probing functions try to acquire runtime pm
- * refs to ensure that the GPU is powered on when connector polling is
- * performed. Since we're calling this from a runtime PM callback,
- * trying to acquire rpm refs will cause us to deadlock.
- *
- * Since we're guaranteed to be holding the rpm lock, it's safe to
- * temporarily disable the rpm helpers so this doesn't deadlock us.
- */
+ if (adev->mode_info.num_crtc) {
+ /*
+ * Most of the connector probing functions try to acquire runtime pm
+ * refs to ensure that the GPU is powered on when connector polling is
+ * performed. Since we're calling this from a runtime PM callback,
+ * trying to acquire rpm refs will cause us to deadlock.
+ *
+ * Since we're guaranteed to be holding the rpm lock, it's safe to
+ * temporarily disable the rpm helpers so this doesn't deadlock us.
+ */
#ifdef CONFIG_PM
- dev->dev->power.disable_depth++;
+ dev->dev->power.disable_depth++;
#endif
- if (!amdgpu_device_has_dc_support(adev))
- drm_helper_hpd_irq_event(dev);
- else
- drm_kms_helper_hotplug_event(dev);
+ if (!adev->dc_enabled)
+ drm_helper_hpd_irq_event(dev);
+ else
+ drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
- dev->dev->power.disable_depth--;
+ dev->dev->power.disable_depth--;
#endif
+ }
adev->in_suspend = false;
+ if (adev->enable_mes)
+ amdgpu_mes_self_test(adev);
+
if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
DRM_WARN("smart shift update failed\n");
@@ -4580,10 +4619,9 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
if (amdgpu_gpu_recovery == 0)
goto disabled;
- if (!amdgpu_device_ip_check_soft_reset(adev)) {
- dev_info(adev->dev,"Timeout, but no hardware hang detected.\n");
- return false;
- }
+ /* Skip soft reset check in fatal error mode */
+ if (!amdgpu_ras_is_poison_mode_supported(adev))
+ return true;
if (amdgpu_sriov_vf(adev))
return true;
@@ -4709,7 +4747,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (!need_full_reset)
need_full_reset = amdgpu_device_ip_need_full_reset(adev);
- if (!need_full_reset && amdgpu_gpu_recovery) {
+ if (!need_full_reset && amdgpu_gpu_recovery &&
+ amdgpu_device_ip_check_soft_reset(adev)) {
amdgpu_device_ip_pre_soft_reset(adev);
r = amdgpu_device_ip_soft_reset(adev);
amdgpu_device_ip_post_soft_reset(adev);
@@ -5027,6 +5066,8 @@ static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
pm_runtime_enable(&(p->dev));
pm_runtime_resume(&(p->dev));
}
+
+ pci_dev_put(p);
}
static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
@@ -5065,6 +5106,7 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
if (expires < ktime_get_mono_fast_ns()) {
dev_warn(adev->dev, "failed to suspend display audio\n");
+ pci_dev_put(p);
/* TODO: abort the succeeding gpu reset? */
return -ETIMEDOUT;
}
@@ -5072,97 +5114,10 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
pm_runtime_disable(&(p->dev));
+ pci_dev_put(p);
return 0;
}
-static void amdgpu_device_recheck_guilty_jobs(
- struct amdgpu_device *adev, struct list_head *device_list_handle,
- struct amdgpu_reset_context *reset_context)
-{
- int i, r = 0;
-
- for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
- struct amdgpu_ring *ring = adev->rings[i];
- int ret = 0;
- struct drm_sched_job *s_job;
-
- if (!ring || !ring->sched.thread)
- continue;
-
- s_job = list_first_entry_or_null(&ring->sched.pending_list,
- struct drm_sched_job, list);
- if (s_job == NULL)
- continue;
-
- /* clear job's guilty and depend the folowing step to decide the real one */
- drm_sched_reset_karma(s_job);
- drm_sched_resubmit_jobs_ext(&ring->sched, 1);
-
- if (!s_job->s_fence->parent) {
- DRM_WARN("Failed to get a HW fence for job!");
- continue;
- }
-
- ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
- if (ret == 0) { /* timeout */
- DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
- ring->sched.name, s_job->id);
-
-
- amdgpu_fence_driver_isr_toggle(adev, true);
-
- /* Clear this failed job from fence array */
- amdgpu_fence_driver_clear_job_fences(ring);
-
- amdgpu_fence_driver_isr_toggle(adev, false);
-
- /* Since the job won't signal and we go for
- * another resubmit drop this parent pointer
- */
- dma_fence_put(s_job->s_fence->parent);
- s_job->s_fence->parent = NULL;
-
- /* set guilty */
- drm_sched_increase_karma(s_job);
- amdgpu_reset_prepare_hwcontext(adev, reset_context);
-retry:
- /* do hw reset */
- if (amdgpu_sriov_vf(adev)) {
- amdgpu_virt_fini_data_exchange(adev);
- r = amdgpu_device_reset_sriov(adev, false);
- if (r)
- adev->asic_reset_res = r;
- } else {
- clear_bit(AMDGPU_SKIP_HW_RESET,
- &reset_context->flags);
- r = amdgpu_do_asic_reset(device_list_handle,
- reset_context);
- if (r && r == -EAGAIN)
- goto retry;
- }
-
- /*
- * add reset counter so that the following
- * resubmitted job could flush vmid
- */
- atomic_inc(&adev->gpu_reset_counter);
- continue;
- }
-
- /* got the hw fence, signal finished fence */
- atomic_dec(ring->sched.score);
- dma_fence_get(&s_job->s_fence->finished);
- dma_fence_signal(&s_job->s_fence->finished);
- dma_fence_put(&s_job->s_fence->finished);
-
- /* remove node from list and free the job */
- spin_lock(&ring->sched.job_list_lock);
- list_del_init(&s_job->list);
- spin_unlock(&ring->sched.job_list_lock);
- ring->sched.ops->free_job(s_job);
- }
-}
-
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -5183,7 +5138,6 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
}
-
/**
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
*
@@ -5206,7 +5160,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0;
bool need_emergency_restart = false;
bool audio_suspended = false;
- int tmp_vram_lost_counter;
bool gpu_reset_for_dev_remove = false;
gpu_reset_for_dev_remove =
@@ -5352,7 +5305,6 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
amdgpu_device_stop_pending_resets(tmp_adev);
}
- tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
/* Actual ASIC resets if needed.*/
/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
@@ -5377,29 +5329,13 @@ skip_hw_reset:
/* Post ASIC reset for all devs .*/
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
- /*
- * Sometimes a later bad compute job can block a good gfx job as gfx
- * and compute ring share internal GC HW mutually. We add an additional
- * guilty jobs recheck step to find the real guilty job, it synchronously
- * submits and pends for the first job being signaled. If it gets timeout,
- * we identify it as a real guilty job.
- */
- if (amdgpu_gpu_recovery == 2 &&
- !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
- amdgpu_device_recheck_guilty_jobs(
- tmp_adev, device_list_handle, reset_context);
-
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
if (!ring || !ring->sched.thread)
continue;
- /* No point to resubmit jobs if we didn't HW reset*/
- if (!tmp_adev->asic_reset_res && !job_signaled)
- drm_sched_resubmit_jobs(&ring->sched);
-
- drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
+ drm_sched_start(&ring->sched, true);
}
if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
@@ -5441,6 +5377,8 @@ skip_sched_resume:
amdgpu_device_resume_display_audio(tmp_adev);
amdgpu_device_unset_mp1_state(tmp_adev);
+
+ amdgpu_ras_set_error_query_ready(tmp_adev, true);
}
recover_end:
@@ -5852,8 +5790,6 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
if (!ring || !ring->sched.thread)
continue;
-
- drm_sched_resubmit_jobs(&ring->sched);
drm_sched_start(&ring->sched, true);
}
@@ -5938,8 +5874,8 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
int amdgpu_in_reset(struct amdgpu_device *adev)
{
return atomic_read(&adev->reset_domain->in_gpu_reset);
- }
-
+}
+
/**
* amdgpu_device_halt() - bring hardware to some kind of halt state
*