aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author YiPeng Chai <YiPeng.Chai@amd.com> 2023-05-16 17:34:17 +0800
committer Alex Deucher <alexander.deucher@amd.com> 2023-06-09 10:38:19 -0400
commit 6c47a79b3b8ba91faf89f9866da2ec16aac979e7 (patch)
tree 12ac37a38cfb500ad2a3ebb8b829a7335ac895a1
parent 5d0622705ff76e017b32cb763cbc7b00694f3b92 (diff)
drm/amdgpu: perform mode2 reset for sdma fed error on gfx v11_0_3
Perform a mode2 reset for SDMA FED errors on gfx v11_0_3.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 14
3 files changed, 25 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6bb438642cc0..f2da69adcd9d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2053,9 +2053,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
/* Perform full reset in fatal error mode */
if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
- else
+ else {
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
+ ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ reset_context.method = AMD_RESET_METHOD_MODE2;
+ }
+ }
+
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
}
atomic_set(&ras->in_recovery, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bc43f7db17cc..46bf1889a9d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -339,6 +339,8 @@ enum amdgpu_ras_ret {
#define AMDGPU_RAS_ERR_STATUS_VALID (1 << 1)
#define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2)
+#define AMDGPU_RAS_GPU_RESET_MODE2_RESET (0x1 << 0)
+
struct amdgpu_ras_err_status_reg_entry {
uint32_t hwip;
uint32_t ip_inst;
@@ -427,6 +429,9 @@ struct amdgpu_ras {
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
+
+ /* Record special requirements of gpu reset caller */
+ uint32_t gpu_reset_flags;
};
struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 068b9586a223..26d6286d86c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -84,8 +84,20 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
/* Workaround: when vmid and pasid are both zero, trigger gpu reset in KGD. */
if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
(entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
- !entry->vmid && !entry->pasid)
+ !entry->vmid && !entry->pasid) {
+ uint32_t rlc_status0 = 0;
+
+ rlc_status0 = RREG32_SOC15(GC, 0, regRLC_RLCS_FED_STATUS_0);
+
+ if (REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA0_FED_ERR) ||
+ REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, SDMA1_FED_ERR)) {
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+ ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ }
+
amdgpu_ras_reset_gpu(adev);
+ }
return 0;
}