aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/umc_v12_0.c18
2 files changed, 23 insertions, 1 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 64bee125f17a..d0307c55da50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2934,8 +2934,12 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
struct ras_err_data err_data;
unsigned long err_cnt;
- if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev))
+ /* If gpu reset is ongoing, delay retiring the bad pages */
+ if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
+ amdgpu_ras_schedule_retirement_dwork(con,
+ AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3);
return;
+ }
amdgpu_ras_error_data_init(&err_data);
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 0faa21d8a7b4..9dbb13adb661 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -29,6 +29,7 @@
#include "mp/mp_13_0_6_sh_mask.h"
#define MAX_ECC_NUM_PER_RETIREMENT 32
+#define DELAYED_TIME_FOR_GPU_RESET 1000 //ms
static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
uint32_t node_inst,
@@ -568,6 +569,23 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
con->umc_ecc_log.de_queried_count++;
+ /* The problem case is as follows:
+ * 1. GPU A triggers a gpu ras reset, and GPU A drives
+ * GPU B to also perform a gpu ras reset.
+ * 2. After gpu B ras reset started, gpu B queried a DE
+ * data. Since the DE data was queried in the ras reset
+ * thread instead of the page retirement thread, bad
+ * page retirement work would not be triggered. Then
+ * even if all gpu resets are completed, the bad pages
+ * will be cached in RAM until GPU B's bad page retirement
+ * work is triggered again and then saved to eeprom.
+ * Trigger delayed work to save the bad pages to eeprom in time
+ * after gpu ras reset is completed.
+ */
+ if (amdgpu_ras_in_recovery(adev))
+ schedule_delayed_work(&con->page_retirement_dwork,
+ msecs_to_jiffies(DELAYED_TIME_FOR_GPU_RESET));
+
return 0;
}