aboutsummaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
author: YiPeng Chai <YiPeng.Chai@amd.com> 2024-06-24 11:21:06 +0800
committer: Alex Deucher <alexander.deucher@amd.com> 2024-06-27 17:31:20 -0400
commit: 78146c1dcd220ae98fd5f4114f992299fc5ee161 (patch)
tree: aa6b5806f7d4b7c9d61c16b0d8bbdb6119fb7ef9 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent: 29b6985de50b6bf4de77aa680b875a4362d7b30d (diff)
drm/amdgpu: add variable to record the deferred error number read by driver
Add variable to record the deferred error number read by driver.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 62
1 files changed, 44 insertions, 18 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 68e9935028db..f67ce3684aa7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -120,7 +120,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
-#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
@@ -2799,7 +2799,8 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
- ecc_log->de_updated = false;
+ ecc_log->de_queried_count = 0;
+ ecc_log->prev_de_queried_count = 0;
}
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
@@ -2818,7 +2819,8 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
mutex_unlock(&ecc_log->lock);
mutex_destroy(&ecc_log->lock);
- ecc_log->de_updated = false;
+ ecc_log->de_queried_count = 0;
+ ecc_log->prev_de_queried_count = 0;
}
static void amdgpu_ras_do_page_retirement(struct work_struct *work)
@@ -2850,40 +2852,64 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
mutex_unlock(&con->umc_ecc_log.lock);
}
-static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
- uint32_t timeout_ms)
+static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+ uint32_t poison_creation_count)
{
int ret = 0;
struct ras_ecc_log_info *ecc_log;
struct ras_query_if info;
- uint32_t timeout = timeout_ms;
+ uint32_t timeout = 0;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ uint64_t de_queried_count;
+ uint32_t new_detect_count, total_detect_count;
+ uint32_t need_query_count = poison_creation_count;
+ bool query_data_timeout = false;
memset(&info, 0, sizeof(info));
info.head.block = AMDGPU_RAS_BLOCK__UMC;
ecc_log = &ras->umc_ecc_log;
- ecc_log->de_updated = false;
+ total_detect_count = 0;
do {
ret = amdgpu_ras_query_error_status(adev, &info);
- if (ret) {
- dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret);
- return;
+ if (ret)
+ return ret;
+
+ de_queried_count = ecc_log->de_queried_count;
+ if (de_queried_count > ecc_log->prev_de_queried_count) {
+ new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
+ ecc_log->prev_de_queried_count = de_queried_count;
+ timeout = 0;
+ } else {
+ new_detect_count = 0;
}
- if (timeout && !ecc_log->de_updated) {
- msleep(1);
- timeout--;
+ if (new_detect_count) {
+ total_detect_count += new_detect_count;
+ } else {
+ if (!timeout && need_query_count)
+ timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
+
+ if (timeout) {
+ if (!--timeout) {
+ query_data_timeout = true;
+ break;
+ }
+ msleep(1);
+ }
}
- } while (timeout && !ecc_log->de_updated);
+ } while (total_detect_count < need_query_count);
- if (timeout_ms && !timeout) {
- dev_warn(adev->dev, "Can't find deferred error\n");
- return;
+ if (query_data_timeout) {
+ dev_warn(adev->dev, "Can't find deferred error! count: %u\n",
+ (need_query_count - total_detect_count));
+ return -ENOENT;
}
- if (!ret)
+ if (total_detect_count)
schedule_delayed_work(&ras->page_retirement_dwork, 0);
+
+ return 0;
}
static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,