diff options
author | Tao Zhou <tao.zhou1@amd.com> | 2021-09-17 18:40:57 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2021-09-28 09:30:07 -0400 |
commit | f524dd54a78924b59acd8f251788889129b3a2e9 (patch) | |
tree | eaa3ec8ca04b047618f753c9926345755e9d1ecc /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | e43488493cbb46e862f83c66887f3e6cb854c6f0 (diff) |
drm/amdgpu: skip umc ras irq handling in poison mode (v2)
In ras poison mode, a umc uncorrectable error will be ignored until
the corrupted data is consumed by another ras module (such as gfx, sdma).
v2: update the debug message and replace dev_warn with dev_info.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 34 |
1 files changed, 20 insertions, 14 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4c547eee5702..8243f79a7c4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1544,22 +1544,28 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) data->rptr = (data->aligned_element_size + data->rptr) % data->ring_size; - /* Let IP handle its data, maybe we need get the output - * from the callback to udpate the error type/count, etc - */ if (data->cb) { - ret = data->cb(obj->adev, &err_data, &entry); - /* ue will trigger an interrupt, and in that case - * we need do a reset to recovery the whole system. - * But leave IP do that recovery, here we just dispatch - * the error. - */ - if (ret == AMDGPU_RAS_SUCCESS) { - /* these counts could be left as 0 if - * some blocks do not count error number + if (amdgpu_ras_is_poison_mode_supported(obj->adev) && + obj->head.block == AMDGPU_RAS_BLOCK__UMC) + dev_info(obj->adev->dev, + "Poison is created, no user action is needed.\n"); + else { + /* Let IP handle its data, maybe we need get the output + * from the callback to udpate the error type/count, etc + */ + ret = data->cb(obj->adev, &err_data, &entry); + /* ue will trigger an interrupt, and in that case + * we need do a reset to recovery the whole system. + * But leave IP do that recovery, here we just dispatch + * the error. */ - obj->err_data.ue_count += err_data.ue_count; - obj->err_data.ce_count += err_data.ce_count; + if (ret == AMDGPU_RAS_SUCCESS) { + /* these counts could be left as 0 if + * some blocks do not count error number + */ + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + } } } } |