From 367806068283725d91ffcf49149db1cda5c1fd23 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Mon, 9 May 2022 14:49:16 +0800 Subject: drm/amdgpu: enable RAS IH for poison consumption Enable RAS IH if poison consumption handler is implemented. Signed-off-by: Tao Zhou Reviewed-by: Mohammad Zafar Ziya Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 7e126dff004f..defc6a53c7dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2516,7 +2516,9 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev, return 0; ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); - if (ras_obj->ras_cb) { + if (ras_obj->ras_cb || (ras_obj->hw_ops && + (ras_obj->hw_ops->query_poison_status || + ras_obj->hw_ops->handle_poison_consumption))) { r = amdgpu_ras_interrupt_add_handler(adev, ras_block); if (r) goto cleanup; -- cgit From b63ac5d3033976301f296d048c54d584dfb3ac30 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Mon, 9 May 2022 17:52:15 +0800 Subject: drm/amdgpu: refine RAS poison consumption handler Query RAS status before RAS poison consumption handling; add more comments and logging. 
Signed-off-by: Tao Zhou Reviewed-and-tested-by: Mohammad Zafar Ziya Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 ++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index defc6a53c7dc..035891ec59d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1538,33 +1538,42 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj, struct amdgpu_iv_entry *entry) { - bool poison_stat = true, need_reset = true; + bool poison_stat = false; struct amdgpu_device *adev = obj->adev; struct ras_err_data err_data = {0, 0, 0, NULL}; struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, obj->head.block, 0); - if (!adev->gmc.xgmi.connected_to_cpu) - amdgpu_umc_poison_handler(adev, &err_data, false); - - /* both query_poison_status and handle_poison_consumption are optional */ - if (block_obj && block_obj->hw_ops) { - if (block_obj->hw_ops->query_poison_status) { - poison_stat = block_obj->hw_ops->query_poison_status(adev); - if (!poison_stat) - dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", - block_obj->ras_comm.name); - } + if (!block_obj || !block_obj->hw_ops) + return; - if (poison_stat && block_obj->hw_ops->handle_poison_consumption) { - poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); - need_reset = poison_stat; + /* both query_poison_status and handle_poison_consumption are optional, + * but at least one of them should be implemented if we need poison + * consumption handler + */ + if (block_obj->hw_ops->query_poison_status) { + poison_stat = block_obj->hw_ops->query_poison_status(adev); + if (!poison_stat) { + /* Not poison consumption interrupt, no need to handle 
it */ + dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", + block_obj->ras_comm.name); + + return; } } - /* gpu reset is fallback for all failed cases */ - if (need_reset) + if (!adev->gmc.xgmi.connected_to_cpu) + amdgpu_umc_poison_handler(adev, &err_data, false); + + if (block_obj->hw_ops->handle_poison_consumption) + poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); + + /* gpu reset is fallback for failed and default cases */ + if (poison_stat) { + dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", + block_obj->ras_comm.name); amdgpu_ras_reset_gpu(adev); + } } static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, -- cgit