From 761d86d37f86ebba77e59fa59ccef4dc2f38674f Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Thu, 4 Feb 2021 13:32:05 +0800 Subject: drm/amdgpu: harvest edc status when connected to host via xGMI When connected to a host via xGMI, system fatal errors may trigger warm reset, driver has no change to query edc status before reset. Therefore in this case, driver should harvest previous error loging registers during boot, instead of only resetting them. v2: 1. IP's ras_manager object is created when its ras feature is enabled, so change to query edc status after amdgpu_ras_late_init called 2. change to enable watchdog timer after finishing gfx edc init Signed-off-by: Dennis Li Reivewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 8e0a6c62322e..689addb1520d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -601,6 +601,7 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev) struct ras_ih_if ih_info = { .cb = amdgpu_gfx_process_ras_data_cb, }; + struct ras_query_if info = { 0 }; if (!adev->gfx.ras_if) { adev->gfx.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); @@ -612,13 +613,19 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev) strcpy(adev->gfx.ras_if->name, "gfx"); } fs_info.head = ih_info.head = *adev->gfx.ras_if; - r = amdgpu_ras_late_init(adev, adev->gfx.ras_if, &fs_info, &ih_info); if (r) goto free; if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) { + if (adev->gmc.xgmi.connected_to_cpu) { + info.head = *adev->gfx.ras_if; + amdgpu_ras_query_error_status(adev, &info); + } else { + amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX); + } + r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0); if (r) goto late_fini; -- cgit