From 05adfd80cc52e0b4581e65bb5418de5dfd24d105 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Fri, 21 May 2021 11:53:09 -0400 Subject: drm/amdgpu: Use delayed work to collect RAS error counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Context Query2 IOCTL return the correctable and uncorrectable errors in O(1) fashion, from cached values, and schedule a delayed work function to calculate and cache them for the next such IOCTL. v2: Cancel pending delayed work at ras_fini(). v3: Remove conditionals when dealing with delayed work manipulation as they're inherently racy. Cc: Alexander Deucher Cc: Christian König Cc: John Clements Cc: Hawking Zhang Signed-off-by: Luben Tuikov Reviewed-by: Alexander Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 40 +++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ed3c43e8b0b5..ec936cde2726 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "amdgpu.h" #include "amdgpu_ras.h" @@ -2116,6 +2117,30 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) adev->ras_hw_enabled & amdgpu_ras_mask; } +static void amdgpu_ras_counte_dw(struct work_struct *work) +{ + struct amdgpu_ras *con = container_of(work, struct amdgpu_ras, + ras_counte_delay_work.work); + struct amdgpu_device *adev = con->adev; + struct drm_device *dev = &adev->ddev; + unsigned long ce_count, ue_count; + int res; + + res = pm_runtime_get_sync(dev->dev); + if (res < 0) + goto Out; + + /* Cache new values. + */ + amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); + atomic_set(&con->ras_ce_count, ce_count); + atomic_set(&con->ras_ue_count, ue_count); + + pm_runtime_mark_last_busy(dev->dev); +Out: + pm_runtime_put_autosuspend(dev->dev); +} + int amdgpu_ras_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -2130,6 +2155,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev) if (!con) return -ENOMEM; + con->adev = adev; + INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw); + atomic_set(&con->ras_ce_count, 0); + atomic_set(&con->ras_ue_count, 0); + con->objs = (struct ras_manager *)(con + 1); amdgpu_ras_set_context(adev, con); @@ -2233,6 +2263,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, struct ras_fs_if *fs_info, struct ras_ih_if *ih_info) { + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + unsigned long ue_count, ce_count; int r; /* disable RAS feature per IP block if it is not supported */ @@ -2273,6 +2305,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, if (r) goto sysfs; + /* Those are the cached values at init. + */ + amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); + atomic_set(&con->ras_ce_count, ce_count); + atomic_set(&con->ras_ue_count, ue_count); + return 0; cleanup: amdgpu_ras_sysfs_remove(adev, ras_block); @@ -2390,6 +2428,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) if (con->features) amdgpu_ras_disable_all_features(adev, 1); + cancel_delayed_work_sync(&con->ras_counte_delay_work); + amdgpu_ras_set_context(adev, NULL); kfree(con); -- cgit