From 75ac6a250632d2fff62039ae728c842033dceddb Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 25 Jun 2024 14:23:42 +0800
Subject: drm/amdgpu: refine amdgpu ras event id core code

v1:
- use unified event id to manage ras events
- add a new function amdgpu_ras_query_error_status_with_event() to accept
  event type as parameter.

v2:
add a warn log to show the location of function failure when calling
amdgpu_ras_mark_event(). (Tao Zhou)

v3:
change RAS_EVENT_TYPE_ISR to RAS_EVENT_TYPE_FATAL.

v4:
rename amdgpu_ras_get_recovery_event() to amdgpu_ras_get_fatal_error_event().

Signed-off-by: Yang Wang
Reviewed-by: Tao Zhou
Reviewed-by: Hawking Zhang
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 18d994c98a25..7c20def1c4f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -68,9 +68,15 @@ struct amdgpu_iv_entry;
 /* The high three bits indicates socketid */
 #define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
 
+#define RAS_EVENT_INVALID_ID (BIT_ULL(63))
+#define RAS_EVENT_ID_IS_VALID(x) (!((x) & BIT_ULL(63)))
+
 #define RAS_EVENT_LOG(adev, id, fmt, ...) \
 	amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__)
 
+#define amdgpu_ras_mark_ras_event(adev, type) \
+	(amdgpu_ras_mark_ras_event_caller((adev), (type), __builtin_return_address(0)))
+
 enum amdgpu_ras_block {
 	AMDGPU_RAS_BLOCK__UMC = 0,
 	AMDGPU_RAS_BLOCK__SDMA,
@@ -427,20 +433,25 @@ struct umc_ecc_info {
 };
 
 enum ras_event_type {
-	RAS_EVENT_TYPE_INVALID = -1,
-	RAS_EVENT_TYPE_ISR = 0,
+	RAS_EVENT_TYPE_INVALID = 0,
+	RAS_EVENT_TYPE_FATAL,
 	RAS_EVENT_TYPE_COUNT,
 };
 
 struct ras_event_manager {
-	atomic64_t seqnos[RAS_EVENT_TYPE_COUNT];
+	atomic64_t seqno;
+	u64 last_seqno[RAS_EVENT_TYPE_COUNT];
 };
 
-struct ras_query_context {
+struct ras_event_id {
 	enum ras_event_type type;
 	u64 event_id;
 };
 
+struct ras_query_context {
+	struct ras_event_id evid;
+};
+
 typedef int (*pasid_notify)(struct amdgpu_device *adev,
 		uint16_t pasid, void *data);
 
@@ -947,8 +958,9 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
 void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
 bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
 
-bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
 u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
+int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
+				     const void *caller);
 
 int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);
 
-- 
cgit