diff options
| author | Russell King (Oracle) <[email protected]> | 2024-06-10 12:03:21 +0100 |
|---|---|---|
| committer | Russell King (Oracle) <[email protected]> | 2024-06-10 12:03:21 +0100 |
| commit | 594ce0b8a998aa4d05827cd7c0d0dcec9a1e3ae2 (patch) | |
| tree | 070bd60a8fda15e5f47339d3f6888a0fe2ca6fe9 /drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | |
| parent | 616501eccb58615f8f352a29239ea6c6fc5e6546 (diff) | |
| parent | e3cf20e5c68df604315ab30bdbe15dc8a5da556b (diff) | |
Merge topic branches 'clkdev' and 'fixes' into for-linus
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 53 |
1 files changed, 36 insertions, 17 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 59fafb8392e0..0734490347db 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -27,6 +27,16 @@ #include "umc/umc_6_7_0_offset.h" #include "umc/umc_6_7_0_sh_mask.h" +static bool amdgpu_mca_is_deferred_error(struct amdgpu_device *adev, + uint64_t mc_status) +{ + if (adev->umc.ras->check_ecc_err_status) + return adev->umc.ras->check_ecc_err_status(adev, + AMDGPU_MCA_ERROR_TYPE_DE, &mc_status); + + return false; +} + void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr, unsigned long *error_count) @@ -200,22 +210,26 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) return -EOPNOTSUPP; } -static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry) +static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry, + struct ras_query_context *qctx) { - dev_info(adev->dev, "[Hardware error] Accelerator Check Architecture events logged\n"); - dev_info(adev->dev, "[Hardware error] aca entry[%02d].STATUS=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_STATUS]); - dev_info(adev->dev, "[Hardware error] aca entry[%02d].ADDR=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_ADDR]); - dev_info(adev->dev, "[Hardware error] aca entry[%02d].MISC0=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_MISC0]); - dev_info(adev->dev, "[Hardware error] aca entry[%02d].IPID=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_IPID]); - dev_info(adev->dev, "[Hardware error] aca entry[%02d].SYND=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_SYND]); + u64 event_id = qctx->event_id; + + RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n"); + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n", + idx, entry->regs[MCA_REG_IDX_STATUS]); + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n", + idx, entry->regs[MCA_REG_IDX_ADDR]); + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n", + idx, entry->regs[MCA_REG_IDX_MISC0]); + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n", + idx, entry->regs[MCA_REG_IDX_IPID]); + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n", + idx, entry->regs[MCA_REG_IDX_SYND]); } -int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data) +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, + struct ras_err_data *err_data, struct ras_query_context *qctx) { struct amdgpu_smuio_mcm_config_info mcm_info; struct ras_err_addr err_addr = {0}; @@ -234,7 +248,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo list_for_each_entry(node, &mca_set.list, node) { entry = &node->entry; - amdgpu_mca_smu_mca_bank_dump(adev, i++, entry); + amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx); count = 0; ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count); @@ -256,9 +270,14 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo if (type == AMDGPU_MCA_ERROR_TYPE_UE) amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, &err_addr, (uint64_t)count); - else - amdgpu_ras_error_statistic_ce_count(err_data, - &mcm_info, &err_addr, (uint64_t)count); + else { + if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS])) + amdgpu_ras_error_statistic_de_count(err_data, + &mcm_info, &err_addr, (uint64_t)count); + else + amdgpu_ras_error_statistic_ce_count(err_data, + &mcm_info, &err_addr, (uint64_t)count); + } } out_mca_release: |