diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 53 |
1 files changed, 36 insertions, 17 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 59fafb8392e0..0734490347db 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -27,6 +27,16 @@ #include "umc/umc_6_7_0_offset.h" #include "umc/umc_6_7_0_sh_mask.h" +static bool amdgpu_mca_is_deferred_error(struct amdgpu_device *adev, + uint64_t mc_status) +{ + if (adev->umc.ras->check_ecc_err_status) + return adev->umc.ras->check_ecc_err_status(adev, + AMDGPU_MCA_ERROR_TYPE_DE, &mc_status); + + return false; +} + void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr, unsigned long *error_count) @@ -200,22 +210,26 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) return -EOPNOTSUPP; } -static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry) +static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry, + struct ras_query_context *qctx) { - dev_info(adev->dev, "[Hardware error] Accelerator Check Architecture events logged\n"); - dev_info(adev->dev, "[Hardware error] aca entry[%02d].STATUS=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_STATUS]); - dev_info(adev->dev, "[Hardware error] aca entry[%02d].ADDR=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_ADDR]); - dev_info(adev->dev, "[Hardware error] aca entry[%02d].MISC0=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_MISC0]); - dev_info(adev->dev, "[Hardware error] aca entry[%02d].IPID=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_IPID]); - dev_info(adev->dev, "[Hardware error] aca entry[%02d].SYND=0x%016llx\n", - idx, entry->regs[MCA_REG_IDX_SYND]); + u64 event_id = qctx->event_id; + + RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n"); + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n", + idx, entry->regs[MCA_REG_IDX_STATUS]); + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n", + idx, entry->regs[MCA_REG_IDX_ADDR]); + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n", + idx, entry->regs[MCA_REG_IDX_MISC0]); + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n", + idx, entry->regs[MCA_REG_IDX_IPID]); + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n", + idx, entry->regs[MCA_REG_IDX_SYND]); } -int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data) +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, + struct ras_err_data *err_data, struct ras_query_context *qctx) { struct amdgpu_smuio_mcm_config_info mcm_info; struct ras_err_addr err_addr = {0}; @@ -234,7 +248,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo list_for_each_entry(node, &mca_set.list, node) { entry = &node->entry; - amdgpu_mca_smu_mca_bank_dump(adev, i++, entry); + amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx); count = 0; ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count); @@ -256,9 +270,14 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo if (type == AMDGPU_MCA_ERROR_TYPE_UE) amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, &err_addr, (uint64_t)count); - else - amdgpu_ras_error_statistic_ce_count(err_data, - &mcm_info, &err_addr, (uint64_t)count); + else { + if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS])) + amdgpu_ras_error_statistic_de_count(err_data, + &mcm_info, &err_addr, (uint64_t)count); + else + amdgpu_ras_error_statistic_ce_count(err_data, + &mcm_info, &err_addr, (uint64_t)count); + } } out_mca_release: |