diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 119 |
1 files changed, 119 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 22f401fd1901..57e86af0c906 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3103,3 +3103,122 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, return 0; } + +void amdgpu_ras_get_error_type_name(uint32_t err_type, char *err_type_name) +{ + if (!err_type_name) + return; + + switch (err_type) { + case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: + sprintf(err_type_name, "correctable"); + break; + case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: + sprintf(err_type_name, "uncorrectable"); + break; + default: + sprintf(err_type_name, "unknown"); + break; + } +} + +bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + uint32_t *memory_id) +{ + uint32_t err_status_lo_data, err_status_lo_offset; + + if (!reg_entry) + return false; + + err_status_lo_offset = + AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, + reg_entry->seg_lo, reg_entry->reg_lo); + err_status_lo_data = RREG32(err_status_lo_offset); + + if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) && + !REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, ERR_STATUS_VALID_FLAG)) + return false; + + *memory_id = REG_GET_FIELD(err_status_lo_data, ERR_STATUS_LO, MEMORY_ID); + + return true; +} + +bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_entry, + uint32_t instance, + unsigned long *err_cnt) +{ + uint32_t err_status_hi_data, err_status_hi_offset; + + if (!reg_entry) + return false; + + err_status_hi_offset = + AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, + reg_entry->seg_hi, reg_entry->reg_hi); + err_status_hi_data = RREG32(err_status_hi_offset); + + if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && + !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG)) + return false; + + /* read err count */ + *err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT); + + return true; +} + +void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, + const struct amdgpu_ras_err_status_reg_entry *reg_list, + uint32_t reg_list_size, + const struct amdgpu_ras_memory_id_entry *mem_list, + uint32_t mem_list_size, + uint32_t instance, + uint32_t err_type, + unsigned long *err_count) +{ + uint32_t memory_id; + unsigned long err_cnt; + char err_type_name[16]; + uint32_t i, j; + + for (i = 0; i < reg_list_size; i++) { + /* query err_cnt from err_status_hi */ + if (!amdgpu_ras_inst_get_err_cnt_field(adev, ®_list[i], + instance, &err_cnt) || + !err_cnt) + continue; + + /* query memory_id from err_status_lo */ + if (!amdgpu_ras_inst_get_memory_id_field(adev, ®_list[i], + instance, &memory_id)) + continue; + + *err_count += err_cnt; + + /* log the errors */ + amdgpu_ras_get_error_type_name(err_type, err_type_name); + if (!mem_list) { + /* memory_list is not supported */ + dev_info(adev->dev, + "%ld %s hardware errors detected in %s, instance: %d, memory_id: %d\n", + err_cnt, err_type_name, + reg_list[i].block_name, + instance, memory_id); + } else { + for (j = 0; j < mem_list_size; j++) { + if (memory_id == mem_list[j].memory_id) { + dev_info(adev->dev, + "%ld %s hardware errors detected in %s, instance: %d, memory block: %s\n", + err_cnt, err_type_name, + reg_list[i].block_name, + instance, mem_list[j].name); + break; + } + } + } + } +} |