From 1f3ef0efbacb2aa63e8e3933664192ee27d0d95b Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Fri, 10 Apr 2020 11:41:54 +0800 Subject: drm/amdgpu: add uncorrectable error count print in UMC ecc irq cb Uncorrectable error count printing is missed when issuing UMC UE injection. When going to the error count log function in GPU recover work thread, there is no chance to get correct error count value by last error injection and print, because the error status register is automatically cleared after reading in UMC ecc irq callback. So add such message printing in UMC ecc irq cb to be consistent with other RAS error interrupt cases. Signed-off-by: Guchun Chen Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 9dd51f0d2c11..8ffa015bc1dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -120,6 +120,9 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, /* only uncorrectable error needs gpu reset */ if (err_data->ue_count) { + dev_info(adev->dev, "%ld uncorrectable errors detected in UMC block\n", + err_data->ue_count); + if (err_data->err_addr_cnt && amdgpu_ras_add_bad_pages(adev, err_data->err_addr, err_data->err_addr_cnt)) -- cgit From 6952e99cfd52d32098540fe8d9e592828b9e774c Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Fri, 10 Apr 2020 15:51:14 +0800 Subject: drm/amdgpu: refine ras related message print Prefix ras related kernel message logging with PCI device info by replacing DRM_INFO/WARN/ERROR with dev_info/warn/err. This can clearly tell user about GPU device information where ras is. And add some other ras message printing to make it more clear and friendly as well. Suggested-by: Hawking Zhang Signed-off-by: Guchun Chen Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 51 ++++++++++++++++++++------------- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 10 ++++--- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 16 +++++++---- 3 files changed, 48 insertions(+), 29 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b0aa4e1ed4df..73ae913aee26 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -296,7 +296,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * int ret = 0; if (!amdgpu_ras_get_error_query_ready(adev)) { - DRM_WARN("RAS WARN: error injection currently inaccessible\n"); + dev_warn(adev->dev, "RAS WARN: error injection " + "currently inaccessible\n"); return size; } @@ -324,7 +325,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * /* umc ce/ue error injection for a bad page is not allowed */ if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) && amdgpu_ras_check_bad_page(adev, data.inject.address)) { - DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n", + dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked " + "as bad before error injection!\n", data.inject.address); break; } @@ -590,7 +592,8 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(&adev->psp, &info, enable); if (ret) { - DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n", + dev_err(adev->dev, "RAS ERROR: %s %s feature " + "failed ret %d\n", enable ? "enable":"disable", ras_block_str(head->block), ret); @@ -632,7 +635,8 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, if (ret == -EINVAL) { ret = __amdgpu_ras_feature_enable(adev, head, 1); if (!ret) - DRM_INFO("RAS INFO: %s setup object\n", + dev_info(adev->dev, + "RAS INFO: %s setup object\n", ras_block_str(head->block)); } } else { @@ -758,12 +762,17 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, info->ce_count = obj->err_data.ce_count; if (err_data.ce_count) { - dev_info(adev->dev, "%ld correctable errors detected in %s block\n", - obj->err_data.ce_count, ras_block_str(info->head.block)); + dev_info(adev->dev, "%ld correctable hardware errors " + "detected in %s block, no user " + "action is needed.\n", + obj->err_data.ce_count, + ras_block_str(info->head.block)); } if (err_data.ue_count) { - dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n", - obj->err_data.ue_count, ras_block_str(info->head.block)); + dev_info(adev->dev, "%ld uncorrectable hardware errors " + "detected in %s block\n", + obj->err_data.ue_count, + ras_block_str(info->head.block)); } return 0; @@ -807,13 +816,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, ret = psp_ras_trigger_error(&adev->psp, &block_info); break; default: - DRM_INFO("%s error injection is not supported yet\n", + dev_info(adev->dev, "%s error injection is not supported yet\n", ras_block_str(info->head.block)); ret = -EINVAL; } if (ret) - DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n", + dev_err(adev->dev, "RAS ERROR: inject %s error failed ret %d\n", ras_block_str(info->head.block), ret); @@ -1549,7 +1558,7 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) &data->bps[control->num_recs], true, save_count)) { - DRM_ERROR("Failed to save EEPROM table data!"); + dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; } @@ -1577,7 +1586,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) if (amdgpu_ras_eeprom_process_recods(control, bps, false, control->num_recs)) { - DRM_ERROR("Failed to load EEPROM table records!"); + dev_err(adev->dev, "Failed to load EEPROM table records!"); ret = -EIO; goto out; } @@ -1651,7 +1660,8 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) AMDGPU_GPU_PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, &bo, NULL)) - DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp); + dev_warn(adev->dev, "RAS WARN: reserve vram for " + "retired page %llx fail\n", bp); data->bps_bo[i] = bo; data->last_reserved = i + 1; @@ -1739,7 +1749,7 @@ free: kfree(*data); con->eh_data = NULL; out: - DRM_WARN("Failed to initialize ras recovery!\n"); + dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); return ret; } @@ -1801,18 +1811,18 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, return; if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { - DRM_INFO("HBM ECC is active.\n"); + dev_info(adev->dev, "HBM ECC is active.\n"); *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); } else - DRM_INFO("HBM ECC is not presented.\n"); + dev_info(adev->dev, "HBM ECC is not presented.\n"); if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { - DRM_INFO("SRAM ECC is active.\n"); + dev_info(adev->dev, "SRAM ECC is active.\n"); *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); } else - DRM_INFO("SRAM ECC is not presented.\n"); + dev_info(adev->dev, "SRAM ECC is not presented.\n"); /* hw_supported needs to be aligned with RAS block mask. */ *hw_supported &= AMDGPU_RAS_BLOCK_MASK; @@ -1869,7 +1879,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) if (amdgpu_ras_fs_init(adev)) goto fs_out; - DRM_INFO("RAS INFO: ras initialized successfully, " + dev_info(adev->dev, "RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", con->hw_supported, con->supported); return 0; @@ -2055,7 +2065,8 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) return; if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { - DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n"); + dev_info(adev->dev, "uncorrectable hardware error" + "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); amdgpu_ras_reset_gpu(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 8ffa015bc1dc..af1b1ccf613c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -110,7 +110,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, * even NOMEM error is encountered */ if(!err_data->err_addr) - DRM_WARN("Failed to alloc memory for umc error address record!\n"); + dev_warn(adev->dev, "Failed to alloc memory for " + "umc error address record!\n"); /* umc query_ras_error_address is also responsible for clearing * error status @@ -120,13 +121,14 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, /* only uncorrectable error needs gpu reset */ if (err_data->ue_count) { - dev_info(adev->dev, "%ld uncorrectable errors detected in UMC block\n", - err_data->ue_count); + dev_info(adev->dev, "%ld uncorrectable hardware errors " + "detected in UMC block\n", + err_data->ue_count); if (err_data->err_addr_cnt && amdgpu_ras_add_bad_pages(adev, err_data->err_addr, err_data->err_addr_cnt)) - DRM_WARN("Failed to add ras bad page!\n"); + dev_warn(adev->dev, "Failed to add ras bad page!\n"); amdgpu_ras_reset_gpu(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 72bdd06ed508..e629156173d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -323,14 +323,20 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device obj->err_data.ce_count += err_data.ce_count; if (err_data.ce_count) - DRM_INFO("%ld correctable errors detected in %s block\n", - obj->err_data.ce_count, adev->nbio.ras_if->name); + dev_info(adev->dev, "%ld correctable hardware " + "errors detected in %s block, " + "no user action is needed.\n", + obj->err_data.ce_count, + adev->nbio.ras_if->name); if (err_data.ue_count) - DRM_INFO("%ld uncorrectable errors detected in %s block\n", - obj->err_data.ue_count, adev->nbio.ras_if->name); + dev_info(adev->dev, "%ld uncorrectable hardware " + "errors detected in %s block\n", + obj->err_data.ue_count, + adev->nbio.ras_if->name); - DRM_WARN("RAS controller interrupt triggered by NBIF error\n"); + dev_info(adev->dev, "RAS controller interrupt triggered " + "by NBIF error\n"); /* ras_controller_int is dedicated for nbif ras error, * not the global interrupt for sync flood -- cgit