diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 120 |
1 files changed, 77 insertions, 43 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2de9309a4193..ccebd8e2a2d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -35,6 +35,8 @@ #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include "atom.h" +#include "amdgpu_reset.h" + #ifdef CONFIG_X86_MCE_AMD #include <asm/mce.h> @@ -197,6 +199,13 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; + /* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */ + if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && + obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { + if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) + dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); + } + s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", "ue", info.ue_count, "ce", info.ce_count); @@ -550,9 +559,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; - if (obj->adev->asic_type == CHIP_ALDEBARAN) { + if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && + obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) - DRM_WARN("Failed to reset error counter and error status"); + dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); } return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, @@ -707,27 +717,30 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!con) return -EINVAL; - info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL); - if (!info) - return -ENOMEM; + if (head->block == AMDGPU_RAS_BLOCK__GFX) { + info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL); + if (!info) + return -ENOMEM; - if (!enable) { - info->disable_features = (struct ta_ras_disable_features_input) { - .block_id = amdgpu_ras_block_to_ta(head->block), - .error_type = amdgpu_ras_error_to_ta(head->type), - }; - } else { - info->enable_features = (struct ta_ras_enable_features_input) { - .block_id = amdgpu_ras_block_to_ta(head->block), - .error_type = amdgpu_ras_error_to_ta(head->type), - }; + if (!enable) { + info->disable_features = (struct ta_ras_disable_features_input) { + .block_id = amdgpu_ras_block_to_ta(head->block), + .error_type = amdgpu_ras_error_to_ta(head->type), + }; + } else { + info->enable_features = (struct ta_ras_enable_features_input) { + .block_id = amdgpu_ras_block_to_ta(head->block), + .error_type = amdgpu_ras_error_to_ta(head->type), + }; + } } /* Do not enable if it is not allowed. */ WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); /* Only enable ras feature operation handle on host side */ - if (!amdgpu_sriov_vf(adev) && + if (head->block == AMDGPU_RAS_BLOCK__GFX && + !amdgpu_sriov_vf(adev) && !amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(&adev->psp, info, enable); if (ret) { @@ -743,7 +756,8 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, __amdgpu_ras_feature_enable(adev, head, enable); ret = 0; out: - kfree(info); + if (head->block == AMDGPU_RAS_BLOCK__GFX) + kfree(info); return ret; } @@ -1027,9 +1041,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, } } - if (!amdgpu_persistent_edc_harvesting_supported(adev)) - amdgpu_ras_reset_error_status(adev, info->head.block); - return 0; } @@ -1149,6 +1160,12 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, if (res) return res; + if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && + adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { + if (amdgpu_ras_reset_error_status(adev, info.head.block)) + dev_warn(adev->dev, "Failed to reset error counter and error status"); + } + ce += info.ce_count; ue += info.ue_count; } @@ -1792,6 +1809,13 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) continue; amdgpu_ras_query_error_status(adev, &info); + + if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && + adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) && + adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) { + if (amdgpu_ras_reset_error_status(adev, info.head.block)) + dev_warn(adev->dev, "Failed to reset error counter and error status"); + } } } @@ -1919,8 +1943,17 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_put_xgmi_hive(hive); } - if (amdgpu_device_should_recover_gpu(ras->adev)) - amdgpu_device_gpu_recover(ras->adev, NULL); + if (amdgpu_device_should_recover_gpu(ras->adev)) { + struct amdgpu_reset_context reset_context; + memset(&reset_context, 0, sizeof(reset_context)); + + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags); + + amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); + } atomic_set(&ras->in_recovery, 0); } @@ -2131,7 +2164,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) bool exc_err_limit = false; int ret; - if (!con) + if (!con || amdgpu_sriov_vf(adev)) return 0; /* Allow access to RAS EEPROM via debugfs, when the ASIC @@ -2278,8 +2311,9 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) !amdgpu_ras_asic_supported(adev)) return; - if (!(amdgpu_sriov_vf(adev) && - (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))) + /* If driver run on sriov guest side, only enable ras for aldebaran */ + if (amdgpu_sriov_vf(adev) && + adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 2)) return; if (!adev->gmc.xgmi.connected_to_cpu) { @@ -2686,7 +2720,8 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev) /* Need disable ras on all IPs here before ip [hw/sw]fini */ - amdgpu_ras_disable_all_features(adev, 0); + if (con->features) + amdgpu_ras_disable_all_features(adev, 0); amdgpu_ras_recovery_fini(adev); return 0; } @@ -2799,11 +2834,8 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb, struct mce *m = (struct mce *)data; struct amdgpu_device *adev = NULL; uint32_t gpu_id = 0; - uint32_t umc_inst = 0; - uint32_t ch_inst, channel_index = 0; + uint32_t umc_inst = 0, ch_inst = 0; struct ras_err_data err_data = {0, 0, 0, NULL}; - struct eeprom_table_record err_rec; - uint64_t retired_page; /* * If the error was generated in UMC_V2, which belongs to GPU UMCs, @@ -2842,21 +2874,22 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb, dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", umc_inst, ch_inst); + err_data.err_addr = + kcalloc(adev->umc.max_ras_err_cnt_per_query, + sizeof(struct eeprom_table_record), GFP_KERNEL); + if(!err_data.err_addr) { + dev_warn(adev->dev, "Failed to alloc memory for " + "umc error address record in mca notifier!\n"); + return NOTIFY_DONE; + } + /* * Translate UMC channel address to Physical address */ - channel_index = - adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num - + ch_inst]; - - retired_page = ADDR_OF_8KB_BLOCK(m->addr) | - ADDR_OF_256B_BLOCK(channel_index) | - OFFSET_IN_256B_BLOCK(m->addr); - - memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); - err_data.err_addr = &err_rec; - amdgpu_umc_fill_error_record(&err_data, m->addr, - retired_page, channel_index, umc_inst); + if (adev->umc.ras && + adev->umc.ras->convert_ras_error_address) + adev->umc.ras->convert_ras_error_address(adev, + &err_data, 0, ch_inst, umc_inst, m->addr); if (amdgpu_bad_page_threshold != 0) { amdgpu_ras_add_bad_pages(adev, err_data.err_addr, @@ -2864,6 +2897,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb, amdgpu_ras_save_bad_pages(adev); } + kfree(err_data.err_addr); return NOTIFY_OK; } @@ -2928,7 +2962,7 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) - schedule_work(&ras->recovery_work); + amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); return 0; } |