diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 682 |
1 files changed, 415 insertions, 267 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8f47c14ecbc7..7e126dff004f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -66,6 +66,8 @@ const char *ras_block_string[] = { "mp1", "fuse", "mca", + "vcn", + "jpeg", }; const char *ras_mca_block_string[] = { @@ -75,6 +77,13 @@ const char *ras_mca_block_string[] = { "mca_iohc", }; +struct amdgpu_ras_block_list { + /* ras block link */ + struct list_head node; + + struct amdgpu_ras_block_object *ras_obj; +}; + const char *get_ras_block_str(struct ras_common_if *ras_block) { if (!ras_block) @@ -89,6 +98,9 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) return ras_block_string[ras_block->block]; } +#define ras_block_str(_BLOCK_) \ + (((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range") + #define ras_err_str(i) (ras_error_string[ffs(i)]) #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) @@ -155,14 +167,9 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre } memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); - - err_rec.address = address; - err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT; - err_rec.ts = (uint64_t)ktime_get_real_seconds(); - err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; - err_data.err_addr = &err_rec; - err_data.err_addr_cnt = 1; + amdgpu_umc_fill_error_record(&err_data, address, + (address >> AMDGPU_GPU_PAGE_SHIFT), 0, 0); if (amdgpu_bad_page_threshold != 0) { amdgpu_ras_add_bad_pages(adev, err_data.err_addr, @@ -452,7 +459,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, } if (ret) - return -EINVAL; + return ret; return size; } @@ -866,30 +873,47 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, } /* feature ctl end */ +static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj, + enum amdgpu_ras_block block) +{ + if (!block_obj) + return -EINVAL; + + if (block_obj->ras_comm.block == block) + return 0; + + return -EINVAL; +} -static void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev, - struct ras_common_if *ras_block, - struct ras_err_data *err_data) +static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev, + enum amdgpu_ras_block block, uint32_t sub_block_index) { - switch (ras_block->sub_block_index) { - case AMDGPU_RAS_MCA_BLOCK__MP0: - if (adev->mca.mp0.ras_funcs && - adev->mca.mp0.ras_funcs->query_ras_error_count) - adev->mca.mp0.ras_funcs->query_ras_error_count(adev, &err_data); - break; - case AMDGPU_RAS_MCA_BLOCK__MP1: - if (adev->mca.mp1.ras_funcs && - adev->mca.mp1.ras_funcs->query_ras_error_count) - adev->mca.mp1.ras_funcs->query_ras_error_count(adev, &err_data); - break; - case AMDGPU_RAS_MCA_BLOCK__MPIO: - if (adev->mca.mpio.ras_funcs && - adev->mca.mpio.ras_funcs->query_ras_error_count) - adev->mca.mpio.ras_funcs->query_ras_error_count(adev, &err_data); - break; - default: - break; + struct amdgpu_ras_block_list *node, *tmp; + struct amdgpu_ras_block_object *obj; + + if (block >= AMDGPU_RAS_BLOCK__LAST) + return NULL; + + if (!amdgpu_ras_is_supported(adev, block)) + return NULL; + + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { + if (!node->ras_obj) { + dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); + continue; + } + + obj = node->ras_obj; + if (obj->ras_block_match) { + if (obj->ras_block_match(obj, block, sub_block_index) == 0) + return obj; + } else { + if (amdgpu_ras_block_match_default(obj, block) == 0) + return obj; + } } + + return NULL; } static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data) @@ -901,26 +925,26 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d * choosing right query method according to * whether smu support query error information */ - ret = smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc)); + ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc)); if (ret == -EOPNOTSUPP) { - if (adev->umc.ras_funcs && - adev->umc.ras_funcs->query_ras_error_count) - adev->umc.ras_funcs->query_ras_error_count(adev, err_data); + if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && + adev->umc.ras->ras_block.hw_ops->query_ras_error_count) + adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); /* umc query_ras_error_address is also responsible for clearing * error status */ - if (adev->umc.ras_funcs && - adev->umc.ras_funcs->query_ras_error_address) - adev->umc.ras_funcs->query_ras_error_address(adev, err_data); + if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && + adev->umc.ras->ras_block.hw_ops->query_ras_error_address) + adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data); } else if (!ret) { - if (adev->umc.ras_funcs && - adev->umc.ras_funcs->ecc_info_query_ras_error_count) - adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, err_data); + if (adev->umc.ras && + adev->umc.ras->ecc_info_query_ras_error_count) + adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data); - if (adev->umc.ras_funcs && - adev->umc.ras_funcs->ecc_info_query_ras_error_address) - adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, err_data); + if (adev->umc.ras && + adev->umc.ras->ecc_info_query_ras_error_address) + adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data); } } @@ -928,62 +952,32 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) { + struct amdgpu_ras_block_object *block_obj = NULL; struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); struct ras_err_data err_data = {0, 0, 0, NULL}; - int i; if (!obj) return -EINVAL; - switch (info->head.block) { - case AMDGPU_RAS_BLOCK__UMC: + if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { amdgpu_ras_get_ecc_info(adev, &err_data); - break; - case AMDGPU_RAS_BLOCK__SDMA: - if (adev->sdma.funcs->query_ras_error_count) { - for (i = 0; i < adev->sdma.num_instances; i++) - adev->sdma.funcs->query_ras_error_count(adev, i, - &err_data); + } else { + block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); + if (!block_obj || !block_obj->hw_ops) { + dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", + get_ras_block_str(&info->head)); + return -EINVAL; } - break; - case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.ras_funcs && - adev->gfx.ras_funcs->query_ras_error_count) - adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data); - - if (adev->gfx.ras_funcs && - adev->gfx.ras_funcs->query_ras_error_status) - adev->gfx.ras_funcs->query_ras_error_status(adev); - break; - case AMDGPU_RAS_BLOCK__MMHUB: - if (adev->mmhub.ras_funcs && - adev->mmhub.ras_funcs->query_ras_error_count) - adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data); - - if (adev->mmhub.ras_funcs && - adev->mmhub.ras_funcs->query_ras_error_status) - adev->mmhub.ras_funcs->query_ras_error_status(adev); - break; - case AMDGPU_RAS_BLOCK__PCIE_BIF: - if (adev->nbio.ras_funcs && - adev->nbio.ras_funcs->query_ras_error_count) - adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data); - break; - case AMDGPU_RAS_BLOCK__XGMI_WAFL: - if (adev->gmc.xgmi.ras_funcs && - adev->gmc.xgmi.ras_funcs->query_ras_error_count) - adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data); - break; - case AMDGPU_RAS_BLOCK__HDP: - if (adev->hdp.ras_funcs && - adev->hdp.ras_funcs->query_ras_error_count) - adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data); - break; - case AMDGPU_RAS_BLOCK__MCA: - amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data); - break; - default: - break; + + if (block_obj->hw_ops->query_ras_error_count) + block_obj->hw_ops->query_ras_error_count(adev, &err_data); + + if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || + (info->head.block == AMDGPU_RAS_BLOCK__GFX) || + (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) { + if (block_obj->hw_ops->query_ras_error_status) + block_obj->hw_ops->query_ras_error_status(adev); + } } obj->err_data.ue_count += err_data.ue_count; @@ -1040,68 +1034,27 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block) { + struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + if (!amdgpu_ras_is_supported(adev, block)) return -EINVAL; - switch (block) { - case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.ras_funcs && - adev->gfx.ras_funcs->reset_ras_error_count) - adev->gfx.ras_funcs->reset_ras_error_count(adev); - - if (adev->gfx.ras_funcs && - adev->gfx.ras_funcs->reset_ras_error_status) - adev->gfx.ras_funcs->reset_ras_error_status(adev); - break; - case AMDGPU_RAS_BLOCK__MMHUB: - if (adev->mmhub.ras_funcs && - adev->mmhub.ras_funcs->reset_ras_error_count) - adev->mmhub.ras_funcs->reset_ras_error_count(adev); - - if (adev->mmhub.ras_funcs && - adev->mmhub.ras_funcs->reset_ras_error_status) - adev->mmhub.ras_funcs->reset_ras_error_status(adev); - break; - case AMDGPU_RAS_BLOCK__SDMA: - if (adev->sdma.funcs->reset_ras_error_count) - adev->sdma.funcs->reset_ras_error_count(adev); - break; - case AMDGPU_RAS_BLOCK__HDP: - if (adev->hdp.ras_funcs && - adev->hdp.ras_funcs->reset_ras_error_count) - adev->hdp.ras_funcs->reset_ras_error_count(adev); - break; - default: - break; + if (!block_obj || !block_obj->hw_ops) { + dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", + ras_block_str(block)); + return -EINVAL; } - return 0; -} - -/* Trigger XGMI/WAFL error */ -static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, - struct ta_ras_trigger_error_input *block_info) -{ - int ret; - - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) - dev_warn(adev->dev, "Failed to disallow df cstate"); - - if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) - dev_warn(adev->dev, "Failed to disallow XGMI power down"); - - ret = psp_ras_trigger_error(&adev->psp, block_info); + if (block_obj->hw_ops->reset_ras_error_count) + block_obj->hw_ops->reset_ras_error_count(adev); - if (amdgpu_ras_intr_triggered()) - return ret; - - if (amdgpu_dpm_allow_xgmi_power_down(adev, true)) - dev_warn(adev->dev, "Failed to allow XGMI power down"); - - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) - dev_warn(adev->dev, "Failed to allow df cstate"); + if ((block == AMDGPU_RAS_BLOCK__GFX) || + (block == AMDGPU_RAS_BLOCK__MMHUB)) { + if (block_obj->hw_ops->reset_ras_error_status) + block_obj->hw_ops->reset_ras_error_status(adev); + } - return ret; + return 0; } /* wrapper of psp_ras_trigger_error */ @@ -1116,11 +1069,20 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, .address = info->address, .value = info->value, }; - int ret = 0; + int ret = -EINVAL; + struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, + info->head.block, + info->head.sub_block_index); if (!obj) return -EINVAL; + if (!block_obj || !block_obj->hw_ops) { + dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", + get_ras_block_str(&info->head)); + return -EINVAL; + } + /* Calculate XGMI relative offset */ if (adev->gmc.xgmi.num_physical_nodes > 1) { block_info.address = @@ -1128,28 +1090,15 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, block_info.address); } - switch (info->head.block) { - case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.ras_funcs && - adev->gfx.ras_funcs->ras_error_inject) - ret = adev->gfx.ras_funcs->ras_error_inject(adev, info); - else - ret = -EINVAL; - break; - case AMDGPU_RAS_BLOCK__UMC: - case AMDGPU_RAS_BLOCK__SDMA: - case AMDGPU_RAS_BLOCK__MMHUB: - case AMDGPU_RAS_BLOCK__PCIE_BIF: - case AMDGPU_RAS_BLOCK__MCA: - ret = psp_ras_trigger_error(&adev->psp, &block_info); - break; - case AMDGPU_RAS_BLOCK__XGMI_WAFL: - ret = amdgpu_ras_error_inject_xgmi(adev, &block_info); - break; - default: - dev_info(adev->dev, "%s error injection is not supported yet\n", - get_ras_block_str(&info->head)); - ret = -EINVAL; + if (info->head.block == AMDGPU_RAS_BLOCK__GFX) { + if (block_obj->hw_ops->ras_error_inject) + ret = block_obj->hw_ops->ras_error_inject(adev, info); + } else { + /* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */ + if (block_obj->hw_ops->ras_error_inject) + ret = block_obj->hw_ops->ras_error_inject(adev, &block_info); + else /*If not defined .ras_error_inject, use default ras_error_inject*/ + ret = psp_ras_trigger_error(&adev->psp, &block_info); } if (ret) @@ -1329,18 +1278,17 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) } int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, - struct ras_fs_if *head) + struct ras_common_if *head) { - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); if (!obj || obj->attr_inuse) return -EINVAL; get_obj(obj); - memcpy(obj->fs_data.sysfs_name, - head->sysfs_name, - sizeof(obj->fs_data.sysfs_name)); + snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), + "%s_err_count", head->name); obj->sysfs_attr = (struct device_attribute){ .attr = { @@ -1567,12 +1515,97 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) /* ras fs end */ /* ih begin */ + +/* For the hardware that cannot enable bif ring for both ras_controller_irq + * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status + * register to check whether the interrupt is triggered or not, and properly + * ack the interrupt if it is there + */ +void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) + return; + + if (adev->nbio.ras && + adev->nbio.ras->handle_ras_controller_intr_no_bifring) + adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); + + if (adev->nbio.ras && + adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) + adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); +} + +static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj, + struct amdgpu_iv_entry *entry) +{ + bool poison_stat = true, need_reset = true; + struct amdgpu_device *adev = obj->adev; + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct amdgpu_ras_block_object *block_obj = + amdgpu_ras_get_ras_block(adev, obj->head.block, 0); + + if (!adev->gmc.xgmi.connected_to_cpu) + amdgpu_umc_poison_handler(adev, &err_data, false); + + /* both query_poison_status and handle_poison_consumption are optional */ + if (block_obj && block_obj->hw_ops) { + if (block_obj->hw_ops->query_poison_status) { + poison_stat = block_obj->hw_ops->query_poison_status(adev); + if (!poison_stat) + dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", + block_obj->ras_comm.name); + } + + if (poison_stat && block_obj->hw_ops->handle_poison_consumption) { + poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); + need_reset = poison_stat; + } + } + + /* gpu reset is fallback for all failed cases */ + if (need_reset) + amdgpu_ras_reset_gpu(adev); +} + +static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, + struct amdgpu_iv_entry *entry) +{ + dev_info(obj->adev->dev, + "Poison is created, no user action is needed.\n"); +} + +static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, + struct amdgpu_iv_entry *entry) +{ + struct ras_ih_data *data = &obj->ih_data; + struct ras_err_data err_data = {0, 0, 0, NULL}; + int ret; + + if (!data->cb) + return; + + /* Let IP handle its data, maybe we need get the output + * from the callback to update the error type/count, etc + */ + ret = data->cb(obj->adev, &err_data, entry); + /* ue will trigger an interrupt, and in that case + * we need do a reset to recovery the whole system. + * But leave IP do that recovery, here we just dispatch + * the error. + */ + if (ret == AMDGPU_RAS_SUCCESS) { + /* these counts could be left as 0 if + * some blocks do not count error number + */ + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + } +} + static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) { struct ras_ih_data *data = &obj->ih_data; struct amdgpu_iv_entry entry; - int ret; - struct ras_err_data err_data = {0, 0, 0, NULL}; while (data->rptr != data->wptr) { rmb(); @@ -1583,30 +1616,17 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) data->rptr = (data->aligned_element_size + data->rptr) % data->ring_size; - if (data->cb) { - if (amdgpu_ras_is_poison_mode_supported(obj->adev) && - obj->head.block == AMDGPU_RAS_BLOCK__UMC) - dev_info(obj->adev->dev, - "Poison is created, no user action is needed.\n"); - else { - /* Let IP handle its data, maybe we need get the output - * from the callback to udpate the error type/count, etc - */ - memset(&err_data, 0, sizeof(err_data)); - ret = data->cb(obj->adev, &err_data, &entry); - /* ue will trigger an interrupt, and in that case - * we need do a reset to recovery the whole system. - * But leave IP do that recovery, here we just dispatch - * the error. - */ - if (ret == AMDGPU_RAS_SUCCESS) { - /* these counts could be left as 0 if - * some blocks do not count error number - */ - obj->err_data.ue_count += err_data.ue_count; - obj->err_data.ce_count += err_data.ce_count; - } - } + if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { + if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) + amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); + else + amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry); + } else { + if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) + amdgpu_ras_interrupt_umc_handler(obj, &entry); + else + dev_warn(obj->adev->dev, + "No RAS interrupt handler for non-UMC block with poison disabled.\n"); } } } @@ -1647,9 +1667,9 @@ int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, } int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, - struct ras_ih_if *info) + struct ras_common_if *head) { - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); struct ras_ih_data *data; if (!obj) @@ -1669,24 +1689,27 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, } int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, - struct ras_ih_if *info) + struct ras_common_if *head) { - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); struct ras_ih_data *data; + struct amdgpu_ras_block_object *ras_obj; if (!obj) { /* in case we registe the IH before enable ras feature */ - obj = amdgpu_ras_create_obj(adev, &info->head); + obj = amdgpu_ras_create_obj(adev, head); if (!obj) return -EINVAL; } else get_obj(obj); + ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm); + data = &obj->ih_data; /* add the callback.etc */ *data = (struct ras_ih_data) { .inuse = 0, - .cb = info->cb, + .cb = ras_obj->ras_cb, .element_size = sizeof(struct amdgpu_iv_entry), .rptr = 0, .wptr = 0, @@ -1715,10 +1738,7 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) struct ras_manager *obj, *tmp; list_for_each_entry_safe(obj, tmp, &con->head, node) { - struct ras_ih_if info = { - .head = obj->head, - }; - amdgpu_ras_interrupt_remove_handler(adev, &info); + amdgpu_ras_interrupt_remove_handler(adev, &obj->head); } return 0; @@ -1766,24 +1786,28 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, struct ras_query_if *info) { + struct amdgpu_ras_block_object *block_obj; /* * Only two block need to query read/write * RspStatus at current state */ - switch (info->head.block) { - case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.ras_funcs && - adev->gfx.ras_funcs->query_ras_error_status) - adev->gfx.ras_funcs->query_ras_error_status(adev); - break; - case AMDGPU_RAS_BLOCK__MMHUB: - if (adev->mmhub.ras_funcs && - adev->mmhub.ras_funcs->query_ras_error_status) - adev->mmhub.ras_funcs->query_ras_error_status(adev); - break; - default: - break; + if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) && + (info->head.block != AMDGPU_RAS_BLOCK__MMHUB)) + return; + + block_obj = amdgpu_ras_get_ras_block(adev, + info->head.block, + info->head.sub_block_index); + + if (!block_obj || !block_obj->hw_ops) { + dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", + get_ras_block_str(&info->head)); + return; } + + if (block_obj->hw_ops->query_ras_error_status) + block_obj->hw_ops->query_ras_error_status(adev); + } static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) @@ -1897,7 +1921,6 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); if (!bps) { - kfree(bps); return -ENOMEM; } @@ -2118,6 +2141,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) mutex_init(&con->recovery_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); atomic_set(&con->in_recovery, 0); + con->eeprom_control.bad_channel_bitmap = 0; max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(); amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); @@ -2141,8 +2165,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) if (ret) goto free; - if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num) - adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs); + amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); + + if (con->update_channel_flag == true) { + amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); + con->update_channel_flag = false; + } } #ifdef CONFIG_X86_MCE_AMD @@ -2250,6 +2278,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) dev_info(adev->dev, "SRAM ECC is active.\n"); adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); + + if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0)) + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | + 1 << AMDGPU_RAS_BLOCK__JPEG); + else + adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | + 1 << AMDGPU_RAS_BLOCK__JPEG); } else { dev_info(adev->dev, "SRAM ECC is not presented.\n"); } @@ -2336,6 +2371,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto release_con; } + con->update_channel_flag = false; con->features = 0; INIT_LIST_HEAD(&con->head); /* Might need get this flag from vbios. */ @@ -2348,24 +2384,27 @@ int amdgpu_ras_init(struct amdgpu_device *adev) case CHIP_VEGA20: case CHIP_ARCTURUS: case CHIP_ALDEBARAN: - if (!adev->gmc.xgmi.connected_to_cpu) - adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs; + if (!adev->gmc.xgmi.connected_to_cpu) { + adev->nbio.ras = &nbio_v7_4_ras; + amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block); + adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm; + } break; default: /* nbio ras is not available */ break; } - if (adev->nbio.ras_funcs && - adev->nbio.ras_funcs->init_ras_controller_interrupt) { - r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev); + if (adev->nbio.ras && + adev->nbio.ras->init_ras_controller_interrupt) { + r = adev->nbio.ras->init_ras_controller_interrupt(adev); if (r) goto release_con; } - if (adev->nbio.ras_funcs && - adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) { - r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev); + if (adev->nbio.ras && + adev->nbio.ras->init_ras_err_event_athub_interrupt) { + r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev); if (r) goto release_con; } @@ -2377,12 +2416,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev) } else if (adev->df.funcs && adev->df.funcs->query_ras_poison_mode && - adev->umc.ras_funcs && - adev->umc.ras_funcs->query_ras_poison_mode) { + adev->umc.ras && + adev->umc.ras->query_ras_poison_mode) { df_poison = adev->df.funcs->query_ras_poison_mode(adev); umc_poison = - adev->umc.ras_funcs->query_ras_poison_mode(adev); + adev->umc.ras->query_ras_poison_mode(adev); /* Only poison is set in both DF and UMC, we can support it */ if (df_poison && umc_poison) con->poison_supported = true; @@ -2445,11 +2484,10 @@ bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev) } /* helper function to handle common stuff in ip late init phase */ -int amdgpu_ras_late_init(struct amdgpu_device *adev, - struct ras_common_if *ras_block, - struct ras_fs_if *fs_info, - struct ras_ih_if *ih_info) +int amdgpu_ras_block_late_init(struct amdgpu_device *adev, + struct ras_common_if *ras_block) { + struct amdgpu_ras_block_object *ras_obj = NULL; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); unsigned long ue_count, ce_count; int r; @@ -2477,15 +2515,16 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, if (adev->in_suspend || amdgpu_in_reset(adev)) return 0; - if (ih_info->cb) { - r = amdgpu_ras_interrupt_add_handler(adev, ih_info); + ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); + if (ras_obj->ras_cb) { + r = amdgpu_ras_interrupt_add_handler(adev, ras_block); if (r) - goto interrupt; + goto cleanup; } - r = amdgpu_ras_sysfs_create(adev, fs_info); + r = amdgpu_ras_sysfs_create(adev, ras_block); if (r) - goto sysfs; + goto interrupt; /* Those are the cached values at init. */ @@ -2495,27 +2534,40 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, } return 0; -cleanup: - amdgpu_ras_sysfs_remove(adev, ras_block); -sysfs: - if (ih_info->cb) - amdgpu_ras_interrupt_remove_handler(adev, ih_info); + interrupt: + if (ras_obj->ras_cb) + amdgpu_ras_interrupt_remove_handler(adev, ras_block); +cleanup: amdgpu_ras_feature_enable(adev, ras_block, 0); return r; } +static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + return amdgpu_ras_block_late_init(adev, ras_block); +} + /* helper function to remove ras fs node and interrupt handler */ -void amdgpu_ras_late_fini(struct amdgpu_device *adev, - struct ras_common_if *ras_block, - struct ras_ih_if *ih_info) +void amdgpu_ras_block_late_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block) { - if (!ras_block || !ih_info) + struct amdgpu_ras_block_object *ras_obj; + if (!ras_block) return; amdgpu_ras_sysfs_remove(adev, ras_block); - if (ih_info->cb) - amdgpu_ras_interrupt_remove_handler(adev, ih_info); + + ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); + if (ras_obj->ras_cb) + amdgpu_ras_interrupt_remove_handler(adev, ras_block); +} + +static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + return amdgpu_ras_block_late_fini(adev, ras_block); } /* do some init work after IP late init as dependence. @@ -2568,6 +2620,33 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev) amdgpu_ras_disable_all_features(adev, 1); } +int amdgpu_ras_late_init(struct amdgpu_device *adev) +{ + struct amdgpu_ras_block_list *node, *tmp; + struct amdgpu_ras_block_object *obj; + int r; + + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { + if (!node->ras_obj) { + dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); + continue; + } + + obj = node->ras_obj; + if (obj->ras_late_init) { + r = obj->ras_late_init(adev, &obj->ras_comm); + if (r) { + dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n", + obj->ras_comm.name, r); + return r; + } + } else + amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); + } + + return 0; +} + /* do some fini work before IP fini as dependence */ int amdgpu_ras_pre_fini(struct amdgpu_device *adev) { @@ -2585,11 +2664,28 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev) int amdgpu_ras_fini(struct amdgpu_device *adev) { + struct amdgpu_ras_block_list *ras_node, *tmp; + struct amdgpu_ras_block_object *obj = NULL; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); if (!adev->ras_enabled || !con) return 0; + list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { + if (ras_node->ras_obj) { + obj = ras_node->ras_obj; + if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && + obj->ras_fini) + obj->ras_fini(adev, &obj->ras_comm); + else + amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); + } + + /* Clear ras blocks from ras_list and free ras block list node */ + list_del(&ras_node->node); + kfree(ras_node); + } + amdgpu_ras_fs_fini(adev); amdgpu_ras_interrupt_remove_all(adev); @@ -2717,8 +2813,6 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb, dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", umc_inst, ch_inst); - memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); - /* * Translate UMC channel address to Physical address */ @@ -2730,16 +2824,10 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb, ADDR_OF_256B_BLOCK(channel_index) | OFFSET_IN_256B_BLOCK(m->addr); - err_rec.address = m->addr; - err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT; - err_rec.ts = (uint64_t)ktime_get_real_seconds(); - err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; - err_rec.cu = 0; - err_rec.mem_channel = channel_index; - err_rec.mcumc_id = umc_inst; - + memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); err_data.err_addr = &err_rec; - err_data.err_addr_cnt = 1; + amdgpu_umc_fill_error_record(&err_data, m->addr, + retired_page, channel_index, umc_inst); if (amdgpu_bad_page_threshold != 0) { amdgpu_ras_add_bad_pages(adev, err_data.err_addr, @@ -2777,3 +2865,63 @@ static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) } } #endif + +struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev) +{ + if (!adev) + return NULL; + + return adev->psp.ras_context.ras; +} + +int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con) +{ + if (!adev) + return -EINVAL; + + adev->psp.ras_context.ras = ras_con; + return 0; +} + +/* check if ras is supported on block, say, sdma, gfx */ +int amdgpu_ras_is_supported(struct amdgpu_device *adev, + unsigned int block) +{ + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + if (block >= AMDGPU_RAS_BLOCK_COUNT) + return 0; + return ras && (adev->ras_enabled & (1 << block)); +} + +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) +{ + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) + schedule_work(&ras->recovery_work); + return 0; +} + + +/* Register each ip ras block into amdgpu ras */ +int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, + struct amdgpu_ras_block_object *ras_block_obj) +{ + struct amdgpu_ras_block_list *ras_node; + if (!adev || !ras_block_obj) + return -EINVAL; + + if (!amdgpu_ras_asic_supported(adev)) + return 0; + + ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL); + if (!ras_node) + return -ENOMEM; + + INIT_LIST_HEAD(&ras_node->node); + ras_node->ras_obj = ras_block_obj; + list_add_tail(&ras_node->node, &adev->ras_list); + + return 0; +} |