Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 682
1 file changed, 415 insertions(+), 267 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8f47c14ecbc7..7e126dff004f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -66,6 +66,8 @@ const char *ras_block_string[] = {
"mp1",
"fuse",
"mca",
+ "vcn",
+ "jpeg",
};
const char *ras_mca_block_string[] = {
@@ -75,6 +77,13 @@ const char *ras_mca_block_string[] = {
"mca_iohc",
};
+struct amdgpu_ras_block_list {
+ /* ras block link */
+ struct list_head node;
+
+ struct amdgpu_ras_block_object *ras_obj;
+};
+
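For context, the block object carried by each list node is defined in amdgpu_ras.h rather than in this file; a rough sketch of its shape, as the code below assumes it (field details may differ):

    /* sketch of struct amdgpu_ras_block_object from amdgpu_ras.h, for orientation */
    struct amdgpu_ras_block_object {
            struct ras_common_if ras_comm;  /* embedded common head */
            int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
                                   enum amdgpu_ras_block block,
                                   uint32_t sub_block_index);
            int (*ras_late_init)(struct amdgpu_device *adev,
                                 struct ras_common_if *ras_block);
            void (*ras_fini)(struct amdgpu_device *adev,
                             struct ras_common_if *ras_block);
            ras_ih_cb ras_cb;               /* interrupt callback, may be NULL */
            const struct amdgpu_ras_block_hw_ops *hw_ops;
    };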
const char *get_ras_block_str(struct ras_common_if *ras_block)
{
if (!ras_block)
@@ -89,6 +98,9 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
return ras_block_string[ras_block->block];
}
+#define ras_block_str(_BLOCK_) \
+ (((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")
+
#define ras_err_str(i) (ras_error_string[ffs(i)])
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
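Unlike get_ras_block_str(), which also knows about the MCA sub-block names, the new macro indexes ras_block_string[] directly, so the bounds check is what keeps a stray enum value from walking off the table:

    /* hypothetical illustration of the bounds check */
    ras_block_str(AMDGPU_RAS_BLOCK__GFX);   /* "gfx" */
    ras_block_str(AMDGPU_RAS_BLOCK__LAST);  /* past the table: "Out Of Range" */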
@@ -155,14 +167,9 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
}
memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
-
- err_rec.address = address;
- err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT;
- err_rec.ts = (uint64_t)ktime_get_real_seconds();
- err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
-
err_data.err_addr = &err_rec;
- err_data.err_addr_cnt = 1;
+ amdgpu_umc_fill_error_record(&err_data, address,
+ (address >> AMDGPU_GPU_PAGE_SHIFT), 0, 0);
if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
@@ -452,7 +459,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
}
if (ret)
- return -EINVAL;
+ return ret;
return size;
}
@@ -866,30 +873,47 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
}
/* feature ctl end */
+static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
+ enum amdgpu_ras_block block)
+{
+ if (!block_obj)
+ return -EINVAL;
+
+ if (block_obj->ras_comm.block == block)
+ return 0;
+
+ return -EINVAL;
+}
-static void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
- struct ras_common_if *ras_block,
- struct ras_err_data *err_data)
+static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, uint32_t sub_block_index)
{
- switch (ras_block->sub_block_index) {
- case AMDGPU_RAS_MCA_BLOCK__MP0:
- if (adev->mca.mp0.ras_funcs &&
- adev->mca.mp0.ras_funcs->query_ras_error_count)
- adev->mca.mp0.ras_funcs->query_ras_error_count(adev, &err_data);
- break;
- case AMDGPU_RAS_MCA_BLOCK__MP1:
- if (adev->mca.mp1.ras_funcs &&
- adev->mca.mp1.ras_funcs->query_ras_error_count)
- adev->mca.mp1.ras_funcs->query_ras_error_count(adev, &err_data);
- break;
- case AMDGPU_RAS_MCA_BLOCK__MPIO:
- if (adev->mca.mpio.ras_funcs &&
- adev->mca.mpio.ras_funcs->query_ras_error_count)
- adev->mca.mpio.ras_funcs->query_ras_error_count(adev, &err_data);
- break;
- default:
- break;
+ struct amdgpu_ras_block_list *node, *tmp;
+ struct amdgpu_ras_block_object *obj;
+
+ if (block >= AMDGPU_RAS_BLOCK__LAST)
+ return NULL;
+
+ if (!amdgpu_ras_is_supported(adev, block))
+ return NULL;
+
+ list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+ if (!node->ras_obj) {
+ dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
+ continue;
+ }
+
+ obj = node->ras_obj;
+ if (obj->ras_block_match) {
+ if (obj->ras_block_match(obj, block, sub_block_index) == 0)
+ return obj;
+ } else {
+ if (amdgpu_ras_block_match_default(obj, block) == 0)
+ return obj;
+ }
}
+
+ return NULL;
}
static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
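A block that needs finer matching than the default type comparison (for instance, per-sub-block dispatch) can supply its own ras_block_match; a hypothetical override, with an invented name:

    /* hypothetical: match on both block type and sub-block index */
    static int xxx_ras_block_match(struct amdgpu_ras_block_object *block_obj,
                                   enum amdgpu_ras_block block,
                                   uint32_t sub_block_index)
    {
            if (block_obj->ras_comm.block == block &&
                block_obj->ras_comm.sub_block_index == sub_block_index)
                    return 0;

            return -EINVAL;
    }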
@@ -901,26 +925,26 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
* choosing right query method according to
* whether smu support query error information
*/
- ret = smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc));
+ ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
if (ret == -EOPNOTSUPP) {
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_count)
- adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
+ if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
+ adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
+ adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
/* umc query_ras_error_address is also responsible for clearing
* error status
*/
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_error_address)
- adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
+ if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
+ adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
+ adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
} else if (!ret) {
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->ecc_info_query_ras_error_count)
- adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, err_data);
+ if (adev->umc.ras &&
+ adev->umc.ras->ecc_info_query_ras_error_count)
+ adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);
- if (adev->umc.ras_funcs &&
- adev->umc.ras_funcs->ecc_info_query_ras_error_address)
- adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, err_data);
+ if (adev->umc.ras &&
+ adev->umc.ras->ecc_info_query_ras_error_address)
+ adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
}
}
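The hw_ops table dereferenced above is the hardware-facing half of the block object. Roughly, per amdgpu_ras.h (not part of this diff; every callback is optional and NULL-checked by the callers here):

    struct amdgpu_ras_block_hw_ops {
            int  (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
            void (*query_ras_error_count)(struct amdgpu_device *adev,
                                          void *ras_error_status);
            void (*query_ras_error_status)(struct amdgpu_device *adev);
            void (*query_ras_error_address)(struct amdgpu_device *adev,
                                            void *ras_error_status);
            void (*reset_ras_error_count)(struct amdgpu_device *adev);
            void (*reset_ras_error_status)(struct amdgpu_device *adev);
            bool (*query_poison_status)(struct amdgpu_device *adev);
            bool (*handle_poison_consumption)(struct amdgpu_device *adev);
    };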
@@ -928,62 +952,32 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info)
{
+ struct amdgpu_ras_block_object *block_obj = NULL;
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data = {0, 0, 0, NULL};
- int i;
if (!obj)
return -EINVAL;
- switch (info->head.block) {
- case AMDGPU_RAS_BLOCK__UMC:
+ if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
amdgpu_ras_get_ecc_info(adev, &err_data);
- break;
- case AMDGPU_RAS_BLOCK__SDMA:
- if (adev->sdma.funcs->query_ras_error_count) {
- for (i = 0; i < adev->sdma.num_instances; i++)
- adev->sdma.funcs->query_ras_error_count(adev, i,
- &err_data);
+ } else {
+ block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+ if (!block_obj || !block_obj->hw_ops) {
+ dev_dbg_once(adev->dev, "%s doesn't configure RAS functions\n",
+ get_ras_block_str(&info->head));
+ return -EINVAL;
}
- break;
- case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.ras_funcs &&
- adev->gfx.ras_funcs->query_ras_error_count)
- adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data);
-
- if (adev->gfx.ras_funcs &&
- adev->gfx.ras_funcs->query_ras_error_status)
- adev->gfx.ras_funcs->query_ras_error_status(adev);
- break;
- case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.ras_funcs &&
- adev->mmhub.ras_funcs->query_ras_error_count)
- adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data);
-
- if (adev->mmhub.ras_funcs &&
- adev->mmhub.ras_funcs->query_ras_error_status)
- adev->mmhub.ras_funcs->query_ras_error_status(adev);
- break;
- case AMDGPU_RAS_BLOCK__PCIE_BIF:
- if (adev->nbio.ras_funcs &&
- adev->nbio.ras_funcs->query_ras_error_count)
- adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
- break;
- case AMDGPU_RAS_BLOCK__XGMI_WAFL:
- if (adev->gmc.xgmi.ras_funcs &&
- adev->gmc.xgmi.ras_funcs->query_ras_error_count)
- adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data);
- break;
- case AMDGPU_RAS_BLOCK__HDP:
- if (adev->hdp.ras_funcs &&
- adev->hdp.ras_funcs->query_ras_error_count)
- adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
- break;
- case AMDGPU_RAS_BLOCK__MCA:
- amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data);
- break;
- default:
- break;
+
+ if (block_obj->hw_ops->query_ras_error_count)
+ block_obj->hw_ops->query_ras_error_count(adev, &err_data);
+
+ if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
+ (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
+ (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
+ if (block_obj->hw_ops->query_ras_error_status)
+ block_obj->hw_ops->query_ras_error_status(adev);
+ }
}
obj->err_data.ue_count += err_data.ue_count;
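From the caller's side the query is now uniform across blocks; a minimal sketch, assuming the ras_query_if layout from amdgpu_ras.h:

    struct ras_query_if info = {
            .head = { .block = AMDGPU_RAS_BLOCK__GFX },
    };

    if (!amdgpu_ras_query_error_status(adev, &info))
            dev_info(adev->dev, "ue:%lu ce:%lu\n", info.ue_count, info.ce_count);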
@@ -1040,68 +1034,27 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
enum amdgpu_ras_block block)
{
+ struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
+
if (!amdgpu_ras_is_supported(adev, block))
return -EINVAL;
- switch (block) {
- case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.ras_funcs &&
- adev->gfx.ras_funcs->reset_ras_error_count)
- adev->gfx.ras_funcs->reset_ras_error_count(adev);
-
- if (adev->gfx.ras_funcs &&
- adev->gfx.ras_funcs->reset_ras_error_status)
- adev->gfx.ras_funcs->reset_ras_error_status(adev);
- break;
- case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.ras_funcs &&
- adev->mmhub.ras_funcs->reset_ras_error_count)
- adev->mmhub.ras_funcs->reset_ras_error_count(adev);
-
- if (adev->mmhub.ras_funcs &&
- adev->mmhub.ras_funcs->reset_ras_error_status)
- adev->mmhub.ras_funcs->reset_ras_error_status(adev);
- break;
- case AMDGPU_RAS_BLOCK__SDMA:
- if (adev->sdma.funcs->reset_ras_error_count)
- adev->sdma.funcs->reset_ras_error_count(adev);
- break;
- case AMDGPU_RAS_BLOCK__HDP:
- if (adev->hdp.ras_funcs &&
- adev->hdp.ras_funcs->reset_ras_error_count)
- adev->hdp.ras_funcs->reset_ras_error_count(adev);
- break;
- default:
- break;
+ if (!block_obj || !block_obj->hw_ops) {
+ dev_dbg_once(adev->dev, "%s doesn't configure RAS functions\n",
+ ras_block_str(block));
+ return -EINVAL;
}
- return 0;
-}
-
-/* Trigger XGMI/WAFL error */
-static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
- struct ta_ras_trigger_error_input *block_info)
-{
- int ret;
-
- if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
- dev_warn(adev->dev, "Failed to disallow df cstate");
-
- if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
- dev_warn(adev->dev, "Failed to disallow XGMI power down");
-
- ret = psp_ras_trigger_error(&adev->psp, block_info);
+ if (block_obj->hw_ops->reset_ras_error_count)
+ block_obj->hw_ops->reset_ras_error_count(adev);
- if (amdgpu_ras_intr_triggered())
- return ret;
-
- if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
- dev_warn(adev->dev, "Failed to allow XGMI power down");
-
- if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
- dev_warn(adev->dev, "Failed to allow df cstate");
+ if ((block == AMDGPU_RAS_BLOCK__GFX) ||
+ (block == AMDGPU_RAS_BLOCK__MMHUB)) {
+ if (block_obj->hw_ops->reset_ras_error_status)
+ block_obj->hw_ops->reset_ras_error_status(adev);
+ }
- return ret;
+ return 0;
}
/* wrapper of psp_ras_trigger_error */
@@ -1116,11 +1069,20 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
.address = info->address,
.value = info->value,
};
- int ret = 0;
+ int ret = -EINVAL;
+ struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
+ info->head.block,
+ info->head.sub_block_index);
if (!obj)
return -EINVAL;
+ if (!block_obj || !block_obj->hw_ops) {
+ dev_dbg_once(adev->dev, "%s doesn't configure RAS functions\n",
+ get_ras_block_str(&info->head));
+ return -EINVAL;
+ }
+
/* Calculate XGMI relative offset */
if (adev->gmc.xgmi.num_physical_nodes > 1) {
block_info.address =
@@ -1128,28 +1090,15 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
block_info.address);
}
- switch (info->head.block) {
- case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.ras_funcs &&
- adev->gfx.ras_funcs->ras_error_inject)
- ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
- else
- ret = -EINVAL;
- break;
- case AMDGPU_RAS_BLOCK__UMC:
- case AMDGPU_RAS_BLOCK__SDMA:
- case AMDGPU_RAS_BLOCK__MMHUB:
- case AMDGPU_RAS_BLOCK__PCIE_BIF:
- case AMDGPU_RAS_BLOCK__MCA:
- ret = psp_ras_trigger_error(&adev->psp, &block_info);
- break;
- case AMDGPU_RAS_BLOCK__XGMI_WAFL:
- ret = amdgpu_ras_error_inject_xgmi(adev, &block_info);
- break;
- default:
- dev_info(adev->dev, "%s error injection is not supported yet\n",
- get_ras_block_str(&info->head));
- ret = -EINVAL;
+ if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
+ if (block_obj->hw_ops->ras_error_inject)
+ ret = block_obj->hw_ops->ras_error_inject(adev, info);
+ } else {
+ /* If the block defines its own ras_error_inject (e.g. xgmi), use it */
+ if (block_obj->hw_ops->ras_error_inject)
+ ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
+ else /* otherwise fall back to the default psp-based injection */
+ ret = psp_ras_trigger_error(&adev->psp, &block_info);
}
if (ret)
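Injection likewise funnels through a single entry point now; a hedged caller sketch (address and value are placeholders):

    struct ras_inject_if inject = {
            .head    = { .block = AMDGPU_RAS_BLOCK__UMC },
            .address = 0x0, /* placeholder */
            .value   = 0x0, /* placeholder */
    };
    int ret = amdgpu_ras_error_inject(adev, &inject);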
@@ -1329,18 +1278,17 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
}
int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
- struct ras_fs_if *head)
+ struct ras_common_if *head)
{
- struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
if (!obj || obj->attr_inuse)
return -EINVAL;
get_obj(obj);
- memcpy(obj->fs_data.sysfs_name,
- head->sysfs_name,
- sizeof(obj->fs_data.sysfs_name));
+ snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
+ "%s_err_count", head->name);
obj->sysfs_attr = (struct device_attribute){
.attr = {
@@ -1567,12 +1515,97 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
/* ras fs end */
/* ih begin */
+
+/* For the hardware that cannot enable bif ring for both ras_controller_irq
+ * and ras_err_event_athub_irq ih cookies, the driver has to poll the status
+ * register to check whether the interrupt is triggered or not, and properly
+ * ack the interrupt if it is there.
+ */
+void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
+{
+ if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
+ return;
+
+ if (adev->nbio.ras &&
+ adev->nbio.ras->handle_ras_controller_intr_no_bifring)
+ adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
+
+ if (adev->nbio.ras &&
+ adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
+ adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
+}
+
+static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
+ struct amdgpu_iv_entry *entry)
+{
+ bool poison_stat = true, need_reset = true;
+ struct amdgpu_device *adev = obj->adev;
+ struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct amdgpu_ras_block_object *block_obj =
+ amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+
+ if (!adev->gmc.xgmi.connected_to_cpu)
+ amdgpu_umc_poison_handler(adev, &err_data, false);
+
+ /* both query_poison_status and handle_poison_consumption are optional */
+ if (block_obj && block_obj->hw_ops) {
+ if (block_obj->hw_ops->query_poison_status) {
+ poison_stat = block_obj->hw_ops->query_poison_status(adev);
+ if (!poison_stat)
+ dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
+ block_obj->ras_comm.name);
+ }
+
+ if (poison_stat && block_obj->hw_ops->handle_poison_consumption) {
+ poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
+ need_reset = poison_stat;
+ }
+ }
+
+ /* gpu reset is fallback for all failed cases */
+ if (need_reset)
+ amdgpu_ras_reset_gpu(adev);
+}
+
+static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
+ struct amdgpu_iv_entry *entry)
+{
+ dev_info(obj->adev->dev,
+ "Poison is created, no user action is needed.\n");
+}
+
+static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
+ struct amdgpu_iv_entry *entry)
+{
+ struct ras_ih_data *data = &obj->ih_data;
+ struct ras_err_data err_data = {0, 0, 0, NULL};
+ int ret;
+
+ if (!data->cb)
+ return;
+
+ /* Let IP handle its data, maybe we need to get the output
+ * from the callback to update the error type/count, etc
+ */
+ ret = data->cb(obj->adev, &err_data, entry);
+ /* ue will trigger an interrupt, and in that case
+ * we need to do a reset to recover the whole system.
+ * But leave the IP to do that recovery; here we just dispatch
+ * the error.
+ */
+ if (ret == AMDGPU_RAS_SUCCESS) {
+ /* these counts could be left as 0 if
+ * some blocks do not count the number of errors
+ */
+ obj->err_data.ue_count += err_data.ue_count;
+ obj->err_data.ce_count += err_data.ce_count;
+ }
+}
+
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
struct ras_ih_data *data = &obj->ih_data;
struct amdgpu_iv_entry entry;
- int ret;
- struct ras_err_data err_data = {0, 0, 0, NULL};
while (data->rptr != data->wptr) {
rmb();
@@ -1583,30 +1616,17 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
data->rptr = (data->aligned_element_size +
data->rptr) % data->ring_size;
- if (data->cb) {
- if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
- obj->head.block == AMDGPU_RAS_BLOCK__UMC)
- dev_info(obj->adev->dev,
- "Poison is created, no user action is needed.\n");
- else {
- /* Let IP handle its data, maybe we need get the output
- * from the callback to udpate the error type/count, etc
- */
- memset(&err_data, 0, sizeof(err_data));
- ret = data->cb(obj->adev, &err_data, &entry);
- /* ue will trigger an interrupt, and in that case
- * we need do a reset to recovery the whole system.
- * But leave IP do that recovery, here we just dispatch
- * the error.
- */
- if (ret == AMDGPU_RAS_SUCCESS) {
- /* these counts could be left as 0 if
- * some blocks do not count error number
- */
- obj->err_data.ue_count += err_data.ue_count;
- obj->err_data.ce_count += err_data.ce_count;
- }
- }
+ if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
+ if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+ amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
+ else
+ amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry);
+ } else {
+ if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+ amdgpu_ras_interrupt_umc_handler(obj, &entry);
+ else
+ dev_warn(obj->adev->dev,
+ "No RAS interrupt handler for non-UMC block with poison disabled.\n");
}
}
}
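Both poison hooks used in the consumption handler are optional; an IP opts in by filling them in its hw_ops. A hypothetical opt-in, with invented names:

    static bool xxx_query_poison_status(struct amdgpu_device *adev)
    {
            /* read an IP-specific status register; true means poison was consumed */
            return true;
    }

    static const struct amdgpu_ras_block_hw_ops xxx_ras_hw_ops = {
            .query_poison_status = xxx_query_poison_status,
            /* .handle_poison_consumption may also be set; returning false
             * from it suppresses the fallback GPU reset above */
    };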
@@ -1647,9 +1667,9 @@ int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
}
int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
- struct ras_ih_if *info)
+ struct ras_common_if *head)
{
- struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
struct ras_ih_data *data;
if (!obj)
@@ -1669,24 +1689,27 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
}
int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
- struct ras_ih_if *info)
+ struct ras_common_if *head)
{
- struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
struct ras_ih_data *data;
+ struct amdgpu_ras_block_object *ras_obj;
if (!obj) {
/* in case we register the IH before enabling the ras feature */
- obj = amdgpu_ras_create_obj(adev, &info->head);
+ obj = amdgpu_ras_create_obj(adev, head);
if (!obj)
return -EINVAL;
} else
get_obj(obj);
+ ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);
+
data = &obj->ih_data;
/* add the callback.etc */
*data = (struct ras_ih_data) {
.inuse = 0,
- .cb = info->cb,
+ .cb = ras_obj->ras_cb,
.element_size = sizeof(struct amdgpu_iv_entry),
.rptr = 0,
.wptr = 0,
@@ -1715,10 +1738,7 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
struct ras_manager *obj, *tmp;
list_for_each_entry_safe(obj, tmp, &con->head, node) {
- struct ras_ih_if info = {
- .head = obj->head,
- };
- amdgpu_ras_interrupt_remove_handler(adev, &info);
+ amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
}
return 0;
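Note the contract behind the new add_handler signature: the head passed in must be the ras_comm embedded in an amdgpu_ras_block_object, since the container_of() above recovers the enclosing object from it. A standalone ras_common_if would make that conversion invalid:

    /* valid: head is the ras_comm embedded in a block object */
    amdgpu_ras_interrupt_add_handler(adev, &ras_obj->ras_comm);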
@@ -1766,24 +1786,28 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
struct ras_query_if *info)
{
+ struct amdgpu_ras_block_object *block_obj;
/*
* Only two blocks need to query read/write
* RspStatus at current state
*/
- switch (info->head.block) {
- case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.ras_funcs &&
- adev->gfx.ras_funcs->query_ras_error_status)
- adev->gfx.ras_funcs->query_ras_error_status(adev);
- break;
- case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.ras_funcs &&
- adev->mmhub.ras_funcs->query_ras_error_status)
- adev->mmhub.ras_funcs->query_ras_error_status(adev);
- break;
- default:
- break;
+ if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
+ (info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
+ return;
+
+ block_obj = amdgpu_ras_get_ras_block(adev,
+ info->head.block,
+ info->head.sub_block_index);
+
+ if (!block_obj || !block_obj->hw_ops) {
+ dev_dbg_once(adev->dev, "%s doesn't configure RAS functions\n",
+ get_ras_block_str(&info->head));
+ return;
}
+
+ if (block_obj->hw_ops->query_ras_error_status)
+ block_obj->hw_ops->query_ras_error_status(adev);
+
}
static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
@@ -1897,7 +1921,6 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
if (!bps) {
- kfree(bps);
return -ENOMEM;
}
@@ -2118,6 +2141,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
+ con->eeprom_control.bad_channel_bitmap = 0;
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
@@ -2141,8 +2165,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
if (ret)
goto free;
- if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
- adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
+ amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
+
+ if (con->update_channel_flag == true) {
+ amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
+ con->update_channel_flag = false;
+ }
}
#ifdef CONFIG_X86_MCE_AMD
@@ -2250,6 +2278,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
dev_info(adev->dev, "SRAM ECC is active.\n");
adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
1 << AMDGPU_RAS_BLOCK__DF);
+
+ if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0))
+ adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+ else
+ adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
} else {
dev_info(adev->dev, "SRAM ECC is not presented.\n");
}
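With the new bits in ras_hw_enabled, the existing helper gates the VCN/JPEG paths; e.g.:

    /* sketch: true only when the VCN bit was set above (VCN IP v2.6.0) */
    if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__VCN))
            dev_info(adev->dev, "VCN RAS is enabled\n");    /* placeholder action */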
@@ -2336,6 +2371,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
+ con->update_channel_flag = false;
con->features = 0;
INIT_LIST_HEAD(&con->head);
/* Might need get this flag from vbios. */
@@ -2348,24 +2384,27 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
case CHIP_VEGA20:
case CHIP_ARCTURUS:
case CHIP_ALDEBARAN:
- if (!adev->gmc.xgmi.connected_to_cpu)
- adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs;
+ if (!adev->gmc.xgmi.connected_to_cpu) {
+ adev->nbio.ras = &nbio_v7_4_ras;
+ amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block);
+ adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
+ }
break;
default:
/* nbio ras is not available */
break;
}
- if (adev->nbio.ras_funcs &&
- adev->nbio.ras_funcs->init_ras_controller_interrupt) {
- r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev);
+ if (adev->nbio.ras &&
+ adev->nbio.ras->init_ras_controller_interrupt) {
+ r = adev->nbio.ras->init_ras_controller_interrupt(adev);
if (r)
goto release_con;
}
- if (adev->nbio.ras_funcs &&
- adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) {
- r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev);
+ if (adev->nbio.ras &&
+ adev->nbio.ras->init_ras_err_event_athub_interrupt) {
+ r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
if (r)
goto release_con;
}
@@ -2377,12 +2416,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
}
else if (adev->df.funcs &&
adev->df.funcs->query_ras_poison_mode &&
- adev->umc.ras_funcs &&
- adev->umc.ras_funcs->query_ras_poison_mode) {
+ adev->umc.ras &&
+ adev->umc.ras->query_ras_poison_mode) {
df_poison =
adev->df.funcs->query_ras_poison_mode(adev);
umc_poison =
- adev->umc.ras_funcs->query_ras_poison_mode(adev);
+ adev->umc.ras->query_ras_poison_mode(adev);
/* Only poison is set in both DF and UMC, we can support it */
if (df_poison && umc_poison)
con->poison_supported = true;
@@ -2445,11 +2484,10 @@ bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
}
/* helper function to handle common stuff in ip late init phase */
-int amdgpu_ras_late_init(struct amdgpu_device *adev,
- struct ras_common_if *ras_block,
- struct ras_fs_if *fs_info,
- struct ras_ih_if *ih_info)
+int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block)
{
+ struct amdgpu_ras_block_object *ras_obj = NULL;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
unsigned long ue_count, ce_count;
int r;
@@ -2477,15 +2515,16 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
if (adev->in_suspend || amdgpu_in_reset(adev))
return 0;
- if (ih_info->cb) {
- r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
+ ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
+ if (ras_obj->ras_cb) {
+ r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
if (r)
- goto interrupt;
+ goto cleanup;
}
- r = amdgpu_ras_sysfs_create(adev, fs_info);
+ r = amdgpu_ras_sysfs_create(adev, ras_block);
if (r)
- goto sysfs;
+ goto interrupt;
/* Those are the cached values at init.
*/
@@ -2495,27 +2534,40 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
}
return 0;
-cleanup:
- amdgpu_ras_sysfs_remove(adev, ras_block);
-sysfs:
- if (ih_info->cb)
- amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+
interrupt:
+ if (ras_obj->ras_cb)
+ amdgpu_ras_interrupt_remove_handler(adev, ras_block);
+cleanup:
amdgpu_ras_feature_enable(adev, ras_block, 0);
return r;
}
+static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block)
+{
+ return amdgpu_ras_block_late_init(adev, ras_block);
+}
+
/* helper function to remove ras fs node and interrupt handler */
-void amdgpu_ras_late_fini(struct amdgpu_device *adev,
- struct ras_common_if *ras_block,
- struct ras_ih_if *ih_info)
+void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block)
{
- if (!ras_block || !ih_info)
+ struct amdgpu_ras_block_object *ras_obj;
+ if (!ras_block)
return;
amdgpu_ras_sysfs_remove(adev, ras_block);
- if (ih_info->cb)
- amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+
+ ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
+ if (ras_obj->ras_cb)
+ amdgpu_ras_interrupt_remove_handler(adev, ras_block);
+}
+
+static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block)
+{
+ return amdgpu_ras_block_late_fini(adev, ras_block);
}
/* do some init work after IP late init as dependence.
@@ -2568,6 +2620,33 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev)
amdgpu_ras_disable_all_features(adev, 1);
}
+int amdgpu_ras_late_init(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_block_list *node, *tmp;
+ struct amdgpu_ras_block_object *obj;
+ int r;
+
+ list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+ if (!node->ras_obj) {
+ dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
+ continue;
+ }
+
+ obj = node->ras_obj;
+ if (obj->ras_late_init) {
+ r = obj->ras_late_init(adev, &obj->ras_comm);
+ if (r) {
+ dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
+ obj->ras_comm.name, r);
+ return r;
+ }
+ } else
+ amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
+ }
+
+ return 0;
+}
+
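Most blocks leave ras_late_init unset and fall through to the default above; a block needing extra setup can wrap the common helper. A hypothetical override, with an invented name:

    static int xxx_ras_late_init(struct amdgpu_device *adev,
                                 struct ras_common_if *ras_block)
    {
            int r = amdgpu_ras_block_late_init(adev, ras_block);

            if (r)
                    return r;

            /* IP-specific extras, e.g. enabling an interrupt source */
            return 0;
    }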
/* do some fini work before IP fini as dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
@@ -2585,11 +2664,28 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
int amdgpu_ras_fini(struct amdgpu_device *adev)
{
+ struct amdgpu_ras_block_list *ras_node, *tmp;
+ struct amdgpu_ras_block_object *obj = NULL;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
if (!adev->ras_enabled || !con)
return 0;
+ list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
+ if (ras_node->ras_obj) {
+ obj = ras_node->ras_obj;
+ if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
+ obj->ras_fini)
+ obj->ras_fini(adev, &obj->ras_comm);
+ else
+ amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
+ }
+
+ /* Clear ras blocks from ras_list and free ras block list node */
+ list_del(&ras_node->node);
+ kfree(ras_node);
+ }
+
amdgpu_ras_fs_fini(adev);
amdgpu_ras_interrupt_remove_all(adev);
@@ -2717,8 +2813,6 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
umc_inst, ch_inst);
- memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
-
/*
* Translate UMC channel address to Physical address
*/
@@ -2730,16 +2824,10 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
ADDR_OF_256B_BLOCK(channel_index) |
OFFSET_IN_256B_BLOCK(m->addr);
- err_rec.address = m->addr;
- err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
- err_rec.ts = (uint64_t)ktime_get_real_seconds();
- err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
- err_rec.cu = 0;
- err_rec.mem_channel = channel_index;
- err_rec.mcumc_id = umc_inst;
-
+ memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
err_data.err_addr = &err_rec;
- err_data.err_addr_cnt = 1;
+ amdgpu_umc_fill_error_record(&err_data, m->addr,
+ retired_page, channel_index, umc_inst);
if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
@@ -2777,3 +2865,63 @@ static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
}
}
#endif
+
+struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
+{
+ if (!adev)
+ return NULL;
+
+ return adev->psp.ras_context.ras;
+}
+
+int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
+{
+ if (!adev)
+ return -EINVAL;
+
+ adev->psp.ras_context.ras = ras_con;
+ return 0;
+}
+
+/* check if ras is supported on block, say, sdma, gfx */
+int amdgpu_ras_is_supported(struct amdgpu_device *adev,
+ unsigned int block)
+{
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+ if (block >= AMDGPU_RAS_BLOCK_COUNT)
+ return 0;
+ return ras && (adev->ras_enabled & (1 << block));
+}
+
+int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+ if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
+ schedule_work(&ras->recovery_work);
+ return 0;
+}
+
+
+/* Register each ip ras block into amdgpu ras */
+int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
+ struct amdgpu_ras_block_object *ras_block_obj)
+{
+ struct amdgpu_ras_block_list *ras_node;
+ if (!adev || !ras_block_obj)
+ return -EINVAL;
+
+ if (!amdgpu_ras_asic_supported(adev))
+ return 0;
+
+ ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
+ if (!ras_node)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&ras_node->node);
+ ras_node->ras_obj = ras_block_obj;
+ list_add_tail(&ras_node->node, &adev->ras_list);
+
+ return 0;
+}