From a0a2f7bb220945e369de77ea004d96236e9463a6 Mon Sep 17 00:00:00 2001 From: Candice Li Date: Fri, 27 Aug 2021 19:14:36 +0800 Subject: drm/amd/amdgpu: add mpio to ras block Add MPIO to RAS block Signed-off-by: Candice Li Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 96a8fd0ca1df..77140821dc11 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -61,6 +61,7 @@ const char *ras_block_string[] = { "mp0", "mp1", "fuse", + "mpio", }; #define ras_err_str(i) (ras_error_string[ffs(i)]) -- cgit From 3771449bc80fa494c15f366ce1fa9e3168332b6a Mon Sep 17 00:00:00 2001 From: John Clements Date: Thu, 9 Sep 2021 16:05:12 +0800 Subject: drm/amdgpu: Update RAS trigger error block support Added trigger error support for MP0/MP1/MPIO blocks Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 77140821dc11..b5332db4d287 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1028,6 +1028,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, case AMDGPU_RAS_BLOCK__SDMA: case AMDGPU_RAS_BLOCK__MMHUB: case AMDGPU_RAS_BLOCK__PCIE_BIF: + case AMDGPU_RAS_BLOCK__MP0: + case AMDGPU_RAS_BLOCK__MP1: + case AMDGPU_RAS_BLOCK__MPIO: ret = psp_ras_trigger_error(&adev->psp, &block_info); break; case AMDGPU_RAS_BLOCK__XGMI_WAFL: -- cgit From 640ae42efb828be69a9ee6ac88fb3d5a3e678ddf Mon Sep 17 00:00:00 2001 From: John Clements Date: Wed, 22 Sep 2021 14:04:52 +0800 Subject: drm/amdgpu: Updated RAS infrastructure Update RAS infrastructure to support RAS query for MCA subblocks Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 8 +- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 146 ++++++++++++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 19 +++-- drivers/gpu/drm/amd/amdgpu/mca_v3_0.c | 9 +- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 4 +- drivers/gpu/drm/amd/amdgpu/ta_ras_if.h | 11 ++- 7 files changed, 146 insertions(+), 52 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index a2d3dbbf7d25..ce538f4819f9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -31,7 +31,7 @@ void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr, unsigned long *error_count) { - uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4); + uint64_t mc_status = RREG64_PCIE(mc_status_addr); if (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) @@ -42,7 +42,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr, unsigned long *error_count) { - uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4); + uint64_t mc_status = RREG64_PCIE(mc_status_addr); if ((REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 || @@ -56,7 +56,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev, void amdgpu_mca_reset_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr) { - WREG64_PCIE(mc_status_addr * 4, 0x0ULL); + WREG64_PCIE(mc_status_addr, 0x0ULL); } void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev, @@ -87,8 +87,8 @@ int amdgpu_mca_ras_late_init(struct amdgpu_device *adev, if (!mca_dev->ras_if) return -ENOMEM; mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block; + mca_dev->ras_if->sub_block_index = mca_dev->ras_funcs->ras_sub_block; mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; - mca_dev->ras_if->sub_block_index = 0; } ih_info.head = fs_info.head = *mca_dev->ras_if; r = amdgpu_ras_late_init(adev, mca_dev->ras_if, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h index f860f2f0e296..c74bc7177066 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h @@ -29,6 +29,7 @@ struct amdgpu_mca_ras_funcs { void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status); uint32_t ras_block; + uint32_t ras_sub_block; const char* sysfs_name; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b5332db4d287..912ea1f9fd04 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -61,9 +61,30 @@ const char *ras_block_string[] = { "mp0", "mp1", "fuse", - "mpio", + "mca", }; +const char *ras_mca_block_string[] = { + "mca_mp0", + "mca_mp1", + "mca_mpio", + "mca_iohc", +}; + +const char *get_ras_block_str(struct ras_common_if *ras_block) +{ + if (!ras_block) + return "NULL"; + + if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT) + return "OUT OF RANGE"; + + if (ras_block->block == AMDGPU_RAS_BLOCK__MCA) + return ras_mca_block_string[ras_block->sub_block_index]; + + return ras_block_string[ras_block->block]; +} + #define ras_err_str(i) (ras_error_string[ffs(i)]) #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) @@ -188,7 +209,7 @@ static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id) for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { *block_id = i; - if (strcmp(name, ras_block_str(i)) == 0) + if (strcmp(name, ras_block_string[i]) == 0) return 0; } return -EINVAL; @@ -510,7 +531,6 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; - if (obj->adev->asic_type == CHIP_ALDEBARAN) { if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) DRM_WARN("Failed to reset error counter and error status"); @@ -530,7 +550,7 @@ static inline void put_obj(struct ras_manager *obj) if (obj && (--obj->use == 0)) list_del(&obj->node); if (obj && (obj->use < 0)) - DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", ras_block_str(obj->head.block)); + DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head)); } /* make one obj and return it. */ @@ -546,7 +566,14 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, if (head->block >= AMDGPU_RAS_BLOCK_COUNT) return NULL; - obj = &con->objs[head->block]; + if (head->block == AMDGPU_RAS_BLOCK__MCA) { + if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) + return NULL; + + obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; + } else + obj = &con->objs[head->block]; + /* already exist. return obj? */ if (alive_obj(obj)) return NULL; @@ -574,19 +601,21 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, if (head->block >= AMDGPU_RAS_BLOCK_COUNT) return NULL; - obj = &con->objs[head->block]; + if (head->block == AMDGPU_RAS_BLOCK__MCA) { + if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) + return NULL; + + obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; + } else + obj = &con->objs[head->block]; - if (alive_obj(obj)) { - WARN_ON(head->block != obj->head.block); + if (alive_obj(obj)) return obj; - } } else { - for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { + for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { obj = &con->objs[i]; - if (alive_obj(obj)) { - WARN_ON(i != obj->head.block); + if (alive_obj(obj)) return obj; - } } } @@ -627,8 +656,6 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, */ if (!amdgpu_ras_is_feature_allowed(adev, head)) return 0; - if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) - return 0; if (enable) { if (!obj) { @@ -679,18 +706,13 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, /* Do not enable if it is not allowed. */ WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); - /* Are we alerady in that state we are going to set? */ - if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) { - ret = 0; - goto out; - } if (!amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(&adev->psp, info, enable); if (ret) { dev_err(adev->dev, "ras %s %s failed %d\n", enable ? "enable":"disable", - ras_block_str(head->block), + get_ras_block_str(head), ret); goto out; } @@ -732,7 +754,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, if (!ret) dev_info(adev->dev, "RAS INFO: %s setup object\n", - ras_block_str(head->block)); + get_ras_block_str(head)); } } else { /* setup the object then issue a ras TA disable cmd.*/ @@ -782,17 +804,39 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, bool bypass) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - int ras_block_count = AMDGPU_RAS_BLOCK_COUNT; int i; - const enum amdgpu_ras_error_type default_ras_type = - AMDGPU_RAS_ERROR__NONE; + const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE; - for (i = 0; i < ras_block_count; i++) { + for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { struct ras_common_if head = { .block = i, .type = default_ras_type, .sub_block_index = 0, }; + + if (i == AMDGPU_RAS_BLOCK__MCA) + continue; + + if (bypass) { + /* + * bypass psp. vbios enable ras for us. + * so just create the obj + */ + if (__amdgpu_ras_feature_enable(adev, &head, 1)) + break; + } else { + if (amdgpu_ras_feature_enable(adev, &head, 1)) + break; + } + } + + for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { + struct ras_common_if head = { + .block = AMDGPU_RAS_BLOCK__MCA, + .type = default_ras_type, + .sub_block_index = i, + }; + if (bypass) { /* * bypass psp. vbios enable ras for us. @@ -810,6 +854,32 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, } /* feature ctl end */ + +void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_err_data *err_data) +{ + switch (ras_block->sub_block_index) { + case AMDGPU_RAS_MCA_BLOCK__MP0: + if (adev->mca.mp0.ras_funcs && + adev->mca.mp0.ras_funcs->query_ras_error_count) + adev->mca.mp0.ras_funcs->query_ras_error_count(adev, &err_data); + break; + case AMDGPU_RAS_MCA_BLOCK__MP1: + if (adev->mca.mp1.ras_funcs && + adev->mca.mp1.ras_funcs->query_ras_error_count) + adev->mca.mp1.ras_funcs->query_ras_error_count(adev, &err_data); + break; + case AMDGPU_RAS_MCA_BLOCK__MPIO: + if (adev->mca.mpio.ras_funcs && + adev->mca.mpio.ras_funcs->query_ras_error_count) + adev->mca.mpio.ras_funcs->query_ras_error_count(adev, &err_data); + break; + default: + break; + } +} + /* query/inject/cure begin */ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) @@ -873,6 +943,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->hdp.ras_funcs->query_ras_error_count) adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data); break; + case AMDGPU_RAS_BLOCK__MCA: + amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data); + break; default: break; } @@ -894,13 +967,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->smuio.funcs->get_socket_id(adev), adev->smuio.funcs->get_die_id(adev), obj->err_data.ce_count, - ras_block_str(info->head.block)); + get_ras_block_str(&info->head)); } else { dev_info(adev->dev, "%ld correctable hardware errors " "detected in %s block, no user " "action is needed.\n", obj->err_data.ce_count, - ras_block_str(info->head.block)); + get_ras_block_str(&info->head)); } } if (err_data.ue_count) { @@ -913,12 +986,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->smuio.funcs->get_socket_id(adev), adev->smuio.funcs->get_die_id(adev), obj->err_data.ue_count, - ras_block_str(info->head.block)); + get_ras_block_str(&info->head)); } else { dev_info(adev->dev, "%ld uncorrectable hardware errors " "detected in %s block\n", obj->err_data.ue_count, - ras_block_str(info->head.block)); + get_ras_block_str(&info->head)); } } @@ -1028,9 +1101,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, case AMDGPU_RAS_BLOCK__SDMA: case AMDGPU_RAS_BLOCK__MMHUB: case AMDGPU_RAS_BLOCK__PCIE_BIF: - case AMDGPU_RAS_BLOCK__MP0: - case AMDGPU_RAS_BLOCK__MP1: - case AMDGPU_RAS_BLOCK__MPIO: + case AMDGPU_RAS_BLOCK__MCA: ret = psp_ras_trigger_error(&adev->psp, &block_info); break; case AMDGPU_RAS_BLOCK__XGMI_WAFL: @@ -1038,13 +1109,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, break; default: dev_info(adev->dev, "%s error injection is not supported yet\n", - ras_block_str(info->head.block)); + get_ras_block_str(&info->head)); ret = -EINVAL; } if (ret) dev_err(adev->dev, "ras inject %s failed %d\n", - ras_block_str(info->head.block), ret); + get_ras_block_str(&info->head), ret); return ret; } @@ -1387,7 +1458,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) if (amdgpu_ras_is_supported(adev, obj->head.block) && (obj->attr_inuse == 1)) { sprintf(fs_info.debugfs_name, "%s_err_inject", - ras_block_str(obj->head.block)); + get_ras_block_str(&obj->head)); fs_info.head = obj->head; amdgpu_ras_debugfs_create(adev, &fs_info, dir); } @@ -2185,7 +2256,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev) return 0; con = kmalloc(sizeof(struct amdgpu_ras) + - sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT, + sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT + + sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT, GFP_KERNEL|__GFP_ZERO); if (!con) return -ENOMEM; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 1670467c2054..ec42e9873aaa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -49,15 +49,22 @@ enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__MP0, AMDGPU_RAS_BLOCK__MP1, AMDGPU_RAS_BLOCK__FUSE, - AMDGPU_RAS_BLOCK__MPIO, + AMDGPU_RAS_BLOCK__MCA, AMDGPU_RAS_BLOCK__LAST }; -extern const char *ras_block_string[]; +enum amdgpu_ras_mca_block { + AMDGPU_RAS_MCA_BLOCK__MP0 = 0, + AMDGPU_RAS_MCA_BLOCK__MP1, + AMDGPU_RAS_MCA_BLOCK__MPIO, + AMDGPU_RAS_MCA_BLOCK__IOHC, + + AMDGPU_RAS_MCA_BLOCK__LAST +}; -#define ras_block_str(i) (ras_block_string[i]) #define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST +#define AMDGPU_RAS_MCA_BLOCK_COUNT AMDGPU_RAS_MCA_BLOCK__LAST #define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1) enum amdgpu_ras_gfx_subblock { @@ -544,8 +551,8 @@ amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) { return TA_RAS_BLOCK__MP1; case AMDGPU_RAS_BLOCK__FUSE: return TA_RAS_BLOCK__FUSE; - case AMDGPU_RAS_BLOCK__MPIO: - return TA_RAS_BLOCK__MPIO; + case AMDGPU_RAS_BLOCK__MCA: + return TA_RAS_BLOCK__MCA; default: WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block); return TA_RAS_BLOCK__UMC; @@ -640,4 +647,6 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev); int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev); +const char *get_ras_block_str(struct ras_common_if *ras_block); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c index 058b65730a84..8f7107d392af 100644 --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c @@ -52,7 +52,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp0_ras_funcs = { .ras_fini = mca_v3_0_mp0_ras_fini, .query_ras_error_count = mca_v3_0_mp0_query_ras_error_count, .query_ras_error_address = NULL, - .ras_block = AMDGPU_RAS_BLOCK__MP0, + .ras_block = AMDGPU_RAS_BLOCK__MCA, + .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP0, .sysfs_name = "mp0_err_count", }; @@ -79,7 +80,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp1_ras_funcs = { .ras_fini = mca_v3_0_mp1_ras_fini, .query_ras_error_count = mca_v3_0_mp1_query_ras_error_count, .query_ras_error_address = NULL, - .ras_block = AMDGPU_RAS_BLOCK__MP1, + .ras_block = AMDGPU_RAS_BLOCK__MCA, + .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP1, .sysfs_name = "mp1_err_count", }; @@ -106,7 +108,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mpio_ras_funcs = { .ras_fini = mca_v3_0_mpio_ras_fini, .query_ras_error_count = mca_v3_0_mpio_query_ras_error_count, .query_ras_error_address = NULL, - .ras_block = AMDGPU_RAS_BLOCK__MPIO, + .ras_block = AMDGPU_RAS_BLOCK__MCA, + .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MPIO, .sysfs_name = "mpio_err_count", }; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 0a9fc19b1be0..91b3afa946f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -387,13 +387,13 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device "errors detected in %s block, " "no user action is needed.\n", obj->err_data.ce_count, - ras_block_str(adev->nbio.ras_if->block)); + get_ras_block_str(adev->nbio.ras_if)); if (err_data.ue_count) dev_info(adev->dev, "%ld uncorrectable hardware " "errors detected in %s block\n", obj->err_data.ue_count, - ras_block_str(adev->nbio.ras_if->block)); + get_ras_block_str(adev->nbio.ras_if)); } dev_info(adev->dev, "RAS controller interrupt triggered " diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h index 532260fd64db..82d956d15b54 100644 --- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h +++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h @@ -73,10 +73,19 @@ enum ta_ras_block { TA_RAS_BLOCK__MP0, TA_RAS_BLOCK__MP1, TA_RAS_BLOCK__FUSE, - TA_RAS_BLOCK__MPIO, + TA_RAS_BLOCK__MCA, TA_NUM_BLOCK_MAX }; +enum ta_ras_mca_block +{ + TA_RAS_MCA_BLOCK__MP0 = 0, + TA_RAS_MCA_BLOCK__MP1 = 1, + TA_RAS_MCA_BLOCK__MPIO = 2, + TA_RAS_MCA_BLOCK__IOHC = 3, + TA_MCA_NUM_BLOCK_MAX +}; + enum ta_ras_error_type { TA_RAS_ERROR__NONE = 0, TA_RAS_ERROR__PARITY = 1, -- cgit From 9080a18fc554cea0858fae6692a7003c5f0365fc Mon Sep 17 00:00:00 2001 From: Candice Li Date: Wed, 15 Sep 2021 15:14:18 +0800 Subject: drm/amdgpu: Remove all code paths under the EAGAIN path in RAS late init All code paths under the EAGAIN path in RAS late init are unused. Signed-off-by: Candice Li Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 33 +-------------------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 --- 2 files changed, 1 insertion(+), 35 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 912ea1f9fd04..e1c34eef76b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2131,19 +2131,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) } /* recovery end */ -/* return 0 if ras will reset gpu and repost.*/ -int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, - unsigned int block) -{ - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - - if (!ras) - return -EINVAL; - - ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET; - return 0; -} - static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) { return adev->asic_type == CHIP_VEGA10 || @@ -2382,12 +2369,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1); if (r) { - if (r == -EAGAIN) { - /* request gpu reset. will run again */ - amdgpu_ras_request_reset_on_boot(adev, - ras_block->block); - return 0; - } else if (adev->in_suspend || amdgpu_in_reset(adev)) { + if (adev->in_suspend || amdgpu_in_reset(adev)) { /* in resume phase, if fail to enable ras, * clean up all ras fs nodes, and disable ras */ goto cleanup; @@ -2479,19 +2461,6 @@ void amdgpu_ras_resume(struct amdgpu_device *adev) } } } - - if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) { - con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET; - /* setup ras obj state as disabled. - * for init_by_vbios case. - * if we want to enable ras, just enable it in a normal way. - * If we want do disable it, need setup ras obj as enabled, - * then issue another TA disable cmd. - * See feature_enable_on_boot - */ - amdgpu_ras_disable_all_features(adev, 1); - amdgpu_ras_reset_gpu(adev); - } } void amdgpu_ras_suspend(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ec42e9873aaa..37b3c40272b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -32,7 +32,6 @@ #include "amdgpu_ras_eeprom.h" #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) -#define AMDGPU_RAS_FLAG_INIT_NEED_RESET (0x1 << 1) enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__UMC = 0, @@ -495,8 +494,6 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, } int amdgpu_ras_recovery_init(struct amdgpu_device *adev); -int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, - unsigned int block); void amdgpu_ras_resume(struct amdgpu_device *adev); void amdgpu_ras_suspend(struct amdgpu_device *adev); -- cgit From e43488493cbb46e862f83c66887f3e6cb854c6f0 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Fri, 17 Sep 2021 18:24:09 +0800 Subject: drm/amdgpu: set poison supported flag for RAS (v2) Add RAS poison supported flag and tell PSP RAS TA about the info. v2: rename poison mode to poison supported, we can also disable poison mode even we support it. print value of poison supported if ras feature enablement fails. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 32 ++++++++++++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 +++++ 3 files changed, 37 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 17d09771be3e..f17a1036f46e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1444,9 +1444,9 @@ static int psp_ras_initialize(struct psp_context *psp) ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); - if (psp->adev->gmc.xgmi.connected_to_cpu) + if (amdgpu_ras_is_poison_mode_supported(adev)) ras_cmd->ras_in_message.init_flags.poison_mode_en = 1; - else + if (!adev->gmc.xgmi.connected_to_cpu) ras_cmd->ras_in_message.init_flags.dgpu_mode = 1; ret = psp_ras_load(psp); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e1c34eef76b7..4c547eee5702 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -710,10 +710,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(&adev->psp, info, enable); if (ret) { - dev_err(adev->dev, "ras %s %s failed %d\n", + dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", enable ? "enable":"disable", get_ras_block_str(head), - ret); + amdgpu_ras_is_poison_mode_supported(adev), ret); goto out; } } @@ -2238,6 +2238,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int r; + bool df_poison, umc_poison; if (con) return 0; @@ -2308,6 +2309,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto release_con; } + /* Init poison supported flag, the default value is false */ + if (adev->df.funcs && + adev->df.funcs->query_ras_poison_mode && + adev->umc.ras_funcs && + adev->umc.ras_funcs->query_ras_poison_mode) { + df_poison = + adev->df.funcs->query_ras_poison_mode(adev); + umc_poison = + adev->umc.ras_funcs->query_ras_poison_mode(adev); + /* Only poison is set in both DF and UMC, we can support it */ + if (df_poison && umc_poison) + con->poison_supported = true; + else if (df_poison != umc_poison) + dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n", + df_poison, umc_poison); + } + if (amdgpu_ras_fs_init(adev)) { r = -EINVAL; goto release_con; @@ -2351,6 +2369,16 @@ static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, return 0; } +bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!con) + return false; + + return con->poison_supported; +} + /* helper function to handle common stuff in ip late init phase */ int amdgpu_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 37b3c40272b4..e36f4de9fa55 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -351,6 +351,9 @@ struct amdgpu_ras { /* disable ras error count harvest in recovery */ bool disable_ras_err_cnt_harvest; + /* is poison mode supported */ + bool poison_supported; + /* RAS count errors delayed work */ struct delayed_work ras_counte_delay_work; atomic_t ras_ue_count; @@ -646,4 +649,6 @@ int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev); const char *get_ras_block_str(struct ras_common_if *ras_block); +bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev); + #endif -- cgit From f524dd54a78924b59acd8f251788889129b3a2e9 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Fri, 17 Sep 2021 18:40:57 +0800 Subject: drm/amdgpu: skip umc ras irq handling in poison mode (v2) In ras poison mode, umc uncorrectable error will be ignored until the corrupted data consumed by another ras module (such as gfx, sdma). v2: update the debug message and replace dev_warn with dev_info. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 34 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4c547eee5702..8243f79a7c4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1544,22 +1544,28 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) data->rptr = (data->aligned_element_size + data->rptr) % data->ring_size; - /* Let IP handle its data, maybe we need get the output - * from the callback to udpate the error type/count, etc - */ if (data->cb) { - ret = data->cb(obj->adev, &err_data, &entry); - /* ue will trigger an interrupt, and in that case - * we need do a reset to recovery the whole system. - * But leave IP do that recovery, here we just dispatch - * the error. - */ - if (ret == AMDGPU_RAS_SUCCESS) { - /* these counts could be left as 0 if - * some blocks do not count error number + if (amdgpu_ras_is_poison_mode_supported(obj->adev) && + obj->head.block == AMDGPU_RAS_BLOCK__UMC) + dev_info(obj->adev->dev, + "Poison is created, no user action is needed.\n"); + else { + /* Let IP handle its data, maybe we need get the output + * from the callback to udpate the error type/count, etc + */ + ret = data->cb(obj->adev, &err_data, &entry); + /* ue will trigger an interrupt, and in that case + * we need do a reset to recovery the whole system. + * But leave IP do that recovery, here we just dispatch + * the error. */ - obj->err_data.ue_count += err_data.ue_count; - obj->err_data.ce_count += err_data.ce_count; + if (ret == AMDGPU_RAS_SUCCESS) { + /* these counts could be left as 0 if + * some blocks do not count error number + */ + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + } } } } -- cgit From eb601e61d3492d809cb82a19560a6c31c36fd48a Mon Sep 17 00:00:00 2001 From: John Clements Date: Wed, 29 Sep 2021 15:06:21 +0800 Subject: drm/amdgpu: resolve RAS query bug clear error count when persistant harvesting is not enabled Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8243f79a7c4e..0e82e0a47d53 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -995,6 +995,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, } } + if (!amdgpu_persistent_edc_harvesting_supported(adev)) + amdgpu_ras_reset_error_status(adev, info->head.block); + return 0; } -- cgit From 12b2cab79017ebe598c74493ac1cfc5934d3ccc2 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Wed, 22 Sep 2021 14:49:43 -0400 Subject: drm/amdgpu: Register MCE notifier for Aldebaran RAS On Aldebaran, GPU driver will handle bad page retirement for GPU memory even though UMC is host managed. As a result, register a bad page retirement handler on the mce notifier chain to retire bad pages on Aldebaran. Signed-off-by: Mukul Joshi Reviewed-by: Yazen Ghannam Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 141 ++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0e82e0a47d53..e8875351967e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -35,7 +35,11 @@ #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include "atom.h" +#ifdef CONFIG_X86_MCE_AMD +#include +static bool notifier_registered; +#endif static const char *RAS_FS_NAME = "ras"; const char *ras_error_string[] = { @@ -107,6 +111,9 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, uint64_t addr); static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr); +#ifdef CONFIG_X86_MCE_AMD +static void amdgpu_register_bad_pages_mca_notifier(void); +#endif void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) { @@ -2098,6 +2105,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs); } +#ifdef CONFIG_X86_MCE_AMD + if ((adev->asic_type == CHIP_ALDEBARAN) && + (adev->gmc.xgmi.connected_to_cpu)) + amdgpu_register_bad_pages_mca_notifier(); +#endif return 0; free: @@ -2589,3 +2601,132 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev) kfree(con); } } + +#ifdef CONFIG_X86_MCE_AMD +static struct amdgpu_device *find_adev(uint32_t node_id) +{ + struct amdgpu_gpu_instance *gpu_instance; + int i; + struct amdgpu_device *adev = NULL; + + mutex_lock(&mgpu_info.mutex); + + for (i = 0; i < mgpu_info.num_gpu; i++) { + gpu_instance = &(mgpu_info.gpu_ins[i]); + adev = gpu_instance->adev; + + if (adev->gmc.xgmi.connected_to_cpu && + adev->gmc.xgmi.physical_node_id == node_id) + break; + adev = NULL; + } + + mutex_unlock(&mgpu_info.mutex); + + return adev; +} + +#define GET_MCA_IPID_GPUID(m) (((m) >> 44) & 0xF) +#define GET_UMC_INST(m) (((m) >> 21) & 0x7) +#define GET_CHAN_INDEX(m) ((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4)) +#define GPU_ID_OFFSET 8 + +static int amdgpu_bad_page_notifier(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct mce *m = (struct mce *)data; + struct amdgpu_device *adev = NULL; + uint32_t gpu_id = 0; + uint32_t umc_inst = 0; + uint32_t ch_inst, channel_index = 0; + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct eeprom_table_record err_rec; + uint64_t retired_page; + + /* + * If the error was generated in UMC_V2, which belongs to GPU UMCs, + * and error occurred in DramECC (Extended error code = 0) then only + * process the error, else bail out. + */ + if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) && + (XEC(m->status, 0x3f) == 0x0))) + return NOTIFY_DONE; + + /* + * If it is correctable error, return. + */ + if (mce_is_correctable(m)) + return NOTIFY_OK; + + /* + * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register. + */ + gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET; + + adev = find_adev(gpu_id); + if (!adev) { + DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__, + gpu_id); + return NOTIFY_DONE; + } + + /* + * If it is uncorrectable error, then find out UMC instance and + * channel index. + */ + umc_inst = GET_UMC_INST(m->ipid); + ch_inst = GET_CHAN_INDEX(m->ipid); + + dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", + umc_inst, ch_inst); + + memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); + + /* + * Translate UMC channel address to Physical address + */ + channel_index = + adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + + ch_inst]; + + retired_page = ADDR_OF_8KB_BLOCK(m->addr) | + ADDR_OF_256B_BLOCK(channel_index) | + OFFSET_IN_256B_BLOCK(m->addr); + + err_rec.address = m->addr; + err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT; + err_rec.ts = (uint64_t)ktime_get_real_seconds(); + err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; + err_rec.cu = 0; + err_rec.mem_channel = channel_index; + err_rec.mcumc_id = umc_inst; + + err_data.err_addr = &err_rec; + err_data.err_addr_cnt = 1; + + if (amdgpu_bad_page_threshold != 0) { + amdgpu_ras_add_bad_pages(adev, err_data.err_addr, + err_data.err_addr_cnt); + amdgpu_ras_save_bad_pages(adev); + } + + return NOTIFY_OK; +} + +static struct notifier_block amdgpu_bad_page_nb = { + .notifier_call = amdgpu_bad_page_notifier, + .priority = MCE_PRIO_UC, +}; + +static void amdgpu_register_bad_pages_mca_notifier(void) +{ + /* + * Register the x86 notifier only once + * with MCE subsystem. + */ + if (notifier_registered == false) { + mce_register_decode_chain(&amdgpu_bad_page_nb); + notifier_registered = true; + } +} +#endif -- cgit From 91a1a52d03aa0f1f2b51c7df8a7bf437e906e29f Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Mon, 20 Sep 2021 20:48:23 -0400 Subject: drm/amdgpu: Fix RAS page retirement with mode2 reset on Aldebaran During mode2 reset, the GPU is temporarily removed from the mgpu_info list. As a result, page retirement fails because it cannot find the GPU in the GPU list. To fix this, create our own list of GPUs that support MCE notifier based page retirement and use that list to check if the UMC error occurred on a GPU that supports MCE notifier based page retirement. Signed-off-by: Mukul Joshi Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e8875351967e..08133de21fdd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -112,7 +112,12 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr); #ifdef CONFIG_X86_MCE_AMD -static void amdgpu_register_bad_pages_mca_notifier(void); +static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev); +struct mce_notifier_adev_list { + struct amdgpu_device *devs[MAX_GPU_INSTANCE]; + int num_gpu; +}; +static struct mce_notifier_adev_list mce_adev_list; #endif void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) @@ -2108,7 +2113,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) #ifdef CONFIG_X86_MCE_AMD if ((adev->asic_type == CHIP_ALDEBARAN) && (adev->gmc.xgmi.connected_to_cpu)) - amdgpu_register_bad_pages_mca_notifier(); + amdgpu_register_bad_pages_mca_notifier(adev); #endif return 0; @@ -2605,24 +2610,18 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev) #ifdef CONFIG_X86_MCE_AMD static struct amdgpu_device *find_adev(uint32_t node_id) { - struct amdgpu_gpu_instance *gpu_instance; int i; struct amdgpu_device *adev = NULL; - mutex_lock(&mgpu_info.mutex); - - for (i = 0; i < mgpu_info.num_gpu; i++) { - gpu_instance = &(mgpu_info.gpu_ins[i]); - adev = gpu_instance->adev; + for (i = 0; i < mce_adev_list.num_gpu; i++) { + adev = mce_adev_list.devs[i]; - if (adev->gmc.xgmi.connected_to_cpu && + if (adev && adev->gmc.xgmi.connected_to_cpu && adev->gmc.xgmi.physical_node_id == node_id) break; adev = NULL; } - mutex_unlock(&mgpu_info.mutex); - return adev; } @@ -2718,8 +2717,18 @@ static struct notifier_block amdgpu_bad_page_nb = { .priority = MCE_PRIO_UC, }; -static void amdgpu_register_bad_pages_mca_notifier(void) +static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) { + /* + * Add the adev to the mce_adev_list. + * During mode2 reset, amdgpu device is temporarily + * removed from the mgpu_info list which can cause + * page retirement to fail. + * Use this list instead of mgpu_info to find the amdgpu + * device on which the UMC error was reported. + */ + mce_adev_list.devs[mce_adev_list.num_gpu++] = adev; + /* * Register the x86 notifier only once * with MCE subsystem. -- cgit