diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 189 |
1 files changed, 133 insertions, 56 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 077404a9c935..d06beb884a16 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -706,13 +706,23 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, return 0; } +static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev, + struct ras_common_if *head) +{ + if (amdgpu_ras_is_feature_allowed(adev, head) || + amdgpu_ras_is_poison_mode_supported(adev)) + return 1; + else + return 0; +} + /* wrapper of psp_ras_enable_features */ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, struct ras_common_if *head, bool enable) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); union ta_ras_cmd_input *info; - int ret; + int ret = 0; if (!con) return -EINVAL; @@ -736,7 +746,8 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, } /* Do not enable if it is not allowed. */ - WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); + if (enable && !amdgpu_ras_check_feature_allowed(adev, head)) + goto out; /* Only enable ras feature operation handle on host side */ if (head->block == AMDGPU_RAS_BLOCK__GFX && @@ -754,7 +765,6 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, /* setup the obj */ __amdgpu_ras_feature_enable(adev, head, enable); - ret = 0; out: if (head->block == AMDGPU_RAS_BLOCK__GFX) kfree(info); @@ -1087,6 +1097,10 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, info->head.block, info->head.sub_block_index); + /* inject on guest isn't allowed, return success directly */ + if (amdgpu_sriov_vf(adev)) + return 0; + if (!obj) return -EINVAL; @@ -1122,11 +1136,54 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, } /** - * amdgpu_ras_query_error_count -- Get error counts of all IPs + * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP + * @adev: pointer to AMD GPU device + * @ce_count: pointer to an integer to be set to the count of correctible errors. + * @ue_count: pointer to an integer to be set to the count of uncorrectible errors. + * @query_info: pointer to ras_query_if + * + * Return 0 for query success or do nothing, otherwise return an error + * on failures + */ +static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev, + unsigned long *ce_count, + unsigned long *ue_count, + struct ras_query_if *query_info) +{ + int ret; + + if (!query_info) + /* do nothing if query_info is not specified */ + return 0; + + ret = amdgpu_ras_query_error_status(adev, query_info); + if (ret) + return ret; + + *ce_count += query_info->ce_count; + *ue_count += query_info->ue_count; + + /* some hardware/IP supports read to clear + * no need to explictly reset the err status after the query call */ + if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && + adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { + if (amdgpu_ras_reset_error_status(adev, query_info->head.block)) + dev_warn(adev->dev, + "Failed to reset error counter and error status\n"); + } + + return 0; +} + +/** + * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP * @adev: pointer to AMD GPU device * @ce_count: pointer to an integer to be set to the count of correctible errors. * @ue_count: pointer to an integer to be set to the count of uncorrectible * errors. + * @query_info: pointer to ras_query_if if the query request is only for + * specific ip block; if info is NULL, then the qurey request is for + * all the ip blocks that support query ras error counters/status * * If set, @ce_count or @ue_count, count and return the corresponding * error counts in those integer pointers. Return 0 if the device @@ -1134,11 +1191,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, */ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, unsigned long *ce_count, - unsigned long *ue_count) + unsigned long *ue_count, + struct ras_query_if *query_info) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; unsigned long ce, ue; + int ret; if (!adev->ras_enabled || !con) return -EOPNOTSUPP; @@ -1150,26 +1209,23 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, ce = 0; ue = 0; - list_for_each_entry(obj, &con->head, node) { - struct ras_query_if info = { - .head = obj->head, - }; - int res; - - res = amdgpu_ras_query_error_status(adev, &info); - if (res) - return res; + if (!query_info) { + /* query all the ip blocks that support ras query interface */ + list_for_each_entry(obj, &con->head, node) { + struct ras_query_if info = { + .head = obj->head, + }; - if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && - adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { - if (amdgpu_ras_reset_error_status(adev, info.head.block)) - dev_warn(adev->dev, "Failed to reset error counter and error status"); + ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info); } - - ce += info.ce_count; - ue += info.ue_count; + } else { + /* query specific ip block */ + ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info); } + if (ret) + return ret; + if (ce_count) *ce_count = ce; @@ -1267,7 +1323,7 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, struct amdgpu_ras *con = container_of(attr, struct amdgpu_ras, features_attr); - return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features); + return sysfs_emit(buf, "feature mask: 0x%x\n", con->features); } static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) @@ -2344,22 +2400,24 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { dev_info(adev->dev, "SRAM ECC is active.\n"); - if (!amdgpu_sriov_vf(adev)) { + if (!amdgpu_sriov_vf(adev)) adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); - - if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) || - adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0)) - adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | - 1 << AMDGPU_RAS_BLOCK__JPEG); - else - adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | - 1 << AMDGPU_RAS_BLOCK__JPEG); - } else { + else adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | 1 << AMDGPU_RAS_BLOCK__SDMA | 1 << AMDGPU_RAS_BLOCK__GFX); - } + + /* VCN/JPEG RAS can be supported on both bare metal and + * SRIOV environment + */ + if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) || + adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0)) + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | + 1 << AMDGPU_RAS_BLOCK__JPEG); + else + adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | + 1 << AMDGPU_RAS_BLOCK__JPEG); } else { dev_info(adev->dev, "SRAM ECC is not presented.\n"); } @@ -2395,7 +2453,7 @@ static void amdgpu_ras_counte_dw(struct work_struct *work) /* Cache new values. */ - if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) { atomic_set(&con->ras_ce_count, ce_count); atomic_set(&con->ras_ue_count, ue_count); } @@ -2405,11 +2463,42 @@ Out: pm_runtime_put_autosuspend(dev->dev); } +static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + bool df_poison, umc_poison; + + /* poison setting is useless on SRIOV guest */ + if (amdgpu_sriov_vf(adev) || !con) + return; + + /* Init poison supported flag, the default value is false */ + if (adev->gmc.xgmi.connected_to_cpu) { + /* enabled by default when GPU is connected to CPU */ + con->poison_supported = true; + } else if (adev->df.funcs && + adev->df.funcs->query_ras_poison_mode && + adev->umc.ras && + adev->umc.ras->query_ras_poison_mode) { + df_poison = + adev->df.funcs->query_ras_poison_mode(adev); + umc_poison = + adev->umc.ras->query_ras_poison_mode(adev); + + /* Only poison is set in both DF and UMC, we can support it */ + if (df_poison && umc_poison) + con->poison_supported = true; + else if (df_poison != umc_poison) + dev_warn(adev->dev, + "Poison setting is inconsistent in DF/UMC(%d:%d)!\n", + df_poison, umc_poison); + } +} + int amdgpu_ras_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int r; - bool df_poison, umc_poison; if (con) return 0; @@ -2484,26 +2573,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto release_con; } - /* Init poison supported flag, the default value is false */ - if (adev->gmc.xgmi.connected_to_cpu) { - /* enabled by default when GPU is connected to CPU */ - con->poison_supported = true; - } - else if (adev->df.funcs && - adev->df.funcs->query_ras_poison_mode && - adev->umc.ras && - adev->umc.ras->query_ras_poison_mode) { - df_poison = - adev->df.funcs->query_ras_poison_mode(adev); - umc_poison = - adev->umc.ras->query_ras_poison_mode(adev); - /* Only poison is set in both DF and UMC, we can support it */ - if (df_poison && umc_poison) - con->poison_supported = true; - else if (df_poison != umc_poison) - dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n", - df_poison, umc_poison); - } + amdgpu_ras_query_poison_mode(adev); if (amdgpu_ras_fs_init(adev)) { r = -EINVAL; @@ -2564,6 +2634,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev, { struct amdgpu_ras_block_object *ras_obj = NULL; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_query_if *query_info; unsigned long ue_count, ce_count; int r; @@ -2605,11 +2676,17 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev, /* Those are the cached values at init. */ - if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { + query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL); + if (!query_info) + return -ENOMEM; + memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); + + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) { atomic_set(&con->ras_ce_count, ce_count); atomic_set(&con->ras_ue_count, ue_count); } + kfree(query_info); return 0; interrupt: |