diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 143 |
1 files changed, 72 insertions, 71 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index e8b8f28c2f72..1b108d03e785 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -32,7 +32,8 @@ #include "wafl/wafl2_4_0_0_smn.h" #include "wafl/wafl2_4_0_0_sh_mask.h" -#define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210 +#include "amdgpu_reset.h" + #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 @@ -67,17 +68,6 @@ static const int wafl_pcs_err_status_reg_arct[] = { smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, }; -static const int xgmi23_pcs_err_status_reg_aldebaran[] = { - smnPCS_XGMI23_PCS_ERROR_STATUS, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x100000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x200000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x300000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x400000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x500000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x600000, - smnPCS_XGMI23_PCS_ERROR_STATUS + 0x700000 -}; - static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = { smnPCS_XGMI3X16_PCS_ERROR_STATUS, smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000, @@ -227,6 +217,9 @@ static void amdgpu_xgmi_hive_release(struct kobject *kobj) struct amdgpu_hive_info *hive = container_of( kobj, struct amdgpu_hive_info, kobj); + amdgpu_reset_put_reset_domain(hive->reset_domain); + hive->reset_domain = NULL; + mutex_destroy(&hive->hive_lock); kfree(hive); } @@ -398,15 +391,35 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) goto pro_end; } + /** + * Avoid recreating reset domain when hive is reconstructed for the case + * of reset the devices in the XGMI hive during probe for SRIOV + * See https://www.spinics.net/lists/amd-gfx/msg58836.html + */ + if (adev->reset_domain->type != XGMI_HIVE) { + hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive"); + if (!hive->reset_domain) { + dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n"); + ret = -ENOMEM; + kobject_put(&hive->kobj); + kfree(hive); + hive = NULL; + goto pro_end; + } + } else { + amdgpu_reset_get_reset_domain(adev->reset_domain); + hive->reset_domain = adev->reset_domain; + } + hive->hive_id = adev->gmc.xgmi.hive_id; INIT_LIST_HEAD(&hive->device_list); INIT_LIST_HEAD(&hive->node); mutex_init(&hive->hive_lock); - atomic_set(&hive->in_reset, 0); atomic_set(&hive->number_devices, 0); task_barrier_init(&hive->tb); hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; hive->hi_req_gpu = NULL; + /* * hive pstate on boot is high in vega20 so we have to go to low * pstate on after boot. @@ -732,53 +745,15 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) return psp_xgmi_terminate(&adev->psp); } -static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) +static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { - int r; - struct ras_ih_if ih_info = { - .cb = NULL, - }; - struct ras_fs_if fs_info = { - .sysfs_name = "xgmi_wafl_err_count", - }; - if (!adev->gmc.xgmi.supported || adev->gmc.xgmi.num_physical_nodes == 0) return 0; - adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev); - - if (!adev->gmc.xgmi.ras_if) { - adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); - if (!adev->gmc.xgmi.ras_if) - return -ENOMEM; - adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL; - adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; - adev->gmc.xgmi.ras_if->sub_block_index = 0; - } - ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if; - r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if, - &fs_info, &ih_info); - if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) { - kfree(adev->gmc.xgmi.ras_if); - adev->gmc.xgmi.ras_if = NULL; - } - - return r; -} + adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); -static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev) -{ - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) && - adev->gmc.xgmi.ras_if) { - struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if; - struct ras_ih_if ih_info = { - .cb = NULL, - }; - - amdgpu_ras_late_fini(adev, ras_if, &ih_info); - kfree(ras_if); - } + return amdgpu_ras_block_late_init(adev, ras_block); } uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, @@ -810,9 +785,6 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) xgmi_pcs_err_status_reg_vg20[i]); break; case CHIP_ALDEBARAN: - for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) - pcs_clear_status(adev, - xgmi23_pcs_err_status_reg_aldebaran[i]); for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) pcs_clear_status(adev, xgmi3x16_pcs_err_status_reg_aldebaran[i]); @@ -865,7 +837,7 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, return 0; } -static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, +static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; @@ -874,7 +846,7 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, uint32_t ue_cnt = 0, ce_cnt = 0; if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) - return -EINVAL; + return ; err_data->ue_count = 0; err_data->ce_count = 0; @@ -913,13 +885,6 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, } break; case CHIP_ALDEBARAN: - /* check xgmi23 pcs error */ - for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) { - data = RREG32_PCIE(xgmi23_pcs_err_status_reg_aldebaran[i]); - if (data) - amdgpu_xgmi_query_pcs_error_status(adev, - data, &ue_cnt, &ce_cnt, true); - } /* check xgmi3x16 pcs error */ for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) { data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]); @@ -940,17 +905,53 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, break; } - adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev); + adev->gmc.xgmi.ras->ras_block.hw_ops->reset_ras_error_count(adev); err_data->ue_count += ue_cnt; err_data->ce_count += ce_cnt; +} - return 0; +/* Trigger XGMI/WAFL error */ +static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, void *inject_if) +{ + int ret = 0; + struct ta_ras_trigger_error_input *block_info = + (struct ta_ras_trigger_error_input *)inject_if; + + if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) + dev_warn(adev->dev, "Failed to disallow df cstate"); + + if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) + dev_warn(adev->dev, "Failed to disallow XGMI power down"); + + ret = psp_ras_trigger_error(&adev->psp, block_info); + + if (amdgpu_ras_intr_triggered()) + return ret; + + if (amdgpu_dpm_allow_xgmi_power_down(adev, true)) + dev_warn(adev->dev, "Failed to allow XGMI power down"); + + if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) + dev_warn(adev->dev, "Failed to allow df cstate"); + + return ret; } -const struct amdgpu_xgmi_ras_funcs xgmi_ras_funcs = { - .ras_late_init = amdgpu_xgmi_ras_late_init, - .ras_fini = amdgpu_xgmi_ras_fini, +struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { .query_ras_error_count = amdgpu_xgmi_query_ras_error_count, .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count, + .ras_error_inject = amdgpu_ras_error_inject_xgmi, +}; + +struct amdgpu_xgmi_ras xgmi_ras = { + .ras_block = { + .ras_comm = { + .name = "xgmi_wafl", + .block = AMDGPU_RAS_BLOCK__XGMI_WAFL, + .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, + }, + .hw_ops = &xgmi_ras_hw_ops, + .ras_late_init = amdgpu_xgmi_ras_late_init, + }, }; |