diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 98 |
1 files changed, 66 insertions, 32 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 95b3327168ac..54d8a3e7e75c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -373,7 +373,13 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo if (lock) mutex_lock(&tmp->hive_lock); - tmp->pstate = -1; + tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; + tmp->hi_req_gpu = NULL; + /* + * hive pstate on boot is high in vega20 so we have to go to low + * pstate on after boot. + */ + tmp->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE; mutex_unlock(&xgmi_mutex); return tmp; @@ -383,50 +389,51 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) { int ret = 0; struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); - struct amdgpu_device *tmp_adev; - bool update_hive_pstate = true; - bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20; + struct amdgpu_device *request_adev = hive->hi_req_gpu ? + hive->hi_req_gpu : adev; + bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; + bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN; - if (!hive) + /* fw bug so temporarily disable pstate switching */ + if (!hive || adev->asic_type == CHIP_VEGA20) return 0; mutex_lock(&hive->hive_lock); - if (hive->pstate == pstate) { - adev->pstate = is_high_pstate ? pstate : adev->pstate; + if (is_hi_req) + hive->hi_req_count++; + else + hive->hi_req_count--; + + /* + * Vega20 only needs single peer to request pstate high for the hive to + * go high but all peers must request pstate low for the hive to go low + */ + if (hive->pstate == pstate || + (!is_hi_req && hive->hi_req_count && !init_low)) goto out; - } - dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate); + dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate); - ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate); + ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate); if (ret) { - dev_err(adev->dev, + dev_err(request_adev->dev, "XGMI: Set pstate failure on device %llx, hive %llx, ret %d", - adev->gmc.xgmi.node_id, - adev->gmc.xgmi.hive_id, ret); + request_adev->gmc.xgmi.node_id, + request_adev->gmc.xgmi.hive_id, ret); goto out; } - /* Update device pstate */ - adev->pstate = pstate; - - /* - * Update the hive pstate only all devices of the hive - * are in the same pstate - */ - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { - if (tmp_adev->pstate != adev->pstate) { - update_hive_pstate = false; - break; - } - } - if (update_hive_pstate || is_high_pstate) + if (init_low) + hive->pstate = hive->hi_req_count ? + hive->pstate : AMDGPU_XGMI_PSTATE_MIN; + else { hive->pstate = pstate; - + hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ? + adev : NULL; + } out: mutex_unlock(&hive->hive_lock); - return ret; } @@ -507,9 +514,6 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) goto exit; } - /* Set default device pstate */ - adev->pstate = -1; - top_info = &adev->psp.xgmi_context.top_info; list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); @@ -604,6 +608,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev) adev->gmc.xgmi.num_physical_nodes == 0) return 0; + amdgpu_xgmi_reset_ras_error_count(adev); + if (!adev->gmc.xgmi.ras_if) { adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); if (!adev->gmc.xgmi.ras_if) @@ -668,6 +674,32 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, return addr + dram_base_addr; } +static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg) +{ + WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF); + WREG32_PCIE(pcs_status_reg, 0); +} + +void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) +{ + uint32_t i; + + switch (adev->asic_type) { + case CHIP_ARCTURUS: + for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) + pcs_clear_status(adev, + xgmi_pcs_err_status_reg_arct[i]); + break; + case CHIP_VEGA20: + for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) + pcs_clear_status(adev, + xgmi_pcs_err_status_reg_vg20[i]); + break; + default: + break; + } +} + static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, uint32_t value, uint32_t *ue_count, @@ -758,6 +790,8 @@ int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, break; } + amdgpu_xgmi_reset_ras_error_count(adev); + err_data->ue_count += ue_cnt; err_data->ce_count += ce_cnt; |