aboutsummaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c38
1 files changed, 25 insertions, 13 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 46910e7b2927..8f47c14ecbc7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -867,9 +867,9 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
/* feature ctl end */
-void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
- struct ras_common_if *ras_block,
- struct ras_err_data *err_data)
+static void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block,
+ struct ras_err_data *err_data)
{
switch (ras_block->sub_block_index) {
case AMDGPU_RAS_MCA_BLOCK__MP0:
@@ -1161,9 +1161,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
/**
* amdgpu_ras_query_error_count -- Get error counts of all IPs
- * adev: pointer to AMD GPU device
- * ce_count: pointer to an integer to be set to the count of correctible errors.
- * ue_count: pointer to an integer to be set to the count of uncorrectible
+ * @adev: pointer to AMD GPU device
+ * @ce_count: pointer to an integer to be set to the count of correctible errors.
+ * @ue_count: pointer to an integer to be set to the count of uncorrectible
* errors.
*
* If set, @ce_count or @ue_count, count and return the corresponding
@@ -1592,6 +1592,7 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
/* Let IP handle its data, maybe we need get the output
* from the callback to udpate the error type/count, etc
*/
+ memset(&err_data, 0, sizeof(err_data));
ret = data->cb(obj->adev, &err_data, &entry);
/* ue will trigger an interrupt, and in that case
* we need do a reset to recovery the whole system.
@@ -1747,6 +1748,16 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
continue;
+ /*
+ * this is a workaround for aldebaran, skip send msg to
+ * smu to get ecc_info table due to smu handle get ecc
+ * info table failed temporarily.
+ * should be removed until smu fix handle ecc_info table.
+ */
+ if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
+ (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))
+ continue;
+
amdgpu_ras_query_error_status(adev, &info);
}
}
@@ -1828,8 +1839,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
.size = AMDGPU_GPU_PAGE_SIZE,
.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
};
- status = amdgpu_vram_mgr_query_page_status(
- ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+ status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
data->bps[i].retired_page);
if (status == -EBUSY)
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
@@ -1930,8 +1940,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
goto out;
}
- amdgpu_vram_mgr_reserve_range(
- ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+ amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
AMDGPU_GPU_PAGE_SIZE);
@@ -2362,7 +2371,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
}
/* Init poison supported flag, the default value is false */
- if (adev->df.funcs &&
+ if (adev->gmc.xgmi.connected_to_cpu) {
+ /* enabled by default when GPU is connected to CPU */
+ con->poison_supported = true;
+ }
+ else if (adev->df.funcs &&
adev->df.funcs->query_ras_poison_mode &&
adev->umc.ras_funcs &&
adev->umc.ras_funcs->query_ras_poison_mode) {
@@ -2503,7 +2516,6 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev,
amdgpu_ras_sysfs_remove(adev, ras_block);
if (ih_info->cb)
amdgpu_ras_interrupt_remove_handler(adev, ih_info);
- amdgpu_ras_feature_enable(adev, ras_block, 0);
}
/* do some init work after IP late init as dependence.
@@ -2673,7 +2685,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
* and error occurred in DramECC (Extended error code = 0) then only
* process the error, else bail out.
*/
- if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
+ if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
(XEC(m->status, 0x3f) == 0x0)))
return NOTIFY_DONE;