From cbe4d43ea5e903ae3e8555cc39047b0ed027738a Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Mon, 17 Oct 2022 18:22:13 +0800 Subject: drm/amdgpu: add RAS page retirement functions for MCA Define page retirement functions for MCA platform. v2: remove page retirement handling from MCA poison handler, let MCA notifier do page retirement. v3: remove specific poison handler for MCA to simplify code. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 53 +++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index aad3c8b4c810..3c83129f4090 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -22,6 +22,59 @@ */ #include "amdgpu.h" +#include "umc_v6_7.h" + +static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, + struct ras_err_data *err_data, uint64_t err_addr, + uint32_t ch_inst, uint32_t umc_inst) +{ + switch (adev->ip_versions[UMC_HWIP][0]) { + case IP_VERSION(6, 7, 0): + umc_v6_7_convert_error_address(adev, + err_data, err_addr, ch_inst, umc_inst); + break; + default: + dev_warn(adev->dev, + "UMC address to Physical address translation is not supported\n"); + return AMDGPU_RAS_FAIL; + } + + return AMDGPU_RAS_SUCCESS; +} + +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, + uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst) +{ + struct ras_err_data err_data = {0, 0, 0, NULL}; + int ret = AMDGPU_RAS_FAIL; + + err_data.err_addr = + kcalloc(adev->umc.max_ras_err_cnt_per_query, + sizeof(struct eeprom_table_record), GFP_KERNEL); + if (!err_data.err_addr) { + dev_warn(adev->dev, + "Failed to alloc memory for umc error record in MCA notifier!\n"); + return AMDGPU_RAS_FAIL; + } + + /* + * Translate UMC channel address to Physical address + */ + ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr, + ch_inst, umc_inst); + if (ret) + goto out; + + if (amdgpu_bad_page_threshold != 0) { + amdgpu_ras_add_bad_pages(adev, err_data.err_addr, + err_data.err_addr_cnt); + amdgpu_ras_save_bad_pages(adev); + } + +out: + kfree(err_data.err_addr); + return ret; +} static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, void *ras_error_status, -- cgit From ae45a18b80d9d0d29f0ecfc52fb4e7831671b299 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Mon, 17 Oct 2022 18:31:20 +0800 Subject: drm/amdgpu: add RAS poison handling for MCA For MCA poison, if unmap queue fails, only gpu reset should be triggered without page retirement handling, MCA notifier will do it. v2: handle MCA poison consumption in umc_poison_handler directly. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 3c83129f4090..758942150c09 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -169,19 +169,28 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, void *ras_error_status, bool reset) { - int ret; - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - struct ras_common_if head = { - .block = AMDGPU_RAS_BLOCK__UMC, - }; - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); + int ret = AMDGPU_RAS_SUCCESS; - ret = - amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset); + if (!adev->gmc.xgmi.connected_to_cpu) { + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + struct ras_common_if head = { + .block = AMDGPU_RAS_BLOCK__UMC, + }; + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); - if (ret == AMDGPU_RAS_SUCCESS && obj) { - obj->err_data.ue_count += err_data->ue_count; - obj->err_data.ce_count += err_data->ce_count; + ret = + amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset); + + if (ret == AMDGPU_RAS_SUCCESS && obj) { + obj->err_data.ue_count += err_data->ue_count; + obj->err_data.ce_count += err_data->ce_count; + } + } else if (reset) { + /* MCA poison handler is only responsible for GPU reset, + * let MCA notifier do page retirement. + */ + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + amdgpu_ras_reset_gpu(adev); } return ret; -- cgit From 1ed0e176902483e67cd02530d387a7551b0e99a4 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Tue, 18 Oct 2022 10:31:09 +0800 Subject: drm/amdgpu: remove ras_error_status parameter for UMC poison handler Make the code simpler. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 4 +--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 13 +++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 4 +--- 4 files changed, 8 insertions(+), 16 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 0561812aa0a4..37db39ba8718 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -753,9 +753,7 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev) void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset) { - struct ras_err_data err_data = {0, 0, 0, NULL}; - - amdgpu_umc_poison_handler(adev, &err_data, reset); + amdgpu_umc_poison_handler(adev, reset); } bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 28463b47ce33..693bce07eb46 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1561,7 +1561,6 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * { bool poison_stat = false; struct amdgpu_device *adev = obj->adev; - struct ras_err_data err_data = {0, 0, 0, NULL}; struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, obj->head.block, 0); @@ -1584,7 +1583,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * } if (!adev->gmc.xgmi.connected_to_cpu) - amdgpu_umc_poison_handler(adev, &err_data, false); + amdgpu_umc_poison_handler(adev, false); if (block_obj->hw_ops->handle_poison_consumption) poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 758942150c09..f76c19fc0392 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -165,25 +165,22 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, return AMDGPU_RAS_SUCCESS; } -int amdgpu_umc_poison_handler(struct amdgpu_device *adev, - void *ras_error_status, - bool reset) +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) { int ret = AMDGPU_RAS_SUCCESS; if (!adev->gmc.xgmi.connected_to_cpu) { - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + struct ras_err_data err_data = {0, 0, 0, NULL}; struct ras_common_if head = { .block = AMDGPU_RAS_BLOCK__UMC, }; struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); - ret = - amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset); + ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset); if (ret == AMDGPU_RAS_SUCCESS && obj) { - obj->err_data.ue_count += err_data->ue_count; - obj->err_data.ce_count += err_data->ce_count; + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; } } else if (reset) { /* MCA poison handler is only responsible for GPU reset, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index 659a10de29c9..a6951160f13a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -83,9 +83,7 @@ struct amdgpu_umc { }; int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -int amdgpu_umc_poison_handler(struct amdgpu_device *adev, - void *ras_error_status, - bool reset); +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset); int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry); -- cgit