From a6dcf9a7ccfed57abd44c24cc505b559281d44b9 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Sat, 11 Mar 2023 17:36:11 +0800 Subject: drm/amdgpu: Move umc ras block init to gmc ras sw_init Initialize umc ras block only when umc ip block supports ras. Driver queries ras capabilities after early_init, ras block init needs to be moved to sw_init. Signed-off-by: Hawking Zhang Reviewed-by: Stanley Yang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 1b8574bc4463..da68ceaa024c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -208,6 +208,36 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true); } +int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) +{ + int err; + struct amdgpu_umc_ras *ras; + + if (!adev->umc.ras) + return 0; + + ras = adev->umc.ras; + + err = amdgpu_ras_register_ras_block(adev, &ras->ras_block); + if (err) { + dev_err(adev->dev, "Failed to register umc ras block!\n"); + return err; + } + + strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc"); + ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC; + ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; + adev->umc.ras_if = &ras->ras_block.ras_comm; + + if (!ras->ras_block.ras_late_init) + ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init; + + if (ras->ras_block.ras_cb) + ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb; + + return 0; +} + int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { int r; -- cgit From cfa075982768840c468c874219dbec558722cb7f Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Mon, 20 Mar 2023 17:33:47 +0800 Subject: drm/amdgpu: Initialize umc ras callback Fix a coding error which results to null interrupt handler for umc ras. Signed-off-by: Hawking Zhang Reviewed-by: Stanley Yang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index da68ceaa024c..9e2e97207e53 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -232,7 +232,7 @@ int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) if (!ras->ras_block.ras_late_init) ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init; - if (ras->ras_block.ras_cb) + if (!ras->ras_block.ras_cb) ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb; return 0; -- cgit From e86bd8b21d57670e38c23ed435a053e7e4cc9e21 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Mon, 27 Mar 2023 14:23:30 +0800 Subject: drm/amdgpu: optimize redundant code in umc_v8_10 Optimize redundant code in umc_v8_10 Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 31 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 7 ++ drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 197 +++++++++++++------------------- 3 files changed, 115 insertions(+), 120 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 9e2e97207e53..1edf8e6aeb16 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -302,3 +302,34 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data, err_data->err_addr_cnt++; } + +int amdgpu_umc_loop_channels(struct amdgpu_device *adev, + umc_func func, void *data) +{ + uint32_t node_inst = 0; + uint32_t umc_inst = 0; + uint32_t ch_inst = 0; + int ret = 0; + + if (adev->umc.node_inst_num) { + LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { + ret = func(adev, node_inst, umc_inst, ch_inst, data); + if (ret) { + dev_err(adev->dev, "Node %d umc %d ch %d func returns %d\n", + node_inst, umc_inst, ch_inst, ret); + return ret; + } + } + } else { + LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) { + ret = func(adev, 0, umc_inst, ch_inst, data); + if (ret) { + dev_err(adev->dev, "Umc %d ch %d func returns %d\n", + umc_inst, ch_inst, ret); + return ret; + } + } + } + + return 0; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index d7f1229ff11f..86133f77a9a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -47,6 +47,10 @@ #define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) \ LOOP_UMC_NODE_INST((node_inst)) LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst)) + +typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst, + uint32_t umc_inst, uint32_t ch_inst, void *data); + struct amdgpu_umc_ras { struct amdgpu_ras_block_object ras_block; void (*err_cnt_init)(struct amdgpu_device *adev); @@ -104,4 +108,7 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry); int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst); + +int amdgpu_umc_loop_channels(struct amdgpu_device *adev, + umc_func func, void *data); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c index fb55e8cb9967..d51ae0bc36f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c @@ -76,10 +76,13 @@ static inline uint32_t get_umc_v8_10_reg_offset(struct amdgpu_device *adev, UMC_8_NODE_DIST * node_inst; } -static void umc_v8_10_clear_error_count_per_channel(struct amdgpu_device *adev, - uint32_t umc_reg_offset) +static int umc_v8_10_clear_error_count_per_channel(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) { uint32_t ecc_err_cnt_addr; + uint32_t umc_reg_offset = + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); ecc_err_cnt_addr = SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCnt); @@ -87,24 +90,14 @@ static void umc_v8_10_clear_error_count_per_channel(struct amdgpu_device *adev, /* clear error count */ WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_10_CE_CNT_INIT); + + return 0; } static void umc_v8_10_clear_error_count(struct amdgpu_device *adev) { - uint32_t node_inst = 0; - uint32_t umc_inst = 0; - uint32_t ch_inst = 0; - uint32_t umc_reg_offset = 0; - - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { - umc_reg_offset = get_umc_v8_10_reg_offset(adev, - node_inst, - umc_inst, - ch_inst); - - umc_v8_10_clear_error_count_per_channel(adev, - umc_reg_offset); - } + amdgpu_umc_loop_channels(adev, + umc_v8_10_clear_error_count_per_channel, NULL); } static void umc_v8_10_query_correctable_error_count(struct amdgpu_device *adev, @@ -147,29 +140,29 @@ static void umc_v8_10_query_uncorrectable_error_count(struct amdgpu_device *adev *error_count += 1; } +static int umc_v8_10_query_ecc_error_count(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) +{ + struct ras_err_data *err_data = (struct ras_err_data *)data; + uint32_t umc_reg_offset = + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); + + umc_v8_10_query_correctable_error_count(adev, + umc_reg_offset, + &(err_data->ce_count)); + umc_v8_10_query_uncorrectable_error_count(adev, + umc_reg_offset, + &(err_data->ue_count)); + + return 0; +} + static void umc_v8_10_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) { - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - - uint32_t node_inst = 0; - uint32_t umc_inst = 0; - uint32_t ch_inst = 0; - uint32_t umc_reg_offset = 0; - - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { - umc_reg_offset = get_umc_v8_10_reg_offset(adev, - node_inst, - umc_inst, - ch_inst); - - umc_v8_10_query_correctable_error_count(adev, - umc_reg_offset, - &(err_data->ce_count)); - umc_v8_10_query_uncorrectable_error_count(adev, - umc_reg_offset, - &(err_data->ue_count)); - } + amdgpu_umc_loop_channels(adev, + umc_v8_10_query_ecc_error_count, ras_error_status); umc_v8_10_clear_error_count(adev); } @@ -248,28 +241,28 @@ static void umc_v8_10_convert_error_address(struct amdgpu_device *adev, } } -static void umc_v8_10_query_error_address(struct amdgpu_device *adev, - struct ras_err_data *err_data, - uint32_t umc_reg_offset, - uint32_t node_inst, - uint32_t ch_inst, - uint32_t umc_inst) +static int umc_v8_10_query_error_address(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) { uint64_t mc_umc_status_addr; uint64_t mc_umc_status, err_addr; uint64_t mc_umc_addrt0; + struct ras_err_data *err_data = (struct ras_err_data *)data; + uint32_t umc_reg_offset = + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4); if (mc_umc_status == 0) - return; + return 0; if (!err_data->err_addr) { /* clear umc status */ WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); - return; + return 0; } /* calculate error address if ue error is detected */ @@ -287,37 +280,25 @@ static void umc_v8_10_query_error_address(struct amdgpu_device *adev, /* clear umc status */ WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); + + return 0; } static void umc_v8_10_query_ras_error_address(struct amdgpu_device *adev, void *ras_error_status) { - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - uint32_t node_inst = 0; - uint32_t umc_inst = 0; - uint32_t ch_inst = 0; - uint32_t umc_reg_offset = 0; - - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { - umc_reg_offset = get_umc_v8_10_reg_offset(adev, - node_inst, - umc_inst, - ch_inst); - - umc_v8_10_query_error_address(adev, - err_data, - umc_reg_offset, - node_inst, - ch_inst, - umc_inst); - } + amdgpu_umc_loop_channels(adev, + umc_v8_10_query_error_address, ras_error_status); } -static void umc_v8_10_err_cnt_init_per_channel(struct amdgpu_device *adev, - uint32_t umc_reg_offset) +static int umc_v8_10_err_cnt_init_per_channel(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) { uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; uint32_t ecc_err_cnt_addr; + uint32_t umc_reg_offset = + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCntSel); @@ -332,23 +313,14 @@ static void umc_v8_10_err_cnt_init_per_channel(struct amdgpu_device *adev, WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel); /* set error count to initial value */ WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_10_CE_CNT_INIT); + + return 0; } static void umc_v8_10_err_cnt_init(struct amdgpu_device *adev) { - uint32_t node_inst = 0; - uint32_t umc_inst = 0; - uint32_t ch_inst = 0; - uint32_t umc_reg_offset = 0; - - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { - umc_reg_offset = get_umc_v8_10_reg_offset(adev, - node_inst, - umc_inst, - ch_inst); - - umc_v8_10_err_cnt_init_per_channel(adev, umc_reg_offset); - } + amdgpu_umc_loop_channels(adev, + umc_v8_10_err_cnt_init_per_channel, NULL); } static bool umc_v8_10_query_ras_poison_mode(struct amdgpu_device *adev) @@ -406,37 +378,35 @@ static void umc_v8_10_ecc_info_query_uncorrectable_error_count(struct amdgpu_dev } } +static int umc_v8_10_ecc_info_query_ecc_error_count(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) +{ + struct ras_err_data *err_data = (struct ras_err_data *)data; + + umc_v8_10_ecc_info_query_correctable_error_count(adev, + node_inst, umc_inst, ch_inst, + &(err_data->ce_count)); + umc_v8_10_ecc_info_query_uncorrectable_error_count(adev, + node_inst, umc_inst, ch_inst, + &(err_data->ue_count)); + return 0; +} + static void umc_v8_10_ecc_info_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) { - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - - uint32_t node_inst = 0; - uint32_t umc_inst = 0; - uint32_t ch_inst = 0; - - /* TODO: driver needs to toggle DF Cstate to ensure - * safe access of UMC registers. Will add the protection - */ - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { - umc_v8_10_ecc_info_query_correctable_error_count(adev, - node_inst, umc_inst, ch_inst, - &(err_data->ce_count)); - umc_v8_10_ecc_info_query_uncorrectable_error_count(adev, - node_inst, umc_inst, ch_inst, - &(err_data->ue_count)); - } + amdgpu_umc_loop_channels(adev, + umc_v8_10_ecc_info_query_ecc_error_count, ras_error_status); } -static void umc_v8_10_ecc_info_query_error_address(struct amdgpu_device *adev, - struct ras_err_data *err_data, - uint32_t ch_inst, - uint32_t umc_inst, - uint32_t node_inst) +static int umc_v8_10_ecc_info_query_error_address(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) { uint32_t eccinfo_table_idx; uint64_t mc_umc_status, err_addr; - + struct ras_err_data *err_data = (struct ras_err_data *)data; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); eccinfo_table_idx = node_inst * adev->umc.umc_inst_num * @@ -447,10 +417,10 @@ static void umc_v8_10_ecc_info_query_error_address(struct amdgpu_device *adev, mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; if (mc_umc_status == 0) - return; + return 0; if (!err_data->err_addr) - return; + return 0; /* calculate error address if ue error is detected */ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && @@ -463,28 +433,15 @@ static void umc_v8_10_ecc_info_query_error_address(struct amdgpu_device *adev, umc_v8_10_convert_error_address(adev, err_data, err_addr, ch_inst, umc_inst, node_inst, mc_umc_status); } + + return 0; } static void umc_v8_10_ecc_info_query_ras_error_address(struct amdgpu_device *adev, void *ras_error_status) { - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - - uint32_t node_inst = 0; - uint32_t umc_inst = 0; - uint32_t ch_inst = 0; - - /* TODO: driver needs to toggle DF Cstate to ensure - * safe access of UMC resgisters. Will add the protection - * when firmware interface is ready - */ - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { - umc_v8_10_ecc_info_query_error_address(adev, - err_data, - ch_inst, - umc_inst, - node_inst); - } + amdgpu_umc_loop_channels(adev, + umc_v8_10_ecc_info_query_error_address, ras_error_status); } const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = { -- cgit