diff options
author | Dave Airlie <airlied@redhat.com> | 2022-11-08 16:32:31 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2022-11-08 16:32:31 +1000 |
commit | 49e8e6343df688d68b12c2af50791ca37520f0b7 (patch) | |
tree | 44b193d9161a2fa071e7b1ecb96014e8e73a240a /drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | |
parent | 60ba8c5bd94e17ab4b024f5cecf8b48e2cf36412 (diff) | |
parent | fcf00f8d29f2fc6bf00531a1447be28b99073cc3 (diff) |
Merge tag 'amd-drm-next-6.2-2022-11-04' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.2-2022-11-04:
amdgpu:
- Add TMZ support for GC 11.0.1
- More IP version check conversions
- Mode2 reset fixes for sienna cichlid
- SMU 13.x fixes
- RAS enablement on MP 13.x
- Replace kmap with kmap_local_page()
- Misc Clang warning fixes
- SR-IOV fixes for GC 11.x
- PCI AER fix
- DCN 3.2.x commit sequence rework
- SDMA 4.x doorbell fix
- Expose additional new GC 11.x firmware versions
- Misc code cleanups
- S0i3 fixes
- More DC FPU cleanup
- Add more DC kerneldoc
- Misc spelling and grammer fixes
- DCN 3.1.x fixes
- Plane modifier fix
- MCA RAS enablement
- Secure display locking fix
- RAS TA rework
- RAS EEPROM fixes
- Fail suspend if eviction fails
- Drop AMD specific DSC workarounds in favor of drm EDID quirks
- SR-IOV suspend/resume fixes
- Enable DCN support for ARM
- Enable secure display on DCN 2.1
amdkfd:
- Cache size fixes for GC 10.3.x
- kfd_dev struct cleanup
- GC11.x CWSR trap handler fix
- Userptr fixes
- Warning fixes
radeon:
- Replace kmap with kmap_local_page()
UAPI:
- Expose additional new GC 11.x firmware versions via the existing INFO query
drm:
- Add some new EDID DSC quirks
Signed-off-by: Dave Airlie <airlied@redhat.com>
# Conflicts:
# drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20221104205827.6008-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 87 |
1 files changed, 73 insertions, 14 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index aad3c8b4c810..f76c19fc0392 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -22,6 +22,59 @@ */ #include "amdgpu.h" +#include "umc_v6_7.h" + +static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, + struct ras_err_data *err_data, uint64_t err_addr, + uint32_t ch_inst, uint32_t umc_inst) +{ + switch (adev->ip_versions[UMC_HWIP][0]) { + case IP_VERSION(6, 7, 0): + umc_v6_7_convert_error_address(adev, + err_data, err_addr, ch_inst, umc_inst); + break; + default: + dev_warn(adev->dev, + "UMC address to Physical address translation is not supported\n"); + return AMDGPU_RAS_FAIL; + } + + return AMDGPU_RAS_SUCCESS; +} + +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, + uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst) +{ + struct ras_err_data err_data = {0, 0, 0, NULL}; + int ret = AMDGPU_RAS_FAIL; + + err_data.err_addr = + kcalloc(adev->umc.max_ras_err_cnt_per_query, + sizeof(struct eeprom_table_record), GFP_KERNEL); + if (!err_data.err_addr) { + dev_warn(adev->dev, + "Failed to alloc memory for umc error record in MCA notifier!\n"); + return AMDGPU_RAS_FAIL; + } + + /* + * Translate UMC channel address to Physical address + */ + ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr, + ch_inst, umc_inst); + if (ret) + goto out; + + if (amdgpu_bad_page_threshold != 0) { + amdgpu_ras_add_bad_pages(adev, err_data.err_addr, + err_data.err_addr_cnt); + amdgpu_ras_save_bad_pages(adev); + } + +out: + kfree(err_data.err_addr); + return ret; +} static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, void *ras_error_status, @@ -112,23 +165,29 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, return AMDGPU_RAS_SUCCESS; } -int amdgpu_umc_poison_handler(struct amdgpu_device *adev, - void *ras_error_status, - bool reset) +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) { - int ret; - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - struct ras_common_if head = { - .block = AMDGPU_RAS_BLOCK__UMC, - }; - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); + int ret = AMDGPU_RAS_SUCCESS; - ret = - amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset); + if (!adev->gmc.xgmi.connected_to_cpu) { + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct ras_common_if head = { + .block = AMDGPU_RAS_BLOCK__UMC, + }; + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); - if (ret == AMDGPU_RAS_SUCCESS && obj) { - obj->err_data.ue_count += err_data->ue_count; - obj->err_data.ce_count += err_data->ce_count; + ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset); + + if (ret == AMDGPU_RAS_SUCCESS && obj) { + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + } + } else if (reset) { + /* MCA poison handler is only responsible for GPU reset, + * let MCA notifier do page retirement. + */ + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + amdgpu_ras_reset_gpu(adev); } return ret; |