diff options
author | Dave Airlie <airlied@redhat.com> | 2020-02-28 15:40:26 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2020-02-28 15:40:26 +1000 |
commit | a2ae604da74dcf9ae674d3c03efad80574952800 (patch) | |
tree | 0e232e6cbb76c0bc48d315d5d8269de108acceef /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | 4825b61a3d39eceef7db723808103aa60fc24520 (diff) | |
parent | c6385e503aeaf99511bc5f67c4cdf5d7926df45b (diff) |
Merge tag 'amd-drm-next-5.7-2020-02-26' of git://people.freedesktop.org/~agd5f/linux into drm-next
amd-drm-next-5.7-2020-02-26:
amdgpu:
- Rework VM update handling in preparation for HMM support
- HDCP srm support
- PSR fixes
- DC watermark fixes
- OLED panel support
- SR-IOV fixes
- BACO fixes
- Optimize debugging vram access
- RAS fixes
- Use BACO for runtime pm
- HDCP fixes
- XGMI fixes
- DDC fixes
- DC clock programming optimizations and fixes
- PSP fw loading sequence updates
- Drop DRIVER_USE_AGP
- Remove legacy drm load and unload callbacks
amdkfd:
- Add runtime pm support
radeon:
- Drop DRIVER_USE_AGP
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexdeucher@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200227043142.4075-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 53 |
1 files changed, 37 insertions, 16 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index cef94e2169fe..930633a0ed64 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -31,6 +31,7 @@ #include "amdgpu.h" #include "amdgpu_ras.h" #include "amdgpu_atomfirmware.h" +#include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" const char *ras_error_string[] = { @@ -742,20 +743,6 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, return 0; } -uint64_t get_xgmi_relative_phy_addr(struct amdgpu_device *adev, uint64_t addr) -{ - uint32_t df_inst_id; - - if ((!adev->df.funcs) || - (!adev->df.funcs->get_df_inst_id) || - (!adev->df.funcs->get_dram_base_addr)) - return addr; - - df_inst_id = adev->df.funcs->get_df_inst_id(adev); - - return addr + adev->df.funcs->get_dram_base_addr(adev, df_inst_id); -} - /* wrapper of psp_ras_trigger_error */ int amdgpu_ras_error_inject(struct amdgpu_device *adev, struct ras_inject_if *info) @@ -775,8 +762,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, /* Calculate XGMI relative offset */ if (adev->gmc.xgmi.num_physical_nodes > 1) { - block_info.address = get_xgmi_relative_phy_addr(adev, - block_info.address); + block_info.address = + amdgpu_xgmi_get_relative_phy_addr(adev, + block_info.address); } switch (info->head.block) { @@ -1319,6 +1307,33 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) } /* ih end */ +/* traversal all IPs except NBIO to query error counter */ +static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *obj; + + if (!con) + return; + + list_for_each_entry(obj, &con->head, node) { + struct ras_query_if info = { + .head = obj->head, + }; + + /* + * PCIE_BIF IP has one different isr by ras controller + * interrupt, the specific ras counter query will be + * done in that isr. So skip such block from common + * sync flood interrupt isr calling. + */ + if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF) + continue; + + amdgpu_ras_error_query(adev, &info); + } +} + /* recovery begin */ /* return 0 on success. @@ -1373,6 +1388,12 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_ras *ras = container_of(work, struct amdgpu_ras, recovery_work); + /* + * Query and print non zero error counter per IP block for + * awareness before recovering GPU. + */ + amdgpu_ras_log_on_err_counter(ras->adev); + if (amdgpu_device_should_recover_gpu(ras->adev)) amdgpu_device_gpu_recover(ras->adev, 0); atomic_set(&ras->in_recovery, 0); |