aboutsummaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorLinus Torvalds <[email protected]>2020-04-01 15:24:20 -0700
committerLinus Torvalds <[email protected]>2020-04-01 15:24:20 -0700
commitf365ab31efacb70bed1e821f7435626e0b2528a6 (patch)
treee1374b2896d50e652c1e434d70e834d0788aae3a /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent4646de87d32526ee87b46c2e0130413367fb5362 (diff)
parent59e7a8cc2dcf335116d500d684bfb34d1d97a6fe (diff)
Merge tag 'drm-next-2020-04-01' of git://anongit.freedesktop.org/drm/drm
Pull drm updates from Dave Airlie: "This is the main drm pull request for 5.7-rc1. Highlights: - i915 enables Tigerlake by default - i915 and amdgpu have initial OLED backlight support [ Jani Nikula pipes up and points out that we've had a bunch of "initial support" code for a long time already, but only now Lyude made it actually work on real world machines ] - vmwgfx add support to enable OpenGL 4 userspace - zero length arrays are mostly removed. Detailed summary: new driver: - tidss: TI Keystone platform display subsystem core: - new drm device warn macros - mode config valid for memory constrained devices - bridge bus format negotation - consolidated fake vblank event handling - dma_alloc related cleanups - drop get_crtc callback - dp: DP1.4 EDID corruption test - EDID CEA detailed timings improvements - relicense some code to dual GPL2/MIT - convert core vblank support to per-crtc support - rework drm_global_mutex - bridge rework to allow omap_dss custom driver removeal - remove drm_fb_helper connector interrfaces - zero-length array removal scheduler: - support for modifying the sched list - revert job distribution optimization - helper to pick least loaded scheduler - race condition fix mst: - various fixes - remove register_connector callback i915: - uapi to allows userspace specific CS ring buffer sizes - Tigerlake enablement patches + Tigerlake enabled by default - new sysfs entries for engine properties - display/logging refactors - eDP/DP fixes for DPCD - Gen7 back to aliasing-ppgtt - Gen8+ irq refactor - Avoid globals - GEM locking fixes and simplifications - Ice Lake and Elkhart Lake fixes and workarounds - Baytrail/Haswell instability fix - GVT - VFIO edid better support amdgpu: - Rework VM update handling in preparation for HMM support - drm load/unload removal fixups - USB-C PD firmware updates - HDCP srm support - Navi/renoir PM watermark fixes - OLED panel support - Optimize debugging vram access - Use BACO for runtime pm - DC clock programming optimizations and fixes - PSP fw loading sequence updates - Drop DRIVER_USE_AGP - Remove legacy drm load and unload callbacks - ACP Kconfig fix - Lots of fixes across the driver amdkfd: - runtime pm support - more gfx config details in amdgpu radeon: - drop DRIVER_USE_AGP vmwgfx: - Disable DMA when SEV encryption in use - Shader Model 5 support - needed for GL4 support msm: - DPU resource manager refactor - dpu using atomic global state mediatek: - MT8183 DPI support etnaviv: - out-of-bounds read fix - expose feature flags for GC400 STM32MP1 SoC - runtime suspend entry fix - dma32 zone fix hisilicon: - mode selection fixes meson: - YUV420 support lima: - add support for heap buffers tinydrm: - removal of owner field - explicit DT dependency removal - YAML schema conversion tegra: - misc cleanups tidss: - new driver virtio: - better batching of notifications to host - memory handling reworked - shmem + gpu context fixes hibmc: - add gamma_set support - improve DPMS support pl111: - Integrator IM-PD1 support sun4i: - LVDS support for A20 + A33 - DSI panel handling improvements" * tag 'drm-next-2020-04-01' of git://anongit.freedesktop.org/drm/drm: (1537 commits) drm/i915/display: Fix mode private_flags comparison at atomic_check drm/i915/gt: Stage the transfer of the virtual breadcrumb drm/i915/gt: Select the deepest available parking mode for rc6 drm/i915: Avoid live-lock with i915_vma_parked() drm/i915/gt: Treat idling as a RPS downclock event drm/i915/gt: Cancel a hung context if already closed drm/i915: Use explicit flag to mark unreachable intel_context drm/amdgpu: don't try to reserve training bo for sriov (v2) drm/amdgpu/smu11: add support for SMU AC/DC interrupts drm/amdgpu/swSMU: handle manual AC/DC notifications drm/amdgpu/swSMU: handle DC controlled by GPIO for navi1x drm/amdgpu/swSMU: set AC/DC mode based on the current system state (v2) drm/amdgpu/swSMU: correct the bootup power source for Navi1X (v2) drm/amdgpu/swSMU: use the smu11 power source helper for navi1x drm/amdgpu/smu11: add a helper to set the power source drm/amd/swSMU: add callback to set AC/DC power source (v2) drm/scheduler: fix rare NULL ptr race drm/amdgpu: fix the coverage issue to clear ArcVPGRs drm/amd/display: Fix pageflip event race condition for DCN. drm/[radeon|amdgpu]: Remove HAINAN board from max_sclk override check ...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c120
1 files changed, 93 insertions, 27 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index cef94e2169fe..3c32a94d2424 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -31,6 +31,7 @@
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
+#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
const char *ras_error_string[] = {
@@ -280,6 +281,11 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
struct ras_debug_if data;
int ret = 0;
+ if (amdgpu_ras_intr_triggered()) {
+ DRM_WARN("RAS WARN: error injection currently inaccessible\n");
+ return size;
+ }
+
ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
if (ret)
return -EINVAL;
@@ -393,6 +399,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
.head = obj->head,
};
+ if (amdgpu_ras_intr_triggered())
+ return snprintf(buf, PAGE_SIZE,
+ "Query currently inaccessible\n");
+
if (amdgpu_ras_error_query(obj->adev, &info))
return -EINVAL;
@@ -720,6 +730,9 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
if (adev->nbio.funcs->query_ras_error_count)
adev->nbio.funcs->query_ras_error_count(adev, &err_data);
break;
+ case AMDGPU_RAS_BLOCK__XGMI_WAFL:
+ amdgpu_xgmi_query_ras_error_count(adev, &err_data);
+ break;
default:
break;
}
@@ -742,20 +755,6 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
return 0;
}
-uint64_t get_xgmi_relative_phy_addr(struct amdgpu_device *adev, uint64_t addr)
-{
- uint32_t df_inst_id;
-
- if ((!adev->df.funcs) ||
- (!adev->df.funcs->get_df_inst_id) ||
- (!adev->df.funcs->get_dram_base_addr))
- return addr;
-
- df_inst_id = adev->df.funcs->get_df_inst_id(adev);
-
- return addr + adev->df.funcs->get_dram_base_addr(adev, df_inst_id);
-}
-
/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
struct ras_inject_if *info)
@@ -775,8 +774,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
/* Calculate XGMI relative offset */
if (adev->gmc.xgmi.num_physical_nodes > 1) {
- block_info.address = get_xgmi_relative_phy_addr(adev,
- block_info.address);
+ block_info.address =
+ amdgpu_xgmi_get_relative_phy_addr(adev,
+ block_info.address);
}
switch (info->head.block) {
@@ -1122,6 +1122,32 @@ void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
&amdgpu_ras_debugfs_ops);
}
+void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_manager *obj;
+ struct ras_fs_if fs_info;
+
+ /*
+ * it won't be called in resume path, no need to check
+ * suspend and gpu reset status
+ */
+ if (!con)
+ return;
+
+ amdgpu_ras_debugfs_create_ctrl_node(adev);
+
+ list_for_each_entry(obj, &con->head, node) {
+ if (amdgpu_ras_is_supported(adev, obj->head.block) &&
+ (obj->attr_inuse == 1)) {
+ sprintf(fs_info.debugfs_name, "%s_err_inject",
+ ras_block_str(obj->head.block));
+ fs_info.head = obj->head;
+ amdgpu_ras_debugfs_create(adev, &fs_info);
+ }
+ }
+}
+
void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
struct ras_common_if *head)
{
@@ -1154,7 +1180,6 @@ static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
amdgpu_ras_sysfs_create_feature_node(adev);
- amdgpu_ras_debugfs_create_ctrl_node(adev);
return 0;
}
@@ -1319,6 +1344,33 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
}
/* ih end */
+/* traversal all IPs except NBIO to query error counter */
+static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct ras_manager *obj;
+
+ if (!con)
+ return;
+
+ list_for_each_entry(obj, &con->head, node) {
+ struct ras_query_if info = {
+ .head = obj->head,
+ };
+
+ /*
+ * PCIE_BIF IP has one different isr by ras controller
+ * interrupt, the specific ras counter query will be
+ * done in that isr. So skip such block from common
+ * sync flood interrupt isr calling.
+ */
+ if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
+ continue;
+
+ amdgpu_ras_error_query(adev, &info);
+ }
+}
+
/* recovery begin */
/* return 0 on success.
@@ -1373,6 +1425,12 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
struct amdgpu_ras *ras =
container_of(work, struct amdgpu_ras, recovery_work);
+ /*
+ * Query and print non zero error counter per IP block for
+ * awareness before recovering GPU.
+ */
+ amdgpu_ras_log_on_err_counter(ras->adev);
+
if (amdgpu_device_should_recover_gpu(ras->adev))
amdgpu_device_gpu_recover(ras->adev, 0);
atomic_set(&ras->in_recovery, 0);
@@ -1713,18 +1771,30 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
*hw_supported = 0;
*supported = 0;
- if (amdgpu_sriov_vf(adev) ||
+ if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
(adev->asic_type != CHIP_VEGA20 &&
adev->asic_type != CHIP_ARCTURUS))
return;
- if (adev->is_atom_fw &&
- (amdgpu_atomfirmware_mem_ecc_supported(adev) ||
- amdgpu_atomfirmware_sram_ecc_supported(adev)))
- *hw_supported = AMDGPU_RAS_BLOCK_MASK;
+ if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+ DRM_INFO("HBM ECC is active.\n");
+ *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+ } else
+ DRM_INFO("HBM ECC is not presented.\n");
+
+ if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+ DRM_INFO("SRAM ECC is active.\n");
+ *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+ } else
+ DRM_INFO("SRAM ECC is not presented.\n");
+
+ /* hw_supported needs to be aligned with RAS block mask. */
+ *hw_supported &= AMDGPU_RAS_BLOCK_MASK;
*supported = amdgpu_ras_enable == 0 ?
- 0 : *hw_supported & amdgpu_ras_mask;
+ 0 : *hw_supported & amdgpu_ras_mask;
}
int amdgpu_ras_init(struct amdgpu_device *adev)
@@ -1825,8 +1895,6 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
goto interrupt;
}
- amdgpu_ras_debugfs_create(adev, fs_info);
-
r = amdgpu_ras_sysfs_create(adev, fs_info);
if (r)
goto sysfs;
@@ -1835,7 +1903,6 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
cleanup:
amdgpu_ras_sysfs_remove(adev, ras_block);
sysfs:
- amdgpu_ras_debugfs_remove(adev, ras_block);
if (ih_info->cb)
amdgpu_ras_interrupt_remove_handler(adev, ih_info);
interrupt:
@@ -1852,7 +1919,6 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev,
return;
amdgpu_ras_sysfs_remove(adev, ras_block);
- amdgpu_ras_debugfs_remove(adev, ras_block);
if (ih_info->cb)
amdgpu_ras_interrupt_remove_handler(adev, ih_info);
amdgpu_ras_feature_enable(adev, ras_block, 0);