From 61380faa4b4cc577df8a7ff5db5859bac6b351f7 Mon Sep 17 00:00:00 2001 From: John Clements Date: Wed, 25 Mar 2020 16:01:14 +0800 Subject: drm/amdgpu: disable ras query and iject during gpu reset added flag to ras context to indicate if ras query functionality is ready Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3c32a94d2424..9e9e0f7747b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -80,6 +80,20 @@ atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr); +void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) +{ + if (adev) + amdgpu_ras_get_context(adev)->error_query_ready = ready; +} + +bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) +{ + if (adev) + return amdgpu_ras_get_context(adev)->error_query_ready; + + return false; +} + static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { @@ -281,7 +295,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * struct ras_debug_if data; int ret = 0; - if (amdgpu_ras_intr_triggered()) { + if (!amdgpu_ras_get_error_query_ready(adev)) { DRM_WARN("RAS WARN: error injection currently inaccessible\n"); return size; } @@ -399,7 +413,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, .head = obj->head, }; - if (amdgpu_ras_intr_triggered()) + if (!amdgpu_ras_get_error_query_ready(obj->adev)) return snprintf(buf, PAGE_SIZE, "Query currently inaccessible\n"); @@ -1886,8 +1900,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, } /* in resume phase, no need to create ras fs node */ - if (adev->in_suspend || adev->in_gpu_reset) + if (adev->in_suspend || adev->in_gpu_reset) { + amdgpu_ras_set_error_query_ready(adev, true); return 0; + } if (ih_info->cb) { r = amdgpu_ras_interrupt_add_handler(adev, ih_info); @@ -1899,6 +1915,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, if (r) goto sysfs; + amdgpu_ras_set_error_query_ready(adev, true); + return 0; cleanup: amdgpu_ras_sysfs_remove(adev, ras_block); -- cgit From a9d82d2f91297679cfafd7e61c4bccdca6cd550d Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Fri, 27 Mar 2020 15:39:06 +0800 Subject: drm/amdgpu: fix non-pointer dereference for non-RAS supported Backtrace on gpu recover test on Navi10. [ 1324.516681] RIP: 0010:amdgpu_ras_set_error_query_ready+0x15/0x20 [amdgpu] [ 1324.523778] Code: 4c 89 f7 e8 cd a2 a0 d8 e9 99 fe ff ff 45 31 ff e9 91 fe ff ff 0f 1f 44 00 00 55 48 85 ff 48 89 e5 74 0e 48 8b 87 d8 2b 01 00 <40> 88 b0 38 01 00 00 5d c3 66 90 0f 1f 44 00 00 55 31 c0 48 85 ff [ 1324.543452] RSP: 0018:ffffaa1040e4bd28 EFLAGS: 00010286 [ 1324.549025] RAX: 0000000000000000 RBX: ffff911198b20000 RCX: 0000000000000000 [ 1324.556217] RDX: 00000000000c0a01 RSI: 0000000000000000 RDI: ffff911198b20000 [ 1324.563514] RBP: ffffaa1040e4bd28 R08: 0000000000001000 R09: ffff91119d0028c0 [ 1324.570804] R10: ffffffff9a606b40 R11: 0000000000000000 R12: 0000000000000000 [ 1324.578413] R13: ffffaa1040e4bd70 R14: ffff911198b20000 R15: 0000000000000000 [ 1324.586464] FS: 00007f4441cbf540(0000) GS:ffff91119ed80000(0000) knlGS:0000000000000000 [ 1324.595434] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1324.601345] CR2: 0000000000000138 CR3: 00000003fcdf8004 CR4: 00000000003606e0 [ 1324.608694] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1324.616303] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1324.623678] Call Trace: [ 1324.626270] amdgpu_device_gpu_recover+0x6e7/0xc50 [amdgpu] [ 1324.632018] ? seq_printf+0x4e/0x70 [ 1324.636652] amdgpu_debugfs_gpu_recover+0x50/0x80 [amdgpu] [ 1324.643371] seq_read+0xda/0x420 [ 1324.647601] full_proxy_read+0x5c/0x90 [ 1324.652426] __vfs_read+0x1b/0x40 [ 1324.656734] vfs_read+0x8e/0x130 [ 1324.660981] ksys_read+0xa7/0xe0 [ 1324.665201] __x64_sys_read+0x1a/0x20 [ 1324.669907] do_syscall_64+0x57/0x1c0 [ 1324.674517] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1324.680654] RIP: 0033:0x7f44417cf081 Signed-off-by: Evan Quan Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9e9e0f7747b7..8a78db648442 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -82,13 +82,13 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) { - if (adev) + if (adev && amdgpu_ras_get_context(adev)) amdgpu_ras_get_context(adev)->error_query_ready = ready; } bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) { - if (adev) + if (adev && amdgpu_ras_get_context(adev)) return amdgpu_ras_get_context(adev)->error_query_ready; return false; -- cgit From b3dbd6d3ec495057db425a09516a922e1dacec33 Mon Sep 17 00:00:00 2001 From: John Clements Date: Tue, 7 Apr 2020 15:08:15 +0800 Subject: drm/amdgpu: resolve mGPU RAS query instability upon receiving uncorrectable error, query every GPU node for ras errors Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8a78db648442..b0aa4e1ed4df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1438,12 +1438,22 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) { struct amdgpu_ras *ras = container_of(work, struct amdgpu_ras, recovery_work); + struct amdgpu_device *remote_adev = NULL; + struct amdgpu_device *adev = ras->adev; + struct list_head device_list, *device_list_handle = NULL; + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false); + + /* Build list of devices to query RAS related errors */ + if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { + device_list_handle = &hive->device_list; + } else { + list_add_tail(&adev->gmc.xgmi.head, &device_list); + device_list_handle = &device_list; + } - /* - * Query and print non zero error counter per IP block for - * awareness before recovering GPU. - */ - amdgpu_ras_log_on_err_counter(ras->adev); + list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { + amdgpu_ras_log_on_err_counter(remote_adev); + } if (amdgpu_device_should_recover_gpu(ras->adev)) amdgpu_device_gpu_recover(ras->adev, 0); -- cgit From 6952e99cfd52d32098540fe8d9e592828b9e774c Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Fri, 10 Apr 2020 15:51:14 +0800 Subject: drm/amdgpu: refine ras related message print Prefix ras related kernel message logging with PCI device info by replacing DRM_INFO/WARN/ERROR with dev_info/warn/err. This can clearly tell user about GPU device information where ras is. And add some other ras message printing to make it more clear and friendly as well. Suggested-by: Hawking Zhang Signed-off-by: Guchun Chen Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 51 ++++++++++++++++++++------------- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 10 ++++--- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 16 +++++++---- 3 files changed, 48 insertions(+), 29 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b0aa4e1ed4df..73ae913aee26 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -296,7 +296,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * int ret = 0; if (!amdgpu_ras_get_error_query_ready(adev)) { - DRM_WARN("RAS WARN: error injection currently inaccessible\n"); + dev_warn(adev->dev, "RAS WARN: error injection " + "currently inaccessible\n"); return size; } @@ -324,7 +325,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * /* umc ce/ue error injection for a bad page is not allowed */ if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) && amdgpu_ras_check_bad_page(adev, data.inject.address)) { - DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n", + dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked " + "as bad before error injection!\n", data.inject.address); break; } @@ -590,7 +592,8 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(&adev->psp, &info, enable); if (ret) { - DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n", + dev_err(adev->dev, "RAS ERROR: %s %s feature " + "failed ret %d\n", enable ? "enable":"disable", ras_block_str(head->block), ret); @@ -632,7 +635,8 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, if (ret == -EINVAL) { ret = __amdgpu_ras_feature_enable(adev, head, 1); if (!ret) - DRM_INFO("RAS INFO: %s setup object\n", + dev_info(adev->dev, + "RAS INFO: %s setup object\n", ras_block_str(head->block)); } } else { @@ -758,12 +762,17 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, info->ce_count = obj->err_data.ce_count; if (err_data.ce_count) { - dev_info(adev->dev, "%ld correctable errors detected in %s block\n", - obj->err_data.ce_count, ras_block_str(info->head.block)); + dev_info(adev->dev, "%ld correctable hardware errors " + "detected in %s block, no user " + "action is needed.\n", + obj->err_data.ce_count, + ras_block_str(info->head.block)); } if (err_data.ue_count) { - dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n", - obj->err_data.ue_count, ras_block_str(info->head.block)); + dev_info(adev->dev, "%ld uncorrectable hardware errors " + "detected in %s block\n", + obj->err_data.ue_count, + ras_block_str(info->head.block)); } return 0; @@ -807,13 +816,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, ret = psp_ras_trigger_error(&adev->psp, &block_info); break; default: - DRM_INFO("%s error injection is not supported yet\n", + dev_info(adev->dev, "%s error injection is not supported yet\n", ras_block_str(info->head.block)); ret = -EINVAL; } if (ret) - DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n", + dev_err(adev->dev, "RAS ERROR: inject %s error failed ret %d\n", ras_block_str(info->head.block), ret); @@ -1549,7 +1558,7 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) &data->bps[control->num_recs], true, save_count)) { - DRM_ERROR("Failed to save EEPROM table data!"); + dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; } @@ -1577,7 +1586,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) if (amdgpu_ras_eeprom_process_recods(control, bps, false, control->num_recs)) { - DRM_ERROR("Failed to load EEPROM table records!"); + dev_err(adev->dev, "Failed to load EEPROM table records!"); ret = -EIO; goto out; } @@ -1651,7 +1660,8 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) AMDGPU_GPU_PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, &bo, NULL)) - DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp); + dev_warn(adev->dev, "RAS WARN: reserve vram for " + "retired page %llx fail\n", bp); data->bps_bo[i] = bo; data->last_reserved = i + 1; @@ -1739,7 +1749,7 @@ free: kfree(*data); con->eh_data = NULL; out: - DRM_WARN("Failed to initialize ras recovery!\n"); + dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); return ret; } @@ -1801,18 +1811,18 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, return; if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { - DRM_INFO("HBM ECC is active.\n"); + dev_info(adev->dev, "HBM ECC is active.\n"); *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); } else - DRM_INFO("HBM ECC is not presented.\n"); + dev_info(adev->dev, "HBM ECC is not presented.\n"); if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { - DRM_INFO("SRAM ECC is active.\n"); + dev_info(adev->dev, "SRAM ECC is active.\n"); *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); } else - DRM_INFO("SRAM ECC is not presented.\n"); + dev_info(adev->dev, "SRAM ECC is not presented.\n"); /* hw_supported needs to be aligned with RAS block mask. */ *hw_supported &= AMDGPU_RAS_BLOCK_MASK; @@ -1869,7 +1879,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) if (amdgpu_ras_fs_init(adev)) goto fs_out; - DRM_INFO("RAS INFO: ras initialized successfully, " + dev_info(adev->dev, "RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", con->hw_supported, con->supported); return 0; @@ -2055,7 +2065,8 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) return; if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { - DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n"); + dev_info(adev->dev, "uncorrectable hardware error" + "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); amdgpu_ras_reset_gpu(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 8ffa015bc1dc..af1b1ccf613c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -110,7 +110,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, * even NOMEM error is encountered */ if(!err_data->err_addr) - DRM_WARN("Failed to alloc memory for umc error address record!\n"); + dev_warn(adev->dev, "Failed to alloc memory for " + "umc error address record!\n"); /* umc query_ras_error_address is also responsible for clearing * error status @@ -120,13 +121,14 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, /* only uncorrectable error needs gpu reset */ if (err_data->ue_count) { - dev_info(adev->dev, "%ld uncorrectable errors detected in UMC block\n", - err_data->ue_count); + dev_info(adev->dev, "%ld uncorrectable hardware errors " + "detected in UMC block\n", + err_data->ue_count); if (err_data->err_addr_cnt && amdgpu_ras_add_bad_pages(adev, err_data->err_addr, err_data->err_addr_cnt)) - DRM_WARN("Failed to add ras bad page!\n"); + dev_warn(adev->dev, "Failed to add ras bad page!\n"); amdgpu_ras_reset_gpu(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 72bdd06ed508..e629156173d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -323,14 +323,20 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device obj->err_data.ce_count += err_data.ce_count; if (err_data.ce_count) - DRM_INFO("%ld correctable errors detected in %s block\n", - obj->err_data.ce_count, adev->nbio.ras_if->name); + dev_info(adev->dev, "%ld correctable hardware " + "errors detected in %s block, " + "no user action is needed.\n", + obj->err_data.ce_count, + adev->nbio.ras_if->name); if (err_data.ue_count) - DRM_INFO("%ld uncorrectable errors detected in %s block\n", - obj->err_data.ue_count, adev->nbio.ras_if->name); + dev_info(adev->dev, "%ld uncorrectable hardware " + "errors detected in %s block\n", + obj->err_data.ue_count, + adev->nbio.ras_if->name); - DRM_WARN("RAS controller interrupt triggered by NBIF error\n"); + dev_info(adev->dev, "RAS controller interrupt triggered " + "by NBIF error\n"); /* ras_controller_int is dedicated for nbif ras error, * not the global interrupt for sync flood -- cgit From 12c17b9d62663c14a5343d6742682b3e67280754 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Thu, 16 Apr 2020 23:41:07 +0800 Subject: drm/amdgpu: fix kernel page fault issue by ras recovery on sGPU When running ras uncorrectable error injection and triggering GPU reset on sGPU, below issue is observed. It's caused by the list uninitialized when accessing. [ 80.047227] BUG: unable to handle page fault for address: ffffffffc0f4f750 [ 80.047300] #PF: supervisor write access in kernel mode [ 80.047351] #PF: error_code(0x0003) - permissions violation [ 80.047404] PGD 12c20e067 P4D 12c20e067 PUD 12c210067 PMD 41c4ee067 PTE 404316061 [ 80.047477] Oops: 0003 [#1] SMP PTI [ 80.047516] CPU: 7 PID: 377 Comm: kworker/7:2 Tainted: G OE 5.4.0-rc7-guchchen #1 [ 80.047594] Hardware name: System manufacturer System Product Name/TUF Z370-PLUS GAMING II, BIOS 0411 09/21/2018 [ 80.047888] Workqueue: events amdgpu_ras_do_recovery [amdgpu] Signed-off-by: Guchun Chen Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 73ae913aee26..68b82f7b0b80 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1453,9 +1453,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false); /* Build list of devices to query RAS related errors */ - if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { + if (hive && adev->gmc.xgmi.num_physical_nodes > 1) device_list_handle = &hive->device_list; - } else { + else { + INIT_LIST_HEAD(&device_list); list_add_tail(&adev->gmc.xgmi.head, &device_list); device_list_handle = &device_list; } -- cgit From a891d239f9e036031f9f1c62fe584232662cb7f1 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Wed, 22 Apr 2020 12:22:54 +0800 Subject: drm/amdgpu: set error query ready after all IPs late init If set error query ready in amdgpu_ras_late_init, which will cause some IP blocks aren't initialized, but their error query is ready. Signed-off-by: Dennis Li Reviewed-by: Guchun Chen Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1e4527c64279..f9b315e7e004 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2216,6 +2216,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) adev->ip_blocks[i].status.late_initialized = true; } + amdgpu_ras_set_error_query_ready(adev, true); + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 68b82f7b0b80..8b14aee370c8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1921,10 +1921,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, } /* in resume phase, no need to create ras fs node */ - if (adev->in_suspend || adev->in_gpu_reset) { - amdgpu_ras_set_error_query_ready(adev, true); + if (adev->in_suspend || adev->in_gpu_reset) return 0; - } if (ih_info->cb) { r = amdgpu_ras_interrupt_add_handler(adev, ih_info); @@ -1936,8 +1934,6 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, if (r) goto sysfs; - amdgpu_ras_set_error_query_ready(adev, true); - return 0; cleanup: amdgpu_ras_sysfs_remove(adev, ras_block); -- cgit