From 6492e1b07c03397f85bd6dc0e230ea6cd9394635 Mon Sep 17 00:00:00 2001 From: yipechai Date: Tue, 4 Jan 2022 13:35:37 +0800 Subject: drm/amdgpu: Unify ras block interface for each ras block 1. Define unified ops interface for each block. 2. Add ras_block_match function pointer in ops interface, each ras block can customize specail match function to identify itself. 3. Add amdgpu_ras_block_match_default new function. If a ras block doesn't define .ras_block_match, default execute amdgpu_ras_block_match_default to identify this ras block. 4. Define unified basic ras block data for each ras block. 5. Create dedicated amdgpu device ras block link list to manage all of the ras blocks. 6. Add amdgpu_ras_register_ras_block new function interface for each ras block to register itself to ras controlling block. Signed-off-by: yipechai Reviewed-by: Hawking Zhang Reviewed-by: John Clements Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 1c708122d492..f66122fdf477 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -484,6 +484,33 @@ struct ras_debug_if { }; int op; }; + +struct amdgpu_ras_block_object { + /* block name */ + char name[32]; + + enum amdgpu_ras_block block; + + uint32_t sub_block_index; + + /* ras block link */ + struct list_head node; + + int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index); + int (*ras_late_init)(struct amdgpu_device *adev, void *ras_info); + void (*ras_fini)(struct amdgpu_device *adev); + const struct amdgpu_ras_block_hw_ops *hw_ops; +}; + +struct amdgpu_ras_block_hw_ops { + int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if); + void (*query_ras_error_count)(struct amdgpu_device *adev,void *ras_error_status); + void (*query_ras_error_status)(struct amdgpu_device *adev); + void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status); + void (*reset_ras_error_count)(struct amdgpu_device *adev); + void (*reset_ras_error_status)(struct amdgpu_device *adev); +}; + /* work flow * vbios * 1: ras feature enable (enabled by default) @@ -667,4 +694,5 @@ const char *get_ras_block_str(struct ras_common_if *ras_block); bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev); +int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object* ras_block_obj); #endif -- cgit From 7cab2124058d2f5f048f435a4631e176dcd1430d Mon Sep 17 00:00:00 2001 From: yipechai Date: Tue, 4 Jan 2022 13:39:15 +0800 Subject: drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h. v2: squash in forward declaration warning fix (Alex) Signed-off-by: yipechai Reviewed-by: Hawking Zhang Reviewed-by: John Clements Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 39 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 34 +++++++++------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +- 4 files changed, 51 insertions(+), 25 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index c349337939dd..445a0d077c1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -108,6 +108,7 @@ #include "amdgpu_smuio.h" #include "amdgpu_fdinfo.h" #include "amdgpu_mca.h" +#include "amdgpu_ras.h" #define MAX_GPU_INSTANCE 16 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index bf2983fe5d9d..5b3f4beb2149 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2810,6 +2810,45 @@ static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) } } #endif + +struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev) +{ + if (!adev) + return NULL; + + return adev->psp.ras_context.ras; +} + +int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras* ras_con) +{ + if (!adev) + return -EINVAL;; + + adev->psp.ras_context.ras = ras_con; + return 0; +} + +/* check if ras is supported on block, say, sdma, gfx */ +int amdgpu_ras_is_supported(struct amdgpu_device *adev, + unsigned int block) +{ + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + if (block >= AMDGPU_RAS_BLOCK_COUNT) + return 0; + return ras && (adev->ras_enabled & (1 << block)); +} + +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) +{ + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) + schedule_work(&ras->recovery_work); + return 0; +} + + /* Register each ip ras block into amdgpu ras */ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object* ras_block_obj) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index f66122fdf477..7a4d82378205 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -26,11 +26,11 @@ #include #include -#include "amdgpu.h" -#include "amdgpu_psp.h" #include "ta_ras_if.h" #include "amdgpu_ras_eeprom.h" +struct amdgpu_iv_entry; + #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) enum amdgpu_ras_block { @@ -525,19 +525,6 @@ struct amdgpu_ras_block_hw_ops { * 8: feature disable */ -#define amdgpu_ras_get_context(adev) ((adev)->psp.ras_context.ras) -#define amdgpu_ras_set_context(adev, ras_con) ((adev)->psp.ras_context.ras = (ras_con)) - -/* check if ras is supported on block, say, sdma, gfx */ -static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, - unsigned int block) -{ - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - - if (block >= AMDGPU_RAS_BLOCK_COUNT) - return 0; - return ras && (adev->ras_enabled & (1 << block)); -} int amdgpu_ras_recovery_init(struct amdgpu_device *adev); @@ -554,15 +541,6 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev); -static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) -{ - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - - if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) - schedule_work(&ras->recovery_work); - return 0; -} - static inline enum ta_ras_block amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) { switch (block) { @@ -694,5 +672,13 @@ const char *get_ras_block_str(struct ras_common_if *ras_block); bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev); +int amdgpu_ras_is_supported(struct amdgpu_device *adev, unsigned int block); + +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev); + +struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); + +int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras* ras_con); + int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object* ras_block_obj); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 25951b2d83b6..e81ce465ff3a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -21,7 +21,7 @@ * */ -#include "amdgpu_ras.h" +#include "amdgpu.h" static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, void *ras_error_status, -- cgit From b6efdb02d23ef615464cd0077c211b40a1faca26 Mon Sep 17 00:00:00 2001 From: yipechai Date: Fri, 14 Jan 2022 10:24:30 +0800 Subject: drm/amdgpu: Fix the code style warnings in amdgpu_ras MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the code style warnings in amdgpu_ras: 1. ERROR: space required before the open parenthesis '('. 2. WARNING: line length of xxx exceeds 100 columns. 3. ERROR: "foo* bar" should be "foo *bar". 4. WARNING: unnecessary whitespace before a quoted newline. 5. WARNING: space prohibited before semicolon. 6. WARNING: suspect code indent for conditional statements. 7. WARNING: braces {} are not necessary for single statement blocks. Signed-off-by: yipechai Reviewed-by: Tao Zhou Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 41 +++++++++++++++++++-------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 ++++---- 2 files changed, 30 insertions(+), 21 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3538032e40d5..7a1d2bac698e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -872,7 +872,7 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj, enum amdgpu_ras_block block) { - if(!block_obj) + if (!block_obj) return -EINVAL; if (block_obj->block == block) @@ -881,7 +881,7 @@ static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_ return -EINVAL; } -static struct amdgpu_ras_block_object* amdgpu_ras_get_ras_block(struct amdgpu_device *adev, +static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev, enum amdgpu_ras_block block, uint32_t sub_block_index) { struct amdgpu_ras_block_object *obj, *tmp; @@ -941,7 +941,7 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) { - struct amdgpu_ras_block_object* block_obj = NULL; + struct amdgpu_ras_block_object *block_obj = NULL; struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); struct ras_err_data err_data = {0, 0, 0, NULL}; @@ -953,7 +953,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, } else { block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); if (!block_obj || !block_obj->hw_ops) { - dev_info(adev->dev, "%s doesn't config ras function \n", + dev_info(adev->dev, "%s doesn't config ras function.\n", get_ras_block_str(&info->head)); return -EINVAL; } @@ -1023,13 +1023,14 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, enum amdgpu_ras_block block) { - struct amdgpu_ras_block_object* block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); if (!amdgpu_ras_is_supported(adev, block)) return -EINVAL; if (!block_obj || !block_obj->hw_ops) { - dev_info(adev->dev, "%s doesn't config ras function \n", ras_block_str(block)); + dev_info(adev->dev, "%s doesn't config ras function.\n", + ras_block_str(block)); return -EINVAL; } @@ -1066,7 +1067,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, return -EINVAL; if (!block_obj || !block_obj->hw_ops) { - dev_info(adev->dev, "%s doesn't config ras function \n", get_ras_block_str(&info->head)); + dev_info(adev->dev, "%s doesn't config ras function.\n", + get_ras_block_str(&info->head)); return -EINVAL; } @@ -1702,19 +1704,25 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, struct ras_query_if *info) { - struct amdgpu_ras_block_object* block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, info->head.sub_block_index); + struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, + info->head.block, + info->head.sub_block_index); /* * Only two block need to query read/write * RspStatus at current state */ if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) && (info->head.block != AMDGPU_RAS_BLOCK__MMHUB)) - return ; + return; + + block_obj = amdgpu_ras_get_ras_block(adev, + info->head.block, + info->head.sub_block_index); - block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, info->head.sub_block_index); if (!block_obj || !block_obj->hw_ops) { - dev_info(adev->dev, "%s doesn't config ras function \n", get_ras_block_str(&info->head)); - return ; + dev_info(adev->dev, "%s doesn't config ras function.\n", + get_ras_block_str(&info->head)); + return; } if (block_obj->hw_ops->query_ras_error_status) @@ -2715,7 +2723,7 @@ static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) } #endif -struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev) +struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev) { if (!adev) return NULL; @@ -2723,7 +2731,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev) return adev->psp.ras_context.ras; } -int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras* ras_con) +int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con) { if (!adev) return -EINVAL; @@ -2755,7 +2763,7 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) /* Register each ip ras block into amdgpu ras */ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, - struct amdgpu_ras_block_object* ras_block_obj) + struct amdgpu_ras_block_object *ras_block_obj) { struct amdgpu_ras_block_object *obj, *tmp; if (!adev || !ras_block_obj) @@ -2766,9 +2774,8 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, /* If the ras object is in ras_list, don't add it again */ list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) { - if (obj == ras_block_obj) { + if (obj == ras_block_obj) return 0; - } } INIT_LIST_HEAD(&ras_block_obj->node); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 7a4d82378205..a51a281bd91a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -496,7 +496,8 @@ struct amdgpu_ras_block_object { /* ras block link */ struct list_head node; - int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index); + int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, + enum amdgpu_ras_block block, uint32_t sub_block_index); int (*ras_late_init)(struct amdgpu_device *adev, void *ras_info); void (*ras_fini)(struct amdgpu_device *adev); const struct amdgpu_ras_block_hw_ops *hw_ops; @@ -504,7 +505,7 @@ struct amdgpu_ras_block_object { struct amdgpu_ras_block_hw_ops { int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if); - void (*query_ras_error_count)(struct amdgpu_device *adev,void *ras_error_status); + void (*query_ras_error_count)(struct amdgpu_device *adev, void *ras_error_status); void (*query_ras_error_status)(struct amdgpu_device *adev); void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status); void (*reset_ras_error_count)(struct amdgpu_device *adev); @@ -678,7 +679,8 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev); struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); -int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras* ras_con); +int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); -int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object* ras_block_obj); +int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, + struct amdgpu_ras_block_object *ras_block_obj); #endif -- cgit From d5e8ff5f7b2a41d503914d4896ed3c6b3befe933 Mon Sep 17 00:00:00 2001 From: yipechai Date: Sat, 29 Jan 2022 17:09:08 +0800 Subject: drm/amdgpu: Fixed the defect of soft lock caused by infinite loop 1. The infinite loop case only occurs on multiple cards support ras functions. 2. The explanation of root cause refer to commit 76641cbbf196 ("drm/amdgpu: Add judgement to avoid infinite loop"). 3. Create new node to manage each unique ras instance to guarantee each device .ras_list is completely independent. 4. Fixes: commit 7a6b8ab3231b51 ("drm/amdgpu: Unify ras block interface for each ras block"). 5. The soft locked logs are as follows: [ 262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G OE 5.13.0-27-generic #29~20.04.1-Ubuntu [ 262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, BIOS T20200717143848 07/17/2020 [ 262.165698] Workqueue: events amdgpu_ras_do_recovery [amdgpu] [ 262.165980] RIP: 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu] [ 262.166239] Code: 68 d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 89 e6 4c 89 ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 4d 89 f5 48 83 e8 28 48 39 d3 74 25 49 89 c6 49 8b 45 [ 262.166243] RSP: 0018:ffffac908fa87d80 EFLAGS: 00000202 [ 262.166247] RAX: ffffffffc1394248 RBX: ffff91e4ab8d6e20 RCX: ffffffffc1394248 [ 262.166249] RDX: ffff91e4aa356e20 RSI: 000000000000000e RDI: ffff91e4ab8c0000 [ 262.166252] RBP: ffffac908fa87da8 R08: 0000000000000007 R09: 0000000000000001 [ 262.166254] R10: ffff91e4930b64ec R11: 0000000000000000 R12: 000000000000000e [ 262.166256] R13: ffff91e4aa356df8 R14: ffffffffc1394320 R15: 0000000000000003 [ 262.166258] FS: 0000000000000000(0000) GS:ffff92238fb40000(0000) knlGS:0000000000000000 [ 262.166261] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 262.166264] CR2: 00000001004865d0 CR3: 000000406d796000 CR4: 0000000000350ee0 [ 262.166267] Call Trace: [ 262.166272] amdgpu_ras_do_recovery+0x130/0x290 [amdgpu] [ 262.166529] ? psi_task_switch+0xd2/0x250 [ 262.166537] ? __switch_to+0x11d/0x460 [ 262.166542] ? __switch_to_asm+0x36/0x70 [ 262.166549] process_one_work+0x220/0x3c0 [ 262.166556] worker_thread+0x4d/0x3f0 [ 262.166560] ? process_one_work+0x3c0/0x3c0 [ 262.166563] kthread+0x12b/0x150 [ 262.166568] ? set_kthread_struct+0x40/0x40 [ 262.166571] ret_from_fork+0x22/0x30 Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 35 +++++++++++++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 --- 2 files changed, 31 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e440a5268ace..d6608ae3c127 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = { "mca_iohc", }; +struct amdgpu_ras_block_list { + /* ras block link */ + struct list_head node; + + struct amdgpu_ras_block_object *ras_obj; +}; + const char *get_ras_block_str(struct ras_common_if *ras_block) { if (!ras_block) @@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_de enum amdgpu_ras_block block, uint32_t sub_block_index) { int loop_cnt = 0; - struct amdgpu_ras_block_object *obj, *tmp; + struct amdgpu_ras_block_list *node, *tmp; + struct amdgpu_ras_block_object *obj; if (block >= AMDGPU_RAS_BLOCK__LAST) return NULL; @@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_de if (!amdgpu_ras_is_supported(adev, block)) return NULL; - list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) { + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { + if (!node->ras_obj) { + dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); + continue; + } + + obj = node->ras_obj; if (obj->ras_block_match) { if (obj->ras_block_match(obj, block, sub_block_index) == 0) return obj; @@ -2527,6 +2541,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev) int amdgpu_ras_fini(struct amdgpu_device *adev) { + struct amdgpu_ras_block_list *ras_node, *tmp; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); if (!adev->ras_enabled || !con) @@ -2545,6 +2560,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) amdgpu_ras_set_context(adev, NULL); kfree(con); + /* Clear ras blocks from ras_list and free ras block list node */ + list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { + list_del(&ras_node->node); + kfree(ras_node); + } + return 0; } @@ -2754,14 +2775,20 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object *ras_block_obj) { + struct amdgpu_ras_block_list *ras_node; if (!adev || !ras_block_obj) return -EINVAL; if (!amdgpu_ras_asic_supported(adev)) return 0; - INIT_LIST_HEAD(&ras_block_obj->node); - list_add_tail(&ras_block_obj->node, &adev->ras_list); + ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL); + if (!ras_node) + return -ENOMEM; + + INIT_LIST_HEAD(&ras_node->node); + ras_node->ras_obj = ras_block_obj; + list_add_tail(&ras_node->node, &adev->ras_list); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index a51a281bd91a..a55743b12d57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -493,9 +493,6 @@ struct amdgpu_ras_block_object { uint32_t sub_block_index; - /* ras block link */ - struct list_head node; - int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index); int (*ras_late_init)(struct amdgpu_device *adev, void *ras_info); -- cgit From bdb3489cfca16815e9a737359e9e90a4af5d0ff3 Mon Sep 17 00:00:00 2001 From: yipechai Date: Sun, 30 Jan 2022 17:03:32 +0800 Subject: drm/amdgpu: Optimize xxx_ras_late_init/xxx_ras_late_fini for each ras block 1. Define amdgpu_ras_block_late_init to create sysfs nodes and interrupt handles. 2. Define amdgpu_ras_block_late_fini to remove sysfs nodes and interrupt handles. 3. Replace ras block variable members in struct amdgpu_ras_block_object with struct ras_common_if, which can make it easy to associate each ras block instance with each ras block functional interface. 4. Add .ras_cb to struct amdgpu_ras_block_object. 5. Change each ras block to fit for the changement of struct amdgpu_ras_block_object. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 7 ++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 35 +++++++++++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 ++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/mca_v3_0.c | 28 +++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 4 ++-- 11 files changed, 86 insertions(+), 37 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 52a60c2316a2..ad057d6b2c77 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -83,14 +83,15 @@ int amdgpu_mca_ras_late_init(struct amdgpu_device *adev, .sysfs_name = sysfs_name, }; - snprintf(sysfs_name, sizeof(sysfs_name), "%s_err_count", mca_dev->ras->ras_block.name); + snprintf(sysfs_name, sizeof(sysfs_name), "%s_err_count", + mca_dev->ras->ras_block.ras_comm.name); if (!mca_dev->ras_if) { mca_dev->ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); if (!mca_dev->ras_if) return -ENOMEM; - mca_dev->ras_if->block = mca_dev->ras->ras_block.block; - mca_dev->ras_if->sub_block_index = mca_dev->ras->ras_block.sub_block_index; + mca_dev->ras_if->block = mca_dev->ras->ras_block.ras_comm.block; + mca_dev->ras_if->sub_block_index = mca_dev->ras->ras_block.ras_comm.sub_block_index; mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; } ih_info.head = fs_info.head = *mca_dev->ras_if; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a9c133a09be5..877b0e023648 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -877,7 +877,7 @@ static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_ if (!block_obj) return -EINVAL; - if (block_obj->block == block) + if (block_obj->ras_comm.block == block) return 0; return -EINVAL; @@ -2457,6 +2457,23 @@ interrupt: return r; } +int amdgpu_ras_block_late_init(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + char sysfs_name[32]; + struct ras_ih_if ih_info; + struct ras_fs_if fs_info; + struct amdgpu_ras_block_object *obj; + + obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); + ih_info.cb = obj->ras_cb; + ih_info.head = *ras_block; + snprintf(sysfs_name, sizeof(sysfs_name), "%s_err_count", ras_block->name); + fs_info.sysfs_name = (const char *)sysfs_name; + fs_info.head = *ras_block; + return amdgpu_ras_late_init(adev, ras_block, &fs_info, &ih_info); +} + /* helper function to remove ras fs node and interrupt handler */ void amdgpu_ras_late_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block, @@ -2470,6 +2487,22 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev, amdgpu_ras_interrupt_remove_handler(adev, ih_info); } +void amdgpu_ras_block_late_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + struct ras_ih_if ih_info; + struct amdgpu_ras_block_object *obj; + + if (!ras_block) + return; + + obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); + ih_info.head = *ras_block; + ih_info.cb = obj->ras_cb; + + amdgpu_ras_late_fini(adev, ras_block, &ih_info); +} + /* do some init work after IP late init as dependence. * and it runs in resume/gpu reset/booting up cases. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index a55743b12d57..8b94b556baf6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -486,17 +486,13 @@ struct ras_debug_if { }; struct amdgpu_ras_block_object { - /* block name */ - char name[32]; - - enum amdgpu_ras_block block; - - uint32_t sub_block_index; + struct ras_common_if ras_comm; int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index); int (*ras_late_init)(struct amdgpu_device *adev, void *ras_info); void (*ras_fini)(struct amdgpu_device *adev); + ras_ih_cb ras_cb; const struct amdgpu_ras_block_hw_ops *hw_ops; }; @@ -605,10 +601,17 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block, struct ras_fs_if *fs_info, struct ras_ih_if *ih_info); + +int amdgpu_ras_block_late_init(struct amdgpu_device *adev, + struct ras_common_if *ras_block); + void amdgpu_ras_late_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block, struct ras_ih_if *ih_info); +void amdgpu_ras_block_late_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block); + int amdgpu_ras_feature_enable(struct amdgpu_device *adev, struct ras_common_if *head, bool enable); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 5929d6f528c9..15707af89212 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -981,8 +981,10 @@ struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { struct amdgpu_xgmi_ras xgmi_ras = { .ras_block = { - .name = "xgmi", - .block = AMDGPU_RAS_BLOCK__XGMI_WAFL, + .ras_comm = { + .name = "xgmi", + .block = AMDGPU_RAS_BLOCK__XGMI_WAFL, + }, .hw_ops = &xgmi_ras_hw_ops, .ras_late_init = amdgpu_xgmi_ras_late_init, .ras_fini = amdgpu_xgmi_ras_fini, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 744253be5142..1405f8145789 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2195,8 +2195,8 @@ static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) return err; } - strcpy(adev->gfx.ras->ras_block.name,"gfx"); - adev->gfx.ras->ras_block.block = AMDGPU_RAS_BLOCK__GFX; + strcpy(adev->gfx.ras->ras_block.ras_comm.name, "gfx"); + adev->gfx.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__GFX; /* If not define special ras_late_init function, use gfx default ras_late_init */ if (!adev->gfx.ras->ras_block.ras_late_init) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index c64e3a391c99..80f6a29ed30c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -672,8 +672,8 @@ static void gmc_v10_0_set_umc_funcs(struct amdgpu_device *adev) if (adev->umc.ras) { amdgpu_ras_register_ras_block(adev, &adev->umc.ras->ras_block); - strcpy(adev->umc.ras->ras_block.name, "umc"); - adev->umc.ras->ras_block.block = AMDGPU_RAS_BLOCK__UMC; + strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc"); + adev->umc.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC; /* If don't define special ras_late_init function, use default ras_late_init */ if (!adev->umc.ras->ras_block.ras_late_init) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 4595027a8c63..af873c99d5e4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1232,8 +1232,8 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev) if (adev->umc.ras) { amdgpu_ras_register_ras_block(adev, &adev->umc.ras->ras_block); - strcpy(adev->umc.ras->ras_block.name, "umc"); - adev->umc.ras->ras_block.block = AMDGPU_RAS_BLOCK__UMC; + strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc"); + adev->umc.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC; /* If don't define special ras_late_init function, use default ras_late_init */ if (!adev->umc.ras->ras_block.ras_late_init) @@ -1280,8 +1280,8 @@ static void gmc_v9_0_set_mmhub_ras_funcs(struct amdgpu_device *adev) if (adev->mmhub.ras) { amdgpu_ras_register_ras_block(adev, &adev->mmhub.ras->ras_block); - strcpy(adev->mmhub.ras->ras_block.name,"mmhub"); - adev->mmhub.ras->ras_block.block = AMDGPU_RAS_BLOCK__MMHUB; + strcpy(adev->mmhub.ras->ras_block.ras_comm.name, "mmhub"); + adev->mmhub.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MMHUB; /* If don't define special ras_late_init function, use default ras_late_init */ if (!adev->mmhub.ras->ras_block.ras_late_init) diff --git a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c index 6b41fcbf4875..503c292b321e 100644 --- a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c @@ -157,8 +157,10 @@ struct amdgpu_ras_block_hw_ops hdp_v4_0_ras_hw_ops = { struct amdgpu_hdp_ras hdp_v4_0_ras = { .ras_block = { - .name = "hdp", - .block = AMDGPU_RAS_BLOCK__HDP, + .ras_comm = { + .name = "hdp", + .block = AMDGPU_RAS_BLOCK__HDP, + }, .hw_ops = &hdp_v4_0_ras_hw_ops, .ras_late_init = amdgpu_hdp_ras_late_init, .ras_fini = amdgpu_hdp_ras_fini, diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c index 68565262af9c..386416378a82 100644 --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c @@ -53,8 +53,8 @@ static int mca_v3_0_ras_block_match(struct amdgpu_ras_block_object *block_obj, if (!block_obj) return -EINVAL; - if ((block_obj->block == block) && - (block_obj->sub_block_index == sub_block_index)) { + if ((block_obj->ras_comm.block == block) && + (block_obj->ras_comm.sub_block_index == sub_block_index)) { return 0; } @@ -68,9 +68,11 @@ const struct amdgpu_ras_block_hw_ops mca_v3_0_mp0_hw_ops = { struct amdgpu_mca_ras_block mca_v3_0_mp0_ras = { .ras_block = { - .block = AMDGPU_RAS_BLOCK__MCA, - .sub_block_index = AMDGPU_RAS_MCA_BLOCK__MP0, - .name = "mp0", + .ras_comm = { + .block = AMDGPU_RAS_BLOCK__MCA, + .sub_block_index = AMDGPU_RAS_MCA_BLOCK__MP0, + .name = "mp0", + }, .hw_ops = &mca_v3_0_mp0_hw_ops, .ras_block_match = mca_v3_0_ras_block_match, .ras_late_init = mca_v3_0_mp0_ras_late_init, @@ -103,9 +105,11 @@ const struct amdgpu_ras_block_hw_ops mca_v3_0_mp1_hw_ops = { struct amdgpu_mca_ras_block mca_v3_0_mp1_ras = { .ras_block = { - .block = AMDGPU_RAS_BLOCK__MCA, - .sub_block_index = AMDGPU_RAS_MCA_BLOCK__MP1, - .name = "mp1", + .ras_comm = { + .block = AMDGPU_RAS_BLOCK__MCA, + .sub_block_index = AMDGPU_RAS_MCA_BLOCK__MP1, + .name = "mp1", + }, .hw_ops = &mca_v3_0_mp1_hw_ops, .ras_block_match = mca_v3_0_ras_block_match, .ras_late_init = mca_v3_0_mp1_ras_late_init, @@ -138,9 +142,11 @@ const struct amdgpu_ras_block_hw_ops mca_v3_0_mpio_hw_ops = { struct amdgpu_mca_ras_block mca_v3_0_mpio_ras = { .ras_block = { - .block = AMDGPU_RAS_BLOCK__MCA, - .sub_block_index = AMDGPU_RAS_MCA_BLOCK__MPIO, - .name = "mpio", + .ras_comm = { + .block = AMDGPU_RAS_BLOCK__MCA, + .sub_block_index = AMDGPU_RAS_MCA_BLOCK__MPIO, + .name = "mpio", + }, .hw_ops = &mca_v3_0_mpio_hw_ops, .ras_block_match = mca_v3_0_ras_block_match, .ras_late_init = mca_v3_0_mpio_ras_late_init, diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 39974b449341..c7cca87f1647 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -664,8 +664,10 @@ const struct amdgpu_ras_block_hw_ops nbio_v7_4_ras_hw_ops = { struct amdgpu_nbio_ras nbio_v7_4_ras = { .ras_block = { - .name = "pcie_bif", - .block = AMDGPU_RAS_BLOCK__PCIE_BIF, + .ras_comm = { + .name = "pcie_bif", + .block = AMDGPU_RAS_BLOCK__PCIE_BIF, + }, .hw_ops = &nbio_v7_4_ras_hw_ops, .ras_late_init = amdgpu_nbio_ras_late_init, .ras_fini = amdgpu_nbio_ras_fini, diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 02115d63b071..f07cb6a256c8 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -2822,8 +2822,8 @@ static void sdma_v4_0_set_ras_funcs(struct amdgpu_device *adev) if (adev->sdma.ras) { amdgpu_ras_register_ras_block(adev, &adev->sdma.ras->ras_block); - strcpy(adev->sdma.ras->ras_block.name, "sdma"); - adev->sdma.ras->ras_block.block = AMDGPU_RAS_BLOCK__SDMA; + strcpy(adev->sdma.ras->ras_block.ras_comm.name, "sdma"); + adev->sdma.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__SDMA; /* If don't define special ras_late_init function, use default ras_late_init */ if (!adev->sdma.ras->ras_block.ras_late_init) -- cgit From 9252d33df597a60416f3718b9b41457657c8540c Mon Sep 17 00:00:00 2001 From: yipechai Date: Wed, 9 Feb 2022 11:05:27 +0800 Subject: drm/amdgpu: Optimize operating sysfs and interrupt function interface in amdgpu_ras.c In order to reduce redundant struct conversion, modify operating sysfs and interrupt function interface parameters. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++++++++++++++++----------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 6 +++--- 2 files changed, 21 insertions(+), 22 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 75f5fda357ac..9fb944cbc755 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1276,18 +1276,17 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) } int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, - struct ras_fs_if *head) + struct ras_common_if *head) { - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); if (!obj || obj->attr_inuse) return -EINVAL; get_obj(obj); - memcpy(obj->fs_data.sysfs_name, - head->sysfs_name, - sizeof(obj->fs_data.sysfs_name)); + snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), + "%s_err_count", head->name); obj->sysfs_attr = (struct device_attribute){ .attr = { @@ -1594,9 +1593,9 @@ int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, } int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, - struct ras_ih_if *info) + struct ras_common_if *head) { - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); struct ras_ih_data *data; if (!obj) @@ -1616,24 +1615,27 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, } int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, - struct ras_ih_if *info) + struct ras_common_if *head) { - struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); struct ras_ih_data *data; + struct amdgpu_ras_block_object *ras_obj; if (!obj) { /* in case we registe the IH before enable ras feature */ - obj = amdgpu_ras_create_obj(adev, &info->head); + obj = amdgpu_ras_create_obj(adev, head); if (!obj) return -EINVAL; } else get_obj(obj); + ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm); + data = &obj->ih_data; /* add the callback.etc */ *data = (struct ras_ih_data) { .inuse = 0, - .cb = info->cb, + .cb = ras_obj->ras_cb, .element_size = sizeof(struct amdgpu_iv_entry), .rptr = 0, .wptr = 0, @@ -1662,10 +1664,7 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) struct ras_manager *obj, *tmp; list_for_each_entry_safe(obj, tmp, &con->head, node) { - struct ras_ih_if info = { - .head = obj->head, - }; - amdgpu_ras_interrupt_remove_handler(adev, &info); + amdgpu_ras_interrupt_remove_handler(adev, &obj->head); } return 0; @@ -2431,12 +2430,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, return 0; if (ih_info->cb) { - r = amdgpu_ras_interrupt_add_handler(adev, ih_info); + r = amdgpu_ras_interrupt_add_handler(adev, ras_block); if (r) goto interrupt; } - r = amdgpu_ras_sysfs_create(adev, fs_info); + r = amdgpu_ras_sysfs_create(adev, ras_block); if (r) goto sysfs; @@ -2452,7 +2451,7 @@ cleanup: amdgpu_ras_sysfs_remove(adev, ras_block); sysfs: if (ih_info->cb) - amdgpu_ras_interrupt_remove_handler(adev, ih_info); + amdgpu_ras_interrupt_remove_handler(adev, ras_block); interrupt: amdgpu_ras_feature_enable(adev, ras_block, 0); return r; @@ -2485,7 +2484,7 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev, amdgpu_ras_sysfs_remove(adev, ras_block); if (ih_info->cb) - amdgpu_ras_interrupt_remove_handler(adev, ih_info); + amdgpu_ras_interrupt_remove_handler(adev, &ih_info->head); } void amdgpu_ras_block_late_fini(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 8b94b556baf6..ae8741ac526f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -619,7 +619,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, struct ras_common_if *head, bool enable); int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, - struct ras_fs_if *head); + struct ras_common_if *head); int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, struct ras_common_if *head); @@ -636,10 +636,10 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, struct ras_inject_if *info); int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, - struct ras_ih_if *info); + struct ras_common_if *head); int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, - struct ras_ih_if *info); + struct ras_common_if *head); int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, struct ras_dispatch_if *info); -- cgit From 563285c85ecaa1fcecf304dabf87cbeee1ddbc3f Mon Sep 17 00:00:00 2001 From: yipechai Date: Wed, 9 Feb 2022 11:15:29 +0800 Subject: drm/amdgpu: Merge amdgpu_ras_late_init/amdgpu_ras_late_fini to amdgpu_ras_block_late_init/amdgpu_ras_block_late_fini 1. Merge amdgpu_ras_late_init to amdgpu_ras_block_late_init. 2. Remove amdgpu_ras_late_init since no ras block calls amdgpu_ras_late_init. 3. Merge amdgpu_ras_late_fini to amdgpu_ras_block_late_fini. 4. Remove amdgpu_ras_late_fini since no ras block calls amdgpu_ras_late_fini. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 53 +++++++-------------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 8 ----- 2 files changed, 11 insertions(+), 50 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9fb944cbc755..b5cd21cb6e58 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2397,11 +2397,10 @@ bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev) } /* helper function to handle common stuff in ip late init phase */ -int amdgpu_ras_late_init(struct amdgpu_device *adev, - struct ras_common_if *ras_block, - struct ras_fs_if *fs_info, - struct ras_ih_if *ih_info) +int amdgpu_ras_block_late_init(struct amdgpu_device *adev, + struct ras_common_if *ras_block) { + struct amdgpu_ras_block_object *ras_obj; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); unsigned long ue_count, ce_count; int r; @@ -2429,7 +2428,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, if (adev->in_suspend || amdgpu_in_reset(adev)) return 0; - if (ih_info->cb) { + ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); + if (ras_obj->ras_cb) { r = amdgpu_ras_interrupt_add_handler(adev, ras_block); if (r) goto interrupt; @@ -2450,57 +2450,26 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, cleanup: amdgpu_ras_sysfs_remove(adev, ras_block); sysfs: - if (ih_info->cb) + if (ras_obj->ras_cb) amdgpu_ras_interrupt_remove_handler(adev, ras_block); interrupt: amdgpu_ras_feature_enable(adev, ras_block, 0); return r; } -int amdgpu_ras_block_late_init(struct amdgpu_device *adev, - struct ras_common_if *ras_block) -{ - char sysfs_name[32]; - struct ras_ih_if ih_info; - struct ras_fs_if fs_info; - struct amdgpu_ras_block_object *obj; - - obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); - ih_info.cb = obj->ras_cb; - ih_info.head = *ras_block; - snprintf(sysfs_name, sizeof(sysfs_name), "%s_err_count", ras_block->name); - fs_info.sysfs_name = (const char *)sysfs_name; - fs_info.head = *ras_block; - return amdgpu_ras_late_init(adev, ras_block, &fs_info, &ih_info); -} - /* helper function to remove ras fs node and interrupt handler */ -void amdgpu_ras_late_fini(struct amdgpu_device *adev, - struct ras_common_if *ras_block, - struct ras_ih_if *ih_info) -{ - if (!ras_block || !ih_info) - return; - - amdgpu_ras_sysfs_remove(adev, ras_block); - if (ih_info->cb) - amdgpu_ras_interrupt_remove_handler(adev, &ih_info->head); -} - void amdgpu_ras_block_late_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { - struct ras_ih_if ih_info; - struct amdgpu_ras_block_object *obj; - + struct amdgpu_ras_block_object *ras_obj; if (!ras_block) return; - obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); - ih_info.head = *ras_block; - ih_info.cb = obj->ras_cb; + amdgpu_ras_sysfs_remove(adev, ras_block); - amdgpu_ras_late_fini(adev, ras_block, &ih_info); + ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm); + if (ras_obj->ras_cb) + amdgpu_ras_interrupt_remove_handler(adev, ras_block); } /* do some init work after IP late init as dependence. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ae8741ac526f..5de567c6a8f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -597,18 +597,10 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) { int amdgpu_ras_init(struct amdgpu_device *adev); int amdgpu_ras_fini(struct amdgpu_device *adev); int amdgpu_ras_pre_fini(struct amdgpu_device *adev); -int amdgpu_ras_late_init(struct amdgpu_device *adev, - struct ras_common_if *ras_block, - struct ras_fs_if *fs_info, - struct ras_ih_if *ih_info); int amdgpu_ras_block_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -void amdgpu_ras_late_fini(struct amdgpu_device *adev, - struct ras_common_if *ras_block, - struct ras_ih_if *ih_info); - void amdgpu_ras_block_late_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block); -- cgit From 4e9b1fa5a2757d11a5c40eed2b2b4837dcb2f12e Mon Sep 17 00:00:00 2001 From: yipechai Date: Mon, 14 Feb 2022 14:12:55 +0800 Subject: drm/amdgpu: Modify .ras_late_init function pointer parameter Modify .ras_late_init function pointer parameter so that it can remove redundant intermediate calls in some ras blocks. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +- drivers/gpu/drm/amd/amdgpu/mca_v3_0.c | 6 +++--- 15 files changed, 17 insertions(+), 17 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index fe392108b5c2..b7470ed7bc25 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -622,7 +622,7 @@ int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value) return r; } -int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, void *ras_info) +int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { int r; r = amdgpu_ras_block_late_init(adev, adev->gfx.ras_if); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index f99eac544f6d..ccca0a85b982 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -386,7 +386,7 @@ bool amdgpu_gfx_is_me_queue_enabled(struct amdgpu_device *adev, int me, int pipe, int queue); void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable); int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value); -int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, void *ras_info); +int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); void amdgpu_gfx_ras_fini(struct amdgpu_device *adev); int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev, void *err_data, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c index 21a5f884dd2a..70a096160998 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c @@ -24,7 +24,7 @@ #include "amdgpu.h" #include "amdgpu_ras.h" -int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, void *ras_info) +int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { return amdgpu_ras_block_late_init(adev, adev->hdp.ras_if); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h index 4af2c2a322e7..aabd59aa5213 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h @@ -43,6 +43,6 @@ struct amdgpu_hdp { struct amdgpu_hdp_ras *ras; }; -int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, void *ras_info); +int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); void amdgpu_hdp_ras_fini(struct amdgpu_device *adev); #endif /* __AMDGPU_HDP_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c index 2bdb4d8b7955..ede98db8c126 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c @@ -24,7 +24,7 @@ #include "amdgpu.h" #include "amdgpu_ras.h" -int amdgpu_mmhub_ras_late_init(struct amdgpu_device *adev, void *ras_info) +int amdgpu_mmhub_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { return amdgpu_ras_block_late_init(adev, adev->mmhub.ras_if); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h index 7deda9a3b81e..75815106f2d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h @@ -47,7 +47,7 @@ struct amdgpu_mmhub { struct amdgpu_mmhub_ras *ras; }; -int amdgpu_mmhub_ras_late_init(struct amdgpu_device *adev, void *ras_info); +int amdgpu_mmhub_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); void amdgpu_mmhub_ras_fini(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c index 89e61fdd3580..92fd4ffa7779 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c @@ -22,7 +22,7 @@ #include "amdgpu.h" #include "amdgpu_ras.h" -int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, void *ras_info) +int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { int r; r = amdgpu_ras_block_late_init(adev, adev->nbio.ras_if); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h index 4afb76d3cd97..f9546c7341b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h @@ -104,6 +104,6 @@ struct amdgpu_nbio { struct amdgpu_nbio_ras *ras; }; -int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, void *ras_info); +int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); void amdgpu_nbio_ras_fini(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 5de567c6a8f7..837d1b79a9cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -490,7 +490,7 @@ struct amdgpu_ras_block_object { int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index); - int (*ras_late_init)(struct amdgpu_device *adev, void *ras_info); + int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if *ras_block); void (*ras_fini)(struct amdgpu_device *adev); ras_ih_cb ras_cb; const struct amdgpu_ras_block_hw_ops *hw_ops; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 242a7b4dcad9..594454dba4c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -87,7 +87,7 @@ uint64_t amdgpu_sdma_get_csa_mc_addr(struct amdgpu_ring *ring, } int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev, - void *ras_ih_info) + struct ras_common_if *ras_block) { int r, i; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h index eaee12ab6518..8b226ffee32c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h @@ -117,7 +117,7 @@ amdgpu_sdma_get_instance_from_ring(struct amdgpu_ring *ring); int amdgpu_sdma_get_index_from_ring(struct amdgpu_ring *ring, uint32_t *index); uint64_t amdgpu_sdma_get_csa_mc_addr(struct amdgpu_ring *ring, unsigned vmid); int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev, - void *ras_ih_info); + struct ras_common_if *ras_block); void amdgpu_sdma_ras_fini(struct amdgpu_device *adev); int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev, void *err_data, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 9f1406e1a48a..7abf9299e0d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -136,7 +136,7 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true); } -int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, void *ras_info) +int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { int r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index ec15b3640399..e4b3678a6685 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -72,7 +72,7 @@ struct amdgpu_umc { struct amdgpu_umc_ras *ras; }; -int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, void *ras_info); +int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); void amdgpu_umc_ras_fini(struct amdgpu_device *adev); int amdgpu_umc_poison_handler(struct amdgpu_device *adev, void *ras_error_status, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index a785b1e088cd..91f788f6f6b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -732,7 +732,7 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) return psp_xgmi_terminate(&adev->psp); } -static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, void *ras_info) +static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { if (!adev->gmc.xgmi.supported || adev->gmc.xgmi.num_physical_nodes == 0) diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c index c442b34b9472..72ce19acb8bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c @@ -37,7 +37,7 @@ static void mca_v3_0_mp0_query_ras_error_count(struct amdgpu_device *adev, ras_error_status); } -static int mca_v3_0_mp0_ras_late_init(struct amdgpu_device *adev, void *ras_info) +static int mca_v3_0_mp0_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { return amdgpu_mca_ras_late_init(adev, &adev->mca.mp0); } @@ -89,7 +89,7 @@ static void mca_v3_0_mp1_query_ras_error_count(struct amdgpu_device *adev, ras_error_status); } -static int mca_v3_0_mp1_ras_late_init(struct amdgpu_device *adev, void *ras_info) +static int mca_v3_0_mp1_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { return amdgpu_mca_ras_late_init(adev, &adev->mca.mp1); } @@ -127,7 +127,7 @@ static void mca_v3_0_mpio_query_ras_error_count(struct amdgpu_device *adev, ras_error_status); } -static int mca_v3_0_mpio_ras_late_init(struct amdgpu_device *adev, void *ras_info) +static int mca_v3_0_mpio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) { return amdgpu_mca_ras_late_init(adev, &adev->mca.mpio); } -- cgit From 867e24ca4945249baf34ea07ae6b27ca927210a1 Mon Sep 17 00:00:00 2001 From: yipechai Date: Mon, 14 Feb 2022 14:53:37 +0800 Subject: drm/amdgpu: define amdgpu_ras_late_init to call all ras blocks' .ras_late_init Define amdgpu_ras_late_init to call all ras blocks' .ras_late_init. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 44 ------------------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 25 +++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 6 ---- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 5 +--- drivers/gpu/drm/amd/amdgpu/soc15.c | 6 +--- 7 files changed, 34 insertions(+), 59 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6d4d59458cff..5b772f2af6eb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2629,6 +2629,12 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) adev->ip_blocks[i].status.late_initialized = true; } + r = amdgpu_ras_late_init(adev); + if (r) { + DRM_ERROR("amdgpu_ras_late_init failed %d", r); + return r; + } + amdgpu_ras_set_error_query_ready(adev, true); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 7bc68c94e0b8..a71688d1c1b1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -449,50 +449,6 @@ int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev) int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev) { - int r; - - if (adev->umc.ras && adev->umc.ras->ras_block.ras_late_init) { - r = adev->umc.ras->ras_block.ras_late_init(adev, adev->umc.ras_if); - if (r) - return r; - } - - if (adev->mmhub.ras && adev->mmhub.ras->ras_block.ras_late_init) { - r = adev->mmhub.ras->ras_block.ras_late_init(adev, adev->mmhub.ras_if); - if (r) - return r; - } - - if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init) { - r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, adev->gmc.xgmi.ras_if); - if (r) - return r; - } - - if (adev->hdp.ras && adev->hdp.ras->ras_block.ras_late_init) { - r = adev->hdp.ras->ras_block.ras_late_init(adev, adev->hdp.ras_if); - if (r) - return r; - } - - if (adev->mca.mp0.ras && adev->mca.mp0.ras->ras_block.ras_late_init) { - r = adev->mca.mp0.ras->ras_block.ras_late_init(adev, adev->mca.mp0.ras_if); - if (r) - return r; - } - - if (adev->mca.mp1.ras && adev->mca.mp1.ras->ras_block.ras_late_init) { - r = adev->mca.mp1.ras->ras_block.ras_late_init(adev, adev->mca.mp1.ras_if); - if (r) - return r; - } - - if (adev->mca.mpio.ras && adev->mca.mpio.ras->ras_block.ras_late_init) { - r = adev->mca.mpio.ras->ras_block.ras_late_init(adev, adev->mca.mpio.ras_if); - if (r) - return r; - } - return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b5cd21cb6e58..73e8d6f70541 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2522,6 +2522,31 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev) amdgpu_ras_disable_all_features(adev, 1); } +int amdgpu_ras_late_init(struct amdgpu_device *adev) +{ + struct amdgpu_ras_block_list *node, *tmp; + struct amdgpu_ras_block_object *obj; + int r; + + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { + if (!node->ras_obj) { + dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); + continue; + } + obj = node->ras_obj; + if (obj->ras_late_init) { + r = obj->ras_late_init(adev, &obj->ras_comm); + if (r) { + dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n", + obj->ras_comm.name, r); + return r; + } + } + } + + return 0; +} + /* do some fini work before IP fini as dependence */ int amdgpu_ras_pre_fini(struct amdgpu_device *adev) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 837d1b79a9cb..143a83043d7c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -595,6 +595,7 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) { /* called in ip_init and ip_fini */ int amdgpu_ras_init(struct amdgpu_device *adev); +int amdgpu_ras_late_init(struct amdgpu_device *adev); int amdgpu_ras_fini(struct amdgpu_device *adev); int amdgpu_ras_pre_fini(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index bb40ab83fc22..1997f129db9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4791,12 +4791,6 @@ static int gfx_v9_0_ecc_late_init(void *handle) if (r) return r; - if (adev->gfx.ras && adev->gfx.ras->ras_block.ras_late_init) { - r = adev->gfx.ras->ras_block.ras_late_init(adev, adev->gfx.ras_if); - if (r) - return r; - } - if (adev->gfx.ras && adev->gfx.ras->enable_watchdog_timer) adev->gfx.ras->enable_watchdog_timer(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index af5a1c93861b..e26c39fcd336 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1894,10 +1894,7 @@ static int sdma_v4_0_late_init(void *handle) adev->sdma.ras->ras_block.hw_ops->reset_ras_error_count(adev); } - if (adev->sdma.ras && adev->sdma.ras->ras_block.ras_late_init) - return adev->sdma.ras->ras_block.ras_late_init(adev, adev->sdma.ras_if); - else - return 0; + return 0; } static int sdma_v4_0_sw_init(void *handle) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 99a88f4d8050..25e1ee2bd2f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1189,15 +1189,11 @@ static int soc15_common_early_init(void *handle) static int soc15_common_late_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - int r = 0; if (amdgpu_sriov_vf(adev)) xgpu_ai_mailbox_get_irq(adev); - if (adev->nbio.ras && adev->nbio.ras->ras_block.ras_late_init) - r = adev->nbio.ras->ras_block.ras_late_init(adev, adev->nbio.ras_if); - - return r; + return 0; } static int soc15_common_sw_init(void *handle) -- cgit From 01d468d9a420152e4a1270992e69a37ea0c98e04 Mon Sep 17 00:00:00 2001 From: yipechai Date: Thu, 17 Feb 2022 14:52:20 +0800 Subject: drm/amdgpu: Modify .ras_fini function pointer parameter Modify .ras_fini function pointer parameter so that we can remove redundant intermediate calls in some ras blocks. Signed-off-by: yipechai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/mca_v3_0.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/soc15.c | 2 +- 19 files changed, 24 insertions(+), 24 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 52912b6bcb20..d020c4599433 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -644,7 +644,7 @@ late_fini: return r; } -void amdgpu_gfx_ras_fini(struct amdgpu_device *adev) +void amdgpu_gfx_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX) && adev->gfx.ras_if) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index ccca0a85b982..f7c50ab4589c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -387,7 +387,7 @@ bool amdgpu_gfx_is_me_queue_enabled(struct amdgpu_device *adev, int me, void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable); int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t *value); int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -void amdgpu_gfx_ras_fini(struct amdgpu_device *adev); +void amdgpu_gfx_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block); int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev, void *err_data, struct amdgpu_iv_entry *entry); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 78498d1d1769..350fbfec1e03 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -455,16 +455,16 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev) void amdgpu_gmc_ras_fini(struct amdgpu_device *adev) { if (adev->umc.ras && adev->umc.ras->ras_block.ras_fini) - adev->umc.ras->ras_block.ras_fini(adev); + adev->umc.ras->ras_block.ras_fini(adev, NULL); if (adev->mmhub.ras && adev->mmhub.ras->ras_block.ras_fini) - adev->mmhub.ras->ras_block.ras_fini(adev); + adev->mmhub.ras->ras_block.ras_fini(adev, NULL); if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_fini) - adev->gmc.xgmi.ras->ras_block.ras_fini(adev); + adev->gmc.xgmi.ras->ras_block.ras_fini(adev, NULL); if (adev->hdp.ras && adev->hdp.ras->ras_block.ras_fini) - adev->hdp.ras->ras_block.ras_fini(adev); + adev->hdp.ras->ras_block.ras_fini(adev, NULL); } /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c index b7fbc114a175..0f224e21cd55 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c @@ -24,7 +24,7 @@ #include "amdgpu.h" #include "amdgpu_ras.h" -void amdgpu_hdp_ras_fini(struct amdgpu_device *adev) +void amdgpu_hdp_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP) && adev->hdp.ras_if) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h index aabd59aa5213..c05cd992ef8a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h @@ -44,5 +44,5 @@ struct amdgpu_hdp { }; int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -void amdgpu_hdp_ras_fini(struct amdgpu_device *adev); +void amdgpu_hdp_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block); #endif /* __AMDGPU_HDP_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c index 42413813765a..6dfcedcc37fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c @@ -24,7 +24,7 @@ #include "amdgpu.h" #include "amdgpu_ras.h" -void amdgpu_mmhub_ras_fini(struct amdgpu_device *adev) +void amdgpu_mmhub_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB) && adev->mmhub.ras_if) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h index 240b26d9a388..253f047379cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h @@ -47,6 +47,6 @@ struct amdgpu_mmhub { struct amdgpu_mmhub_ras *ras; }; -void amdgpu_mmhub_ras_fini(struct amdgpu_device *adev); +void amdgpu_mmhub_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c index f09ad80f0772..0de2fdf31eed 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c @@ -44,7 +44,7 @@ late_fini: return r; } -void amdgpu_nbio_ras_fini(struct amdgpu_device *adev) +void amdgpu_nbio_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF) && adev->nbio.ras_if) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h index f9546c7341b8..3222e1cae134 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h @@ -105,5 +105,5 @@ struct amdgpu_nbio { }; int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -void amdgpu_nbio_ras_fini(struct amdgpu_device *adev); +void amdgpu_nbio_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 143a83043d7c..7cddaad90d6d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -491,7 +491,7 @@ struct amdgpu_ras_block_object { int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, enum amdgpu_ras_block block, uint32_t sub_block_index); int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if *ras_block); - void (*ras_fini)(struct amdgpu_device *adev); + void (*ras_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block); ras_ih_cb ras_cb; const struct amdgpu_ras_block_hw_ops *hw_ops; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 3b5c43575aa3..863035a94bd8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -111,7 +111,7 @@ late_fini: return r; } -void amdgpu_sdma_ras_fini(struct amdgpu_device *adev) +void amdgpu_sdma_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA) && adev->sdma.ras_if) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h index 8b226ffee32c..34ec60dfe5e8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h @@ -118,7 +118,7 @@ int amdgpu_sdma_get_index_from_ring(struct amdgpu_ring *ring, uint32_t *index); uint64_t amdgpu_sdma_get_csa_mc_addr(struct amdgpu_ring *ring, unsigned vmid); int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -void amdgpu_sdma_ras_fini(struct amdgpu_device *adev); +void amdgpu_sdma_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block); int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev, void *err_data, struct amdgpu_iv_entry *entry); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 9400260e3263..41e976733c57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -162,7 +162,7 @@ late_fini: return r; } -void amdgpu_umc_ras_fini(struct amdgpu_device *adev) +void amdgpu_umc_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) && adev->umc.ras_if) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index e4b3678a6685..c8deba8dacb5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -73,7 +73,7 @@ struct amdgpu_umc { }; int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); -void amdgpu_umc_ras_fini(struct amdgpu_device *adev); +void amdgpu_umc_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block); int amdgpu_umc_poison_handler(struct amdgpu_device *adev, void *ras_error_status, bool reset); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 91817a31f3e1..3e56adea84a3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -768,7 +768,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm return amdgpu_ras_block_late_init(adev, ras_block); } -static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev) +static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) && adev->gmc.xgmi.ras_if) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 1997f129db9c..3ecb238c6483 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2433,7 +2433,7 @@ static int gfx_v9_0_sw_fini(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; if (adev->gfx.ras && adev->gfx.ras->ras_block.ras_fini) - adev->gfx.ras->ras_block.ras_fini(adev); + adev->gfx.ras->ras_block.ras_fini(adev, NULL); for (i = 0; i < adev->gfx.num_gfx_rings; i++) amdgpu_ring_fini(&adev->gfx.gfx_ring[i]); diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c index b4b36899f5c6..02c50be19d3b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c @@ -37,7 +37,7 @@ static void mca_v3_0_mp0_query_ras_error_count(struct amdgpu_device *adev, ras_error_status); } -static void mca_v3_0_mp0_ras_fini(struct amdgpu_device *adev) +static void mca_v3_0_mp0_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { amdgpu_mca_ras_fini(adev, &adev->mca.mp0); } @@ -83,7 +83,7 @@ static void mca_v3_0_mp1_query_ras_error_count(struct amdgpu_device *adev, ras_error_status); } -static void mca_v3_0_mp1_ras_fini(struct amdgpu_device *adev) +static void mca_v3_0_mp1_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { amdgpu_mca_ras_fini(adev, &adev->mca.mp1); } @@ -115,7 +115,7 @@ static void mca_v3_0_mpio_query_ras_error_count(struct amdgpu_device *adev, ras_error_status); } -static void mca_v3_0_mpio_ras_fini(struct amdgpu_device *adev) +static void mca_v3_0_mpio_ras_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block) { amdgpu_mca_ras_fini(adev, &adev->mca.mpio); } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index e26c39fcd336..d0a9012e53d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1997,7 +1997,7 @@ static int sdma_v4_0_sw_fini(void *handle) if (adev->sdma.ras && adev->sdma.ras->ras_block.hw_ops && adev->sdma.ras->ras_block.ras_fini) - adev->sdma.ras->ras_block.ras_fini(adev); + adev->sdma.ras->ras_block.ras_fini(adev, NULL); for (i = 0; i < adev->sdma.num_instances; i++) { amdgpu_ring_fini(&adev->sdma.instance[i].ring); diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index b7c24149a6bd..34cd5cad7da5 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1215,7 +1215,7 @@ static int soc15_common_sw_fini(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; if (adev->nbio.ras && adev->nbio.ras->ras_block.ras_fini) - adev->nbio.ras->ras_block.ras_fini(adev); + adev->nbio.ras->ras_block.ras_fini(adev, NULL); if (adev->df.funcs && adev->df.funcs->sw_fini) -- cgit From 69691c823531c36c7283ecaa040e99e9c12ece07 Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Thu, 3 Mar 2022 17:56:33 +0800 Subject: drm/amdgpu: message smu to update bad channel info It should notice SMU to update bad channel info when detected uncorrectable error in UMC block Signed-off-by: Stanley.Yang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 25 +++++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 5 +++++ 5 files changed, 42 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index d78c2970e558..424c22a841f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) mutex_init(&con->recovery_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); atomic_set(&con->in_recovery, 0); + con->eeprom_control.bad_channel_bitmap = 0; max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(); amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); @@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) goto free; amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); + + if (con->update_channel_flag == true) { + amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); + con->update_channel_flag = false; + } } #ifdef CONFIG_X86_MCE_AMD @@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto release_con; } + con->update_channel_flag = false; con->features = 0; INIT_LIST_HEAD(&con->head); /* Might need get this flag from vbios. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 7cddaad90d6d..9314fde81e68 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -374,6 +374,9 @@ struct amdgpu_ras { /* record umc error info queried from smu */ struct umc_ecc_info umc_ecc; + + /* Indicates smu whether need update bad channel info */ + bool update_channel_flag; }; struct ras_fs_data { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index a44f2eeed6ef..c4283987bb1e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -267,6 +267,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); u8 csum; int res; @@ -287,6 +288,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs); + control->bad_channel_bitmap = 0; + amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap); + con->update_channel_flag = false; + amdgpu_ras_debugfs_set_ret_size(control); mutex_unlock(&control->ras_tbl_mutex); @@ -420,6 +425,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *record, const u32 num) { + struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control)); u32 a, b, i; u8 *buf, *pp; int res; @@ -431,9 +437,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, /* Encode all of them in one go. */ pp = buf; - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { __encode_table_record_to_buf(control, &record[i], pp); + /* update bad channel bitmap */ + if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { + control->bad_channel_bitmap |= 1 << record[i].mem_channel; + con->update_channel_flag = true; + } + } + /* a, first record index to write into. * b, last record index to write into. * a = first index to read (fri) + number of records in the table, @@ -686,6 +699,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, const u32 num) { struct amdgpu_device *adev = to_amdgpu_device(control); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int i, res; u8 *buf, *pp; u32 g0, g1; @@ -753,8 +767,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, /* Read up everything? Then transform. */ pp = buf; - for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { __decode_table_record_from_buf(control, &record[i], pp); + + /* update bad channel bitmap */ + if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { + control->bad_channel_bitmap |= 1 << record[i].mem_channel; + con->update_channel_flag = true; + } + } Out: kfree(buf); mutex_unlock(&control->ras_tbl_mutex); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 6bb00578bfbb..54d9bfe0881d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control { /* Protect table access via this mutex. */ struct mutex ras_tbl_mutex; + + /* Record channel info which occurred bad pages + */ + u32 bad_channel_bitmap; }; /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 85da6cbaf3b7..aad3c8b4c810 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, amdgpu_ras_save_bad_pages(adev); amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); + + if (con->update_channel_flag == true) { + amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); + con->update_channel_flag = false; + } } if (reset) -- cgit From a3d63c62bdf06c5a3f8a71c207f13b26fc6030f5 Mon Sep 17 00:00:00 2001 From: Mohammad Zafar Ziya Date: Thu, 17 Mar 2022 09:45:42 +0800 Subject: drm/amdgpu: Add vcn and jpeg ras support flag Add vcn and jpeg ras support options V2: vcn and jpeg ras flag enabled for aldebaran asic only V3: vcn and jpeg ras flag disabled for error counter query Generic poison query interface added VCN and JPEG ras enabled based on IP version check V4: vcn and jpeg ras flag moved under ecc flag for dGPU Signed-off-by: Mohammad Zafar Ziya Reviewed-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++ 2 files changed, 11 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 424c22a841f4..ec709997c9c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -66,6 +66,8 @@ const char *ras_block_string[] = { "mp1", "fuse", "mca", + "vcn", + "jpeg", }; const char *ras_mca_block_string[] = { @@ -2205,6 +2207,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) dev_info(adev->dev, "SRAM ECC is active.\n"); adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); + + if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0)) + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | + 1 << AMDGPU_RAS_BLOCK__JPEG); + else + adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | + 1 << AMDGPU_RAS_BLOCK__JPEG); } else { dev_info(adev->dev, "SRAM ECC is not presented.\n"); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 9314fde81e68..1e1a3b736859 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -49,6 +49,8 @@ enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__MP1, AMDGPU_RAS_BLOCK__FUSE, AMDGPU_RAS_BLOCK__MCA, + AMDGPU_RAS_BLOCK__VCN, + AMDGPU_RAS_BLOCK__JPEG, AMDGPU_RAS_BLOCK__LAST }; -- cgit From c543dcbe4237e03b23fa40e0fba979cfd8514954 Mon Sep 17 00:00:00 2001 From: Mohammad Zafar Ziya Date: Wed, 23 Mar 2022 12:06:49 +0800 Subject: drm/amdgpu/vcn: Add VCN ras error query support RAS error query support addition for VCN 2.6 V2: removed unused option and corrected comment format Moved the register definition under header file V3: poison query status check added. Removed error query interface V4: MMSCH poison check option removed, return true/false refactored. Signed-off-by: Mohammad Zafar Ziya Reviewed-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 71 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h | 6 +++ 3 files changed, 78 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 1e1a3b736859..606df8869b89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -508,6 +508,7 @@ struct amdgpu_ras_block_hw_ops { void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status); void (*reset_ras_error_count)(struct amdgpu_device *adev); void (*reset_ras_error_status)(struct amdgpu_device *adev); + bool (*query_poison_status)(struct amdgpu_device *adev); }; /* work flow diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c index 1bf672966a62..3e1de8c8b372 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c @@ -31,6 +31,7 @@ #include "soc15d.h" #include "vcn_v2_0.h" #include "mmsch_v1_0.h" +#include "vcn_v2_5.h" #include "vcn/vcn_2_5_offset.h" #include "vcn/vcn_2_5_sh_mask.h" @@ -59,6 +60,7 @@ static int vcn_v2_5_set_powergating_state(void *handle, static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device *adev, int inst_idx, struct dpg_pause_state *new_state); static int vcn_v2_5_sriov_start(struct amdgpu_device *adev); +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev); static int amdgpu_ih_clientid_vcns[] = { SOC15_IH_CLIENTID_VCN, @@ -100,6 +102,7 @@ static int vcn_v2_5_early_init(void *handle) vcn_v2_5_set_dec_ring_funcs(adev); vcn_v2_5_set_enc_ring_funcs(adev); vcn_v2_5_set_irq_funcs(adev); + vcn_v2_5_set_ras_funcs(adev); return 0; } @@ -1932,3 +1935,71 @@ const struct amdgpu_ip_block_version vcn_v2_6_ip_block = .rev = 0, .funcs = &vcn_v2_6_ip_funcs, }; + +static uint32_t vcn_v2_6_query_poison_by_instance(struct amdgpu_device *adev, + uint32_t instance, uint32_t sub_block) +{ + uint32_t poison_stat = 0, reg_value = 0; + + switch (sub_block) { + case AMDGPU_VCN_V2_6_VCPU_VCODEC: + reg_value = RREG32_SOC15(VCN, instance, mmUVD_RAS_VCPU_VCODEC_STATUS); + poison_stat = REG_GET_FIELD(reg_value, UVD_RAS_VCPU_VCODEC_STATUS, POISONED_PF); + break; + default: + break; + }; + + if (poison_stat) + dev_info(adev->dev, "Poison detected in VCN%d, sub_block%d\n", + instance, sub_block); + + return poison_stat; +} + +static bool vcn_v2_6_query_poison_status(struct amdgpu_device *adev) +{ + uint32_t inst, sub; + uint32_t poison_stat = 0; + + for (inst = 0; inst < adev->vcn.num_vcn_inst; inst++) + for (sub = 0; sub < AMDGPU_VCN_V2_6_MAX_SUB_BLOCK; sub++) + poison_stat += + vcn_v2_6_query_poison_by_instance(adev, inst, sub); + + return !!poison_stat; +} + +const struct amdgpu_ras_block_hw_ops vcn_v2_6_ras_hw_ops = { + .query_poison_status = vcn_v2_6_query_poison_status, +}; + +static struct amdgpu_vcn_ras vcn_v2_6_ras = { + .ras_block = { + .hw_ops = &vcn_v2_6_ras_hw_ops, + }, +}; + +static void vcn_v2_5_set_ras_funcs(struct amdgpu_device *adev) +{ + switch (adev->ip_versions[VCN_HWIP][0]) { + case IP_VERSION(2, 6, 0): + adev->vcn.ras = &vcn_v2_6_ras; + break; + default: + break; + } + + if (adev->vcn.ras) { + amdgpu_ras_register_ras_block(adev, &adev->vcn.ras->ras_block); + + strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn"); + adev->vcn.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN; + adev->vcn.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON; + adev->vcn.ras_if = &adev->vcn.ras->ras_block.ras_comm; + + /* If don't define special ras_late_init function, use default ras_late_init */ + if (!adev->vcn.ras->ras_block.ras_late_init) + adev->vcn.ras->ras_block.ras_late_init = amdgpu_ras_block_late_init; + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h index e72f799ed0fd..1c19af74e4fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.h @@ -24,6 +24,12 @@ #ifndef __VCN_V2_5_H__ #define __VCN_V2_5_H__ +enum amdgpu_vcn_v2_6_sub_block { + AMDGPU_VCN_V2_6_VCPU_VCODEC = 0, + + AMDGPU_VCN_V2_6_MAX_SUB_BLOCK, +}; + extern const struct amdgpu_ip_block_version vcn_v2_5_ip_block; extern const struct amdgpu_ip_block_version vcn_v2_6_ip_block; -- cgit From 66f879496121c18c541125d04a444c62f3ca82ad Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Tue, 19 Apr 2022 11:04:19 +0800 Subject: drm/amdgpu: add RAS poison consumption handler (v2) Add support for general RAS poison consumption handler. v2: remove callback function for poison consumption. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 34 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 2 files changed, 35 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f10afc1ea017..1c86ec9ab139 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1515,6 +1515,38 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) /* ras fs end */ /* ih begin */ +static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj, + struct amdgpu_iv_entry *entry) +{ + bool poison_stat = true, need_reset = true; + struct amdgpu_device *adev = obj->adev; + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct amdgpu_ras_block_object *block_obj = + amdgpu_ras_get_ras_block(adev, obj->head.block, 0); + + if (!adev->gmc.xgmi.connected_to_cpu) + amdgpu_umc_poison_handler(adev, &err_data, false); + + /* both query_poison_status and handle_poison_consumption are optional */ + if (block_obj && block_obj->hw_ops) { + if (block_obj->hw_ops->query_poison_status) { + poison_stat = block_obj->hw_ops->query_poison_status(adev); + if (!poison_stat) + dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", + block_obj->ras_comm.name); + } + + if (poison_stat && block_obj->hw_ops->handle_poison_consumption) { + poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); + need_reset = poison_stat; + } + } + + /* gpu reset is fallback for all failed cases */ + if (need_reset) + amdgpu_ras_reset_gpu(adev); +} + static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, struct amdgpu_iv_entry *entry) { @@ -1567,6 +1599,8 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) amdgpu_ras_interrupt_poison_creation_handler(obj, &entry); + else + amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry); } else { if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) amdgpu_ras_interrupt_umc_handler(obj, &entry); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 606df8869b89..c4b61785ab5c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -509,6 +509,7 @@ struct amdgpu_ras_block_hw_ops { void (*reset_ras_error_count)(struct amdgpu_device *adev); void (*reset_ras_error_status)(struct amdgpu_device *adev); bool (*query_poison_status)(struct amdgpu_device *adev); + bool (*handle_poison_consumption)(struct amdgpu_device *adev); }; /* work flow -- cgit From b3c76814ce5b043faa2f07108f1c87ed1cbc8cd1 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Tue, 19 Apr 2022 14:45:09 +0800 Subject: drm/amdgpu: add RAS fatal error interrupt handler The fatal error handler is independent from general ras interrupt handler since there is no related IH ring. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 15 +-------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 3 files changed, 22 insertions(+), 14 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c index ea3e8c66211f..b4cf8717f554 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c @@ -193,20 +193,7 @@ static irqreturn_t amdgpu_irq_handler(int irq, void *arg) if (ret == IRQ_HANDLED) pm_runtime_mark_last_busy(dev->dev); - /* For the hardware that cannot enable bif ring for both ras_controller_irq - * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status - * register to check whether the interrupt is triggered or not, and properly - * ack the interrupt if it is there - */ - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) { - if (adev->nbio.ras && - adev->nbio.ras->handle_ras_controller_intr_no_bifring) - adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); - - if (adev->nbio.ras && - adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) - adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); - } + amdgpu_ras_interrupt_fatal_error_handler(adev); return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1c86ec9ab139..03ce3ce913e4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1515,6 +1515,26 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) /* ras fs end */ /* ih begin */ + +/* For the hardware that cannot enable bif ring for both ras_controller_irq + * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status + * register to check whether the interrupt is triggered or not, and properly + * ack the interrupt if it is there + */ +void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) + return; + + if (adev->nbio.ras && + adev->nbio.ras->handle_ras_controller_intr_no_bifring) + adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); + + if (adev->nbio.ras && + adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) + adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); +} + static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj, struct amdgpu_iv_entry *entry) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index c4b61785ab5c..b9a6fac2b8b2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -683,4 +683,5 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct amdgpu_ras_block_object *ras_block_obj); +void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); #endif -- cgit