diff options
author | YiPeng Chai <YiPeng.Chai@amd.com> | 2024-06-24 11:33:19 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-06-27 17:32:06 -0400 |
commit | e278849cb2b663bca7dd67ba5d531ecb5b4557df (patch) | |
tree | 5ceaf053818575fe3e7faf76cd4f5eb583cb6b2d | |
parent | 5f08275cfd88609c86ee86d92efdb196d27c732d (diff) |
drm/amdgpu: refine poison consumption interrupt handler
1. The poison fifo is only used for poison consumption
requests.
2. Merge reset requests when the poison fifo caches multiple
poison consumption messages.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 11 |
2 files changed, 43 insertions, 23 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6e7c4f1f86da..d3247533d15e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2911,23 +2911,41 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, } static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, - struct ras_poison_msg *poison_msg) + uint32_t msg_count, uint32_t *gpu_reset) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - uint32_t reset = poison_msg->reset; - uint16_t pasid = poison_msg->pasid; + uint32_t reset_flags = 0, reset = 0; + struct ras_poison_msg msg; + int ret, i; kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (poison_msg->pasid_fn) - poison_msg->pasid_fn(adev, pasid, poison_msg->data); + for (i = 0; i < msg_count; i++) { + ret = amdgpu_ras_get_poison_req(adev, &msg); + if (!ret) + continue; + + if (msg.pasid_fn) + msg.pasid_fn(adev, msg.pasid, msg.data); + + reset_flags |= msg.reset; + } /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ - if (reset && !con->is_rma) { + if (reset_flags && !con->is_rma) { + if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + else + reset = reset_flags; + flush_delayed_work(&con->page_retirement_dwork); con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); + + *gpu_reset = reset; } return 0; @@ -2937,10 +2955,9 @@ static int amdgpu_ras_page_retirement_thread(void *param) { struct amdgpu_device *adev = (struct amdgpu_device *)param; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - uint32_t poison_creation_count; + uint32_t poison_creation_count, msg_count; + uint32_t gpu_reset; int ret; - struct ras_poison_msg poison_msg; - enum amdgpu_ras_block ras_block; while (!kthread_should_stop()) { @@ -2951,6 +2968,7 @@ static int 
amdgpu_ras_page_retirement_thread(void *param) if (kthread_should_stop()) break; + gpu_reset = 0; do { poison_creation_count = atomic_read(&con->poison_creation_count); @@ -2964,15 +2982,16 @@ static int amdgpu_ras_page_retirement_thread(void *param) } } while (atomic_read(&con->poison_creation_count)); - if (!amdgpu_ras_get_poison_req(adev, &poison_msg)) - continue; - - ras_block = poison_msg.block; - - dev_dbg(adev->dev, "Start processing ras block %s(%d)\n", - ras_block_str(ras_block), ras_block); - - amdgpu_ras_poison_consumption_handler(adev, &poison_msg); + if (ret != -EIO) { + msg_count = kfifo_len(&con->poison_fifo); + if (msg_count) { + ret = amdgpu_ras_poison_consumption_handler(adev, + msg_count, &gpu_reset); + if ((ret != -EIO) && + (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET)) + atomic_sub(msg_count, &con->page_retirement_req_cnt); + } + } } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 20e0e522fb51..2f84bdb8c594 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -293,14 +293,15 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, amdgpu_ras_error_data_fini(&err_data); } else { - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - - amdgpu_ras_put_poison_req(adev, - block, pasid, pasid_fn, data, reset); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int ret; + ret = amdgpu_ras_put_poison_req(adev, + block, pasid, pasid_fn, data, reset); + if (!ret) { atomic_inc(&con->page_retirement_req_cnt); - wake_up(&con->page_retirement_wq); + } } } else { if (adev->virt.ops && adev->virt.ops->ras_poison_handler) |