diff options
author | Daniel Vetter <daniel.vetter@ffwll.ch> | 2024-07-05 11:39:22 +0200 |
---|---|---|
committer | Daniel Vetter <daniel.vetter@ffwll.ch> | 2024-07-05 11:39:23 +0200 |
commit | 71e9f407fd42d8ce28ff40c4d9cda08c9f3c0f99 (patch) | |
tree | 96780d326a1cf69f6f2d3fdaaf6c2e0127342dce /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | 6256274c0182b584e7011077d071f905f2385f64 (diff) | |
parent | 15eb8573ad72a97b8f70e3c88b9bef6ddc861f77 (diff) |
Merge tag 'amd-drm-next-6.11-2024-06-28' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.11-2024-06-28:
amdgpu:
- JPEG 5.x fixes
- More FW loading cleanups
- Misc code cleanups
- GC 12.x fixes
- ASPM fix
- DCN 4.0.1 updates
- SR-IOV fixes
- HDCP fix
- USB4 fixes
- Silence UBSAN warnings
- MES submission fixes
- Update documentation for new products
- DCC updates
- Initial ISP 4.x plumbing
- RAS fixes
- Misc small fixes
amdkfd:
- Fix missing unlock in error path for adding queues
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240628213135.427214-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 197 |
1 file changed, 145 insertions, 52 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 68e9935028db..4edd8e333d36 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -120,7 +120,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) -#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms +#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms @@ -1384,10 +1384,17 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i memset(&qctx, 0, sizeof(qctx)); qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ? RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID); + + if (!down_read_trylock(&adev->reset_domain->sem)) { + ret = -EIO; + goto out_fini_err_data; + } + ret = amdgpu_ras_query_error_status_helper(adev, info, &err_data, &qctx, error_query_mode); + up_read(&adev->reset_domain->sem); if (ret) goto out_fini_err_data; @@ -2105,10 +2112,8 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) { struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev); - amdgpu_ras_put_poison_req(obj->adev, - AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false); - atomic_inc(&con->page_retirement_req_cnt); + atomic_inc(&con->poison_creation_count); wake_up(&con->page_retirement_wq); } @@ -2799,7 +2804,8 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key)); INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); - ecc_log->de_updated = false; + ecc_log->de_queried_count = 0; + ecc_log->prev_de_queried_count = 0; } static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -2818,7 +2824,8 @@ static void amdgpu_ras_ecc_log_fini(struct 
ras_ecc_log_info *ecc_log) mutex_unlock(&ecc_log->lock); mutex_destroy(&ecc_log->lock); - ecc_log->de_updated = false; + ecc_log->de_queried_count = 0; + ecc_log->prev_de_queried_count = 0; } static void amdgpu_ras_do_page_retirement(struct work_struct *work) @@ -2850,60 +2857,116 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) mutex_unlock(&con->umc_ecc_log.lock); } -static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, - uint32_t timeout_ms) +static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, + uint32_t poison_creation_count) { int ret = 0; struct ras_ecc_log_info *ecc_log; struct ras_query_if info; - uint32_t timeout = timeout_ms; + uint32_t timeout = 0; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + uint64_t de_queried_count; + uint32_t new_detect_count, total_detect_count; + uint32_t need_query_count = poison_creation_count; + bool query_data_timeout = false; memset(&info, 0, sizeof(info)); info.head.block = AMDGPU_RAS_BLOCK__UMC; ecc_log = &ras->umc_ecc_log; - ecc_log->de_updated = false; + total_detect_count = 0; do { ret = amdgpu_ras_query_error_status(adev, &info); - if (ret) { - dev_err(adev->dev, "Failed to query ras error! 
ret:%d\n", ret); - return; + if (ret) + return ret; + + de_queried_count = ecc_log->de_queried_count; + if (de_queried_count > ecc_log->prev_de_queried_count) { + new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; + ecc_log->prev_de_queried_count = de_queried_count; + timeout = 0; + } else { + new_detect_count = 0; } - if (timeout && !ecc_log->de_updated) { - msleep(1); - timeout--; + if (new_detect_count) { + total_detect_count += new_detect_count; + } else { + if (!timeout && need_query_count) + timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; + + if (timeout) { + if (!--timeout) { + query_data_timeout = true; + break; + } + msleep(1); + } } - } while (timeout && !ecc_log->de_updated); + } while (total_detect_count < need_query_count); - if (timeout_ms && !timeout) { - dev_warn(adev->dev, "Can't find deferred error\n"); - return; + if (query_data_timeout) { + dev_warn(adev->dev, "Can't find deferred error! count: %u\n", + (need_query_count - total_detect_count)); + return -ENOENT; } - if (!ret) + if (total_detect_count) schedule_delayed_work(&ras->page_retirement_dwork, 0); + + return 0; +} + +static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_poison_msg msg; + int ret; + + do { + ret = kfifo_get(&con->poison_fifo, &msg); + } while (ret); } static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, - struct ras_poison_msg *poison_msg) + uint32_t msg_count, uint32_t *gpu_reset) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - uint32_t reset = poison_msg->reset; - uint16_t pasid = poison_msg->pasid; + uint32_t reset_flags = 0, reset = 0; + struct ras_poison_msg msg; + int ret, i; kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (poison_msg->pasid_fn) - poison_msg->pasid_fn(adev, pasid, poison_msg->data); + for (i = 0; i < msg_count; i++) { + ret = amdgpu_ras_get_poison_req(adev, &msg); + if (!ret) + continue; + + if (msg.pasid_fn) + 
msg.pasid_fn(adev, msg.pasid, msg.data); + + reset_flags |= msg.reset; + } /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ - if (reset && !con->is_rma) { + if (reset_flags && !con->is_rma) { + if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + else + reset = reset_flags; + flush_delayed_work(&con->page_retirement_dwork); con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); + + *gpu_reset = reset; + + /* Wait for gpu recovery to complete */ + flush_work(&con->recovery_work); } return 0; @@ -2913,9 +2976,9 @@ static int amdgpu_ras_page_retirement_thread(void *param) { struct amdgpu_device *adev = (struct amdgpu_device *)param; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_poison_msg poison_msg; - enum amdgpu_ras_block ras_block; - bool poison_creation_is_handled = false; + uint32_t poison_creation_count, msg_count; + uint32_t gpu_reset; + int ret; while (!kthread_should_stop()) { @@ -2926,33 +2989,61 @@ static int amdgpu_ras_page_retirement_thread(void *param) if (kthread_should_stop()) break; - atomic_dec(&con->page_retirement_req_cnt); + gpu_reset = 0; - if (!amdgpu_ras_get_poison_req(adev, &poison_msg)) - continue; + do { + poison_creation_count = atomic_read(&con->poison_creation_count); + ret = amdgpu_ras_poison_creation_handler(adev, poison_creation_count); + if (ret == -EIO) + break; - ras_block = poison_msg.block; + if (poison_creation_count) { + atomic_sub(poison_creation_count, &con->poison_creation_count); + atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); + } + } while (atomic_read(&con->poison_creation_count)); + + if (ret != -EIO) { + msg_count = kfifo_len(&con->poison_fifo); + if (msg_count) { + ret = amdgpu_ras_poison_consumption_handler(adev, + msg_count, &gpu_reset); + if ((ret != -EIO) && + (gpu_reset != 
AMDGPU_RAS_GPU_RESET_MODE1_RESET)) + atomic_sub(msg_count, &con->page_retirement_req_cnt); + } + } - dev_dbg(adev->dev, "Start processing ras block %s(%d)\n", - ras_block_str(ras_block), ras_block); + if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) { + /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ + /* Clear poison creation request */ + atomic_set(&con->poison_creation_count, 0); - if (ras_block == AMDGPU_RAS_BLOCK__UMC) { - amdgpu_ras_poison_creation_handler(adev, - MAX_UMC_POISON_POLLING_TIME_ASYNC); - poison_creation_is_handled = true; - } else { - /* poison_creation_is_handled: - * false: no poison creation interrupt, but it has poison - * consumption interrupt. - * true: It has poison creation interrupt at the beginning, - * but it has no poison creation interrupt later. - */ - amdgpu_ras_poison_creation_handler(adev, - poison_creation_is_handled ? - 0 : MAX_UMC_POISON_POLLING_TIME_ASYNC); + /* Clear poison fifo */ + amdgpu_ras_clear_poison_fifo(adev); + + /* Clear all poison requests */ + atomic_set(&con->page_retirement_req_cnt, 0); + + if (ret == -EIO) { + /* Wait for mode-1 reset to complete */ + down_read(&adev->reset_domain->sem); + up_read(&adev->reset_domain->sem); + } + + /* Wake up work to save bad pages to eeprom */ + schedule_delayed_work(&con->page_retirement_dwork, 0); + } else if (gpu_reset) { + /* gpu just completed mode-2 reset or other reset */ + /* Clear poison consumption messages cached in fifo */ + msg_count = kfifo_len(&con->poison_fifo); + if (msg_count) { + amdgpu_ras_clear_poison_fifo(adev); + atomic_sub(msg_count, &con->page_retirement_req_cnt); + } - amdgpu_ras_poison_consumption_handler(adev, &poison_msg); - poison_creation_is_handled = false; + /* Wake up work to save bad pages to eeprom */ + schedule_delayed_work(&con->page_retirement_dwork, 0); } } @@ -3026,6 +3117,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) mutex_init(&con->page_retirement_lock); 
init_waitqueue_head(&con->page_retirement_wq); atomic_set(&con->page_retirement_req_cnt, 0); + atomic_set(&con->poison_creation_count, 0); con->page_retirement_thread = kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement"); if (IS_ERR(con->page_retirement_thread)) { @@ -3074,6 +3166,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) kthread_stop(con->page_retirement_thread); atomic_set(&con->page_retirement_req_cnt, 0); + atomic_set(&con->poison_creation_count, 0); mutex_destroy(&con->page_rsv_lock); |