diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_events.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_events.c | 35 |
1 file changed, 26 insertions, 9 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index e9f0e0a1b41c..908081c85de1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -852,8 +852,8 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
 
 	if (type == KFD_EVENT_TYPE_MEMORY) {
 		dev_warn(kfd_device,
-			"Sending SIGSEGV to HSA Process with PID %d ",
-				p->lead_thread->pid);
+			"Sending SIGSEGV to process %d (pasid 0x%x)",
+				p->lead_thread->pid, p->pasid);
 		send_sig(SIGSEGV, p->lead_thread, 0);
 	}
 
@@ -861,13 +861,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
 	if (send_signal) {
 		if (send_sigterm) {
 			dev_warn(kfd_device,
-				"Sending SIGTERM to HSA Process with PID %d ",
-					p->lead_thread->pid);
+				"Sending SIGTERM to process %d (pasid 0x%x)",
+					p->lead_thread->pid, p->pasid);
 			send_sig(SIGTERM, p->lead_thread, 0);
 		} else {
 			dev_err(kfd_device,
-				"HSA Process (PID %d) got unhandled exception",
-				p->lead_thread->pid);
+				"Process %d (pasid 0x%x) got unhandled exception",
+				p->lead_thread->pid, p->pasid);
 		}
 	}
 }
@@ -936,7 +936,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
 	/* Workaround on Raven to not kill the process when memory is freed
 	 * before IOMMU is able to finish processing all the excessive PPRs
 	 */
-	if (dev->device_info->asic_family != CHIP_RAVEN) {
+	if (dev->device_info->asic_family != CHIP_RAVEN &&
+	    dev->device_info->asic_family != CHIP_RENOIR) {
 		mutex_lock(&p->event_mutex);
 
 		/* Lookup events by type and signal them */
@@ -983,7 +984,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
 		return; /* Presumably process exited. */
 	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
 	memory_exception_data.gpu_id = dev->id;
-	memory_exception_data.failure.imprecise = 1;
+	memory_exception_data.failure.imprecise = true;
 	/* Set failure reason */
 	if (info) {
 		memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
@@ -1011,25 +1012,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
 void kfd_signal_reset_event(struct kfd_dev *dev)
 {
 	struct kfd_hsa_hw_exception_data hw_exception_data;
+	struct kfd_hsa_memory_exception_data memory_exception_data;
 	struct kfd_process *p;
 	struct kfd_event *ev;
 	unsigned int temp;
 	uint32_t id, idx;
+	int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
+			KFD_HW_EXCEPTION_ECC :
+			KFD_HW_EXCEPTION_GPU_HANG;
 
 	/* Whole gpu reset caused by GPU hang and memory is lost */
 	memset(&hw_exception_data, 0, sizeof(hw_exception_data));
 	hw_exception_data.gpu_id = dev->id;
 	hw_exception_data.memory_lost = 1;
+	hw_exception_data.reset_cause = reset_cause;
+
+	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+	memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
+	memory_exception_data.gpu_id = dev->id;
+	memory_exception_data.failure.imprecise = true;
 
 	idx = srcu_read_lock(&kfd_processes_srcu);
 	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
 		mutex_lock(&p->event_mutex);
 		id = KFD_FIRST_NONSIGNAL_EVENT_ID;
-		idr_for_each_entry_continue(&p->event_idr, ev, id)
+		idr_for_each_entry_continue(&p->event_idr, ev, id) {
 			if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
 				ev->hw_exception_data = hw_exception_data;
 				set_event(ev);
 			}
+			if (ev->type == KFD_EVENT_TYPE_MEMORY &&
+			    reset_cause == KFD_HW_EXCEPTION_ECC) {
+				ev->memory_exception_data = memory_exception_data;
+				set_event(ev);
+			}
+		}
 		mutex_unlock(&p->event_mutex);
 	}
 	srcu_read_unlock(&kfd_processes_srcu, idx);