aboutsummaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd/kfd_events.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_events.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.c35
1 files changed, 26 insertions, 9 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index e9f0e0a1b41c..908081c85de1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -852,8 +852,8 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
if (type == KFD_EVENT_TYPE_MEMORY) {
dev_warn(kfd_device,
- "Sending SIGSEGV to HSA Process with PID %d ",
- p->lead_thread->pid);
+ "Sending SIGSEGV to process %d (pasid 0x%x)",
+ p->lead_thread->pid, p->pasid);
send_sig(SIGSEGV, p->lead_thread, 0);
}
@@ -861,13 +861,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
if (send_signal) {
if (send_sigterm) {
dev_warn(kfd_device,
- "Sending SIGTERM to HSA Process with PID %d ",
- p->lead_thread->pid);
+ "Sending SIGTERM to process %d (pasid 0x%x)",
+ p->lead_thread->pid, p->pasid);
send_sig(SIGTERM, p->lead_thread, 0);
} else {
dev_err(kfd_device,
- "HSA Process (PID %d) got unhandled exception",
- p->lead_thread->pid);
+ "Process %d (pasid 0x%x) got unhandled exception",
+ p->lead_thread->pid, p->pasid);
}
}
}
@@ -936,7 +936,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
/* Workaround on Raven to not kill the process when memory is freed
* before IOMMU is able to finish processing all the excessive PPRs
*/
- if (dev->device_info->asic_family != CHIP_RAVEN) {
+ if (dev->device_info->asic_family != CHIP_RAVEN &&
+ dev->device_info->asic_family != CHIP_RENOIR) {
mutex_lock(&p->event_mutex);
/* Lookup events by type and signal them */
@@ -983,7 +984,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
return; /* Presumably process exited. */
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
memory_exception_data.gpu_id = dev->id;
- memory_exception_data.failure.imprecise = 1;
+ memory_exception_data.failure.imprecise = true;
/* Set failure reason */
if (info) {
memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
@@ -1011,25 +1012,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
void kfd_signal_reset_event(struct kfd_dev *dev)
{
struct kfd_hsa_hw_exception_data hw_exception_data;
+ struct kfd_hsa_memory_exception_data memory_exception_data;
struct kfd_process *p;
struct kfd_event *ev;
unsigned int temp;
uint32_t id, idx;
+ int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
+ KFD_HW_EXCEPTION_ECC :
+ KFD_HW_EXCEPTION_GPU_HANG;
/* Whole gpu reset caused by GPU hang and memory is lost */
memset(&hw_exception_data, 0, sizeof(hw_exception_data));
hw_exception_data.gpu_id = dev->id;
hw_exception_data.memory_lost = 1;
+ hw_exception_data.reset_cause = reset_cause;
+
+ memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+ memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
+ memory_exception_data.gpu_id = dev->id;
+ memory_exception_data.failure.imprecise = true;
idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
mutex_lock(&p->event_mutex);
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
- idr_for_each_entry_continue(&p->event_idr, ev, id)
+ idr_for_each_entry_continue(&p->event_idr, ev, id) {
if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
ev->hw_exception_data = hw_exception_data;
set_event(ev);
}
+ if (ev->type == KFD_EVENT_TYPE_MEMORY &&
+ reset_cause == KFD_HW_EXCEPTION_ECC) {
+ ev->memory_exception_data = memory_exception_data;
+ set_event(ev);
+ }
+ }
mutex_unlock(&p->event_mutex);
}
srcu_read_unlock(&kfd_processes_srcu, idx);