diff options
-rw-r--r-- | drivers/accel/habanalabs/common/device.c | 37 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/firmware_if.c | 16 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/habanalabs.h | 5 |
3 files changed, 46 insertions, 12 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 7bd7c2eb5dd2..050c278e5ddb 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1062,11 +1062,28 @@ static bool is_pci_link_healthy(struct hl_device *hdev) return (device_id == hdev->pdev->device); } +static void stringify_time_of_last_heartbeat(struct hl_device *hdev, char *time_str, size_t size, + bool is_pq_hb) +{ + time64_t seconds = is_pq_hb ? hdev->heartbeat_debug_info.last_pq_heartbeat_ts + : hdev->heartbeat_debug_info.last_eq_heartbeat_ts; + struct tm tm; + + if (!seconds) + return; + + time64_to_tm(seconds, 0, &tm); + + snprintf(time_str, size, "%ld-%02d-%02d %02d:%02d:%02d (UTC)", + tm.tm_year + 1900, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); +} + static bool hl_device_eq_heartbeat_received(struct hl_device *hdev) { struct eq_heartbeat_debug_info *heartbeat_debug_info = &hdev->heartbeat_debug_info; u32 cpu_q_id = heartbeat_debug_info->cpu_queue_id, pq_pi_mask = (HL_QUEUE_LENGTH << 1) - 1; struct asic_fixed_properties *prop = &hdev->asic_prop; + char pq_time_str[64] = "N/A", eq_time_str[64] = "N/A"; if (!prop->cpucp_info.eq_health_check_supported) return true; @@ -1074,13 +1091,17 @@ static bool hl_device_eq_heartbeat_received(struct hl_device *hdev) if (!hdev->eq_heartbeat_received) { dev_err(hdev->dev, "EQ heartbeat event was not received!\n"); + stringify_time_of_last_heartbeat(hdev, pq_time_str, sizeof(pq_time_str), true); + stringify_time_of_last_heartbeat(hdev, eq_time_str, sizeof(eq_time_str), false); dev_err(hdev->dev, - "Heartbeat events counter: %u, EQ CI: %u, PQ PI: %u, PQ CI: %u (%u)\n", - heartbeat_debug_info->heartbeat_event_counter, + "EQ: {CI %u, HB counter %u, last HB time: %s}, PQ: {PI: %u, CI: %u (%u), last HB time: %s}\n", hdev->event_queue.ci, + heartbeat_debug_info->heartbeat_event_counter, + eq_time_str, hdev->kernel_queues[cpu_q_id].pi, atomic_read(&hdev->kernel_queues[cpu_q_id].ci), - atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask); + atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask, + pq_time_str); hl_eq_dump(hdev, &hdev->event_queue); @@ -1562,12 +1583,19 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) } } +static void reset_heartbeat_debug_info(struct hl_device *hdev) +{ + hdev->heartbeat_debug_info.last_pq_heartbeat_ts = 0; + hdev->heartbeat_debug_info.last_eq_heartbeat_ts = 0; + hdev->heartbeat_debug_info.heartbeat_event_counter = 0; +} + static inline void device_heartbeat_schedule(struct hl_device *hdev) { if (!hdev->heartbeat) return; - hdev->heartbeat_debug_info.heartbeat_event_counter = 0; + reset_heartbeat_debug_info(hdev); /* * Before scheduling the heartbeat driver will check if eq event has received. @@ -2883,6 +2911,7 @@ void hl_set_irq_affinity(struct hl_device *hdev, int irq) void hl_eq_heartbeat_event_handle(struct hl_device *hdev) { hdev->heartbeat_debug_info.heartbeat_event_counter++; + hdev->heartbeat_debug_info.last_eq_heartbeat_ts = ktime_get_real_seconds(); hdev->eq_heartbeat_received = true; } diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 3cd8a1f69980..eeb6b2a80fc7 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -466,12 +466,12 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, } else { struct hl_bd *bd = queue->kernel_address; - bd += hl_pi_2_offset(queue->pi); + bd += hl_pi_2_offset(pi); dev_err(hdev->dev, "Device CPU packet timeout (status = 0x%x)\n" - "Pkt info: dma_addr: 0x%llx, kernel_addr: %p, len:0x%x, ctl: 0x%x, ptr:0x%llx, dram_bd:%u\n", - tmp, pkt_dma_addr, (void *)pkt, bd->len, bd->ctl, bd->ptr, - queue->dram_bd); + "Pkt info[%u]: dma_addr: 0x%llx, kernel_addr: %p, len:0x%x, ctl: 0x%x, ptr:0x%llx, dram_bd:%u\n", + tmp, pi, pkt_dma_addr, (void *)pkt, bd->len, bd->ctl, bd->ptr, + queue->dram_bd); } hdev->device_cpu_disabled = true; goto out; @@ -681,12 +681,10 @@ int hl_fw_send_heartbeat(struct hl_device *hdev) int rc; memset(&hb_pkt, 0, sizeof(hb_pkt)); - hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << - CPUCP_PKT_CTL_OPCODE_SHIFT); + hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << CPUCP_PKT_CTL_OPCODE_SHIFT); hb_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL); - rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt, - sizeof(hb_pkt), 0, &result); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt, sizeof(hb_pkt), 0, &result); if ((rc) || (result != CPUCP_PACKET_FENCE_VAL)) return -EIO; @@ -697,6 +695,8 @@ int hl_fw_send_heartbeat(struct hl_device *hdev) rc = -EIO; } + hdev->heartbeat_debug_info.last_pq_heartbeat_ts = ktime_get_real_seconds(); + return rc; } diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index ce78b331e244..a06e5a966f45 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3196,10 +3196,15 @@ struct hl_reset_info { /** * struct eq_heartbeat_debug_info - stores debug info to be used upon heartbeat failure. + * @last_pq_heartbeat_ts: timestamp of the last test packet that was sent to FW. + * This packet is the trigger in FW to send the EQ heartbeat event. + * @last_eq_heartbeat_ts: timestamp of the last EQ heartbeat event that was received from FW. * @heartbeat_event_counter: number of heartbeat events received. * @cpu_queue_id: used to read the queue pi/ci */ struct eq_heartbeat_debug_info { + time64_t last_pq_heartbeat_ts; + time64_t last_eq_heartbeat_ts; u32 heartbeat_event_counter; u32 cpu_queue_id; }; |