aboutsummaryrefslogtreecommitdiff
path: root/drivers/accel/habanalabs/common/device.c
diff options
context:
space:
mode:
authorTal Cohen <talcohen@habana.ai>2023-03-21 10:59:28 +0200
committerOded Gabbay <ogabbay@kernel.org>2023-04-08 10:39:34 +0300
commit802f25b6c2c0377c681dd1e4f799a648c3df50dd (patch)
tree5d1f273eb477af299e803cc5bdfd186689891aa9 /drivers/accel/habanalabs/common/device.c
parent82a1b48a4e3e8fdb945af63b6fff5304ff5c3c17 (diff)
accel/habanalabs: sync f/w events interrupt in hard reset
Receiving events from FW, while the device is in hard reset, causes a warning message in Driver log. The message may point to a problem in the Driver or FW. But It also can appear as a result of events that have been sent from FW just before the hard reset. In order to avoid receiving events from FW while the device is in reset and is already in 'disabled' mode, sync the f/w events interrupt right before setting the device to 'disabled'. Signed-off-by: Tal Cohen <talcohen@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/accel/habanalabs/common/device.c')
-rw-r--r--drivers/accel/habanalabs/common/device.c55
1 files changed, 30 insertions, 25 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 3c1af9d43b65..fabfc501ef54 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1380,13 +1380,41 @@ static void device_disable_open_processes(struct hl_device *hdev, bool control_d
mutex_unlock(fd_lock);
}
+static void send_disable_pci_access(struct hl_device *hdev, u32 flags)
+{
+ /* If reset is due to heartbeat, device CPU is no responsive in
+ * which case no point sending PCI disable message to it.
+ */
+ if ((flags & HL_DRV_RESET_HARD) &&
+ !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
+ /* Disable PCI access from device F/W so he won't send
+ * us additional interrupts. We disable MSI/MSI-X at
+ * the halt_engines function and we can't have the F/W
+ * sending us interrupts after that. We need to disable
+ * the access here because if the device is marked
+ * disable, the message won't be send. Also, in case
+ * of heartbeat, the device CPU is marked as disable
+ * so this message won't be sent
+ */
+ if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) {
+ dev_warn(hdev->dev, "Failed to disable FW's PCI access\n");
+ return;
+ }
+
+ /* verify that last EQs are handled before disabled is set */
+ if (hdev->cpu_queues_enable)
+ synchronize_irq(pci_irq_vector(hdev->pdev,
+ hdev->asic_prop.eq_interrupt_id));
+ }
+}
+
static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
{
u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
/* No consecutive mechanism when user context exists */
if (hdev->is_compute_ctx_active)
- goto disable_pci;
+ return;
/*
* 'reset cause' is being updated here, because getting here
@@ -1418,30 +1446,6 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
} else {
hdev->reset_info.reset_trigger_repeated = 1;
}
-
- /* If reset is due to heartbeat, device CPU is no responsive in
- * which case no point sending PCI disable message to it.
- *
- * If F/W is performing the reset, no need to send it a message to disable
- * PCI access
- */
-
-disable_pci:
- if ((flags & HL_DRV_RESET_HARD) &&
- !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
- /* Disable PCI access from device F/W so he won't send
- * us additional interrupts. We disable MSI/MSI-X at
- * the halt_engines function and we can't have the F/W
- * sending us interrupts after that. We need to disable
- * the access here because if the device is marked
- * disable, the message won't be send. Also, in case
- * of heartbeat, the device CPU is marked as disable
- * so this message won't be sent
- */
- if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
- dev_warn(hdev->dev,
- "Failed to disable FW's PCI access\n");
- }
}
/*
@@ -1562,6 +1566,7 @@ do_reset:
escalate_reset_flow:
handle_reset_trigger(hdev, flags);
+ send_disable_pci_access(hdev, flags);
/* This also blocks future CS/VM/JOB completion operations */
hdev->disabled = true;