1 files changed, 67 insertions, 15 deletions
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 00e92b678828..ff4cbde289c0 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -51,6 +51,8 @@ bool hl_device_operational(struct hl_device *hdev,
 
 static void hpriv_release(struct kref *ref)
 {
+	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
+	bool device_is_idle = true;
 	struct hl_fpriv *hpriv;
 	struct hl_device *hdev;
 
@@ -71,8 +73,20 @@ static void hpriv_release(struct kref *ref)
 
 	kfree(hpriv);
 
-	if (hdev->reset_upon_device_release)
-		hl_device_reset(hdev, 0);
+	if ((!hdev->pldm) && (hdev->pdev) &&
+			(!hdev->asic_funcs->is_device_idle(hdev,
+				idle_mask,
+				HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) {
+		dev_err(hdev->dev,
+			"device not idle after user context is closed (0x%llx_%llx)\n",
+			idle_mask[1], idle_mask[0]);
+
+		device_is_idle = false;
+	}
+
+	if ((hdev->reset_if_device_not_idle && !device_is_idle)
+			|| hdev->reset_upon_device_release)
+		hl_device_reset(hdev, HL_RESET_DEVICE_RELEASE);
 }
 
 void hl_hpriv_get(struct hl_fpriv *hpriv)
@@ -118,6 +132,9 @@ static int hl_device_release(struct inode *inode, struct file *filp)
 		dev_warn(hdev->dev,
 			"Device is still in use because there are live CS and/or memory mappings\n");
 
+	hdev->last_open_session_duration_jif =
+		jiffies - hdev->last_successful_open_jif;
+
 	return 0;
 }
 
@@ -868,7 +885,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
 int hl_device_reset(struct hl_device *hdev, u32 flags)
 {
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
-	bool hard_reset, from_hard_reset_thread;
+	bool hard_reset, from_hard_reset_thread, hard_instead_soft = false;
 	int i, rc;
 
 	if (!hdev->init_done) {
@@ -880,11 +897,28 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	hard_reset = (flags & HL_RESET_HARD) != 0;
 	from_hard_reset_thread = (flags & HL_RESET_FROM_RESET_THREAD) != 0;
 
-	if ((!hard_reset) && (!hdev->supports_soft_reset)) {
-		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
+	if (!hard_reset && !hdev->supports_soft_reset) {
+		hard_instead_soft = true;
+		hard_reset = true;
+	}
+
+	if (hdev->reset_upon_device_release &&
+			(flags & HL_RESET_DEVICE_RELEASE)) {
+		dev_dbg(hdev->dev,
+			"Perform %s-reset upon device release\n",
+			hard_reset ? "hard" : "soft");
+		goto do_reset;
+	}
+
+	if (!hard_reset && !hdev->allow_external_soft_reset) {
+		hard_instead_soft = true;
 		hard_reset = true;
 	}
 
+	if (hard_instead_soft)
+		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
+
+do_reset:
 	/* Re-entry of reset thread */
 	if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
 		goto kill_processes;
@@ -901,6 +935,19 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 			return 0;
 
 		/*
+		 * 'reset cause' is being updated here, because getting here
+		 * means that it's the 1st time and the last time we're here
+		 * ('in_reset' makes sure of it). This makes sure that
+		 * 'reset_cause' will continue holding its 1st recorded reason!
+		 */
+		if (flags & HL_RESET_HEARTBEAT)
+			hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
+		else if (flags & HL_RESET_TDR)
+			hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
+		else
+			hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+
+		/*
 		 * if reset is due to heartbeat, device CPU is no responsive in
 		 * which case no point sending PCI disable message to it
 		 */
@@ -943,9 +990,8 @@ again:
 		hdev->process_kill_trial_cnt = 0;
 
 		/*
-		 * Because the reset function can't run from interrupt or
-		 * from heartbeat work, we need to call the reset function
-		 * from a dedicated work
+		 * Because the reset function can't run from heartbeat work,
+		 * we need to call the reset function from a dedicated work.
 		 */
 		queue_delayed_work(hdev->device_reset_work.wq,
 			&hdev->device_reset_work.reset_work, 0);
@@ -1096,8 +1142,8 @@ kill_processes:
 	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
 			HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
 		dev_err(hdev->dev,
-			"device is not idle (mask %#llx %#llx) after reset\n",
-			idle_mask[0], idle_mask[1]);
+			"device is not idle (mask 0x%llx_%llx) after reset\n",
+			idle_mask[1], idle_mask[0]);
 		rc = -EIO;
 		goto out_err;
 	}
@@ -1334,8 +1380,9 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 	}
 
 	/*
-	 * From this point, in case of an error, add char devices and create
-	 * sysfs nodes as part of the error flow, to allow debugging.
+	 * From this point, override rc (=0) in case of an error to allow
+	 * debugging (by adding char devices and create sysfs nodes as part of
+	 * the error flow).
 	 */
 	add_cdev_sysfs_on_err = true;
 
@@ -1369,7 +1416,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 
 	dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
 		hdev->asic_name,
-		hdev->asic_prop.dram_size / 1024 / 1024 / 1024);
+		hdev->asic_prop.dram_size / SZ_1G);
 
 	rc = hl_vm_init(hdev);
 	if (rc) {
@@ -1475,6 +1522,7 @@ out_disabled:
 void hl_device_fini(struct hl_device *hdev)
 {
 	ktime_t timeout;
+	u64 reset_sec;
 	int i, rc;
 
 	dev_info(hdev->dev, "Removing device\n");
@@ -1482,6 +1530,11 @@ void hl_device_fini(struct hl_device *hdev)
 	hdev->device_fini_pending = 1;
 	flush_delayed_work(&hdev->device_reset_work.reset_work);
 
+	if (hdev->pldm)
+		reset_sec = HL_PLDM_HARD_RESET_MAX_TIMEOUT;
+	else
+		reset_sec = HL_HARD_RESET_MAX_TIMEOUT;
+
 	/*
 	 * This function is competing with the reset function, so try to
 	 * take the reset atomic and if we are already in middle of reset,
@@ -1490,8 +1543,7 @@ void hl_device_fini(struct hl_device *hdev)
 	 * ports, the hard reset could take between 10-30 seconds
 	 */
 
-	timeout = ktime_add_us(ktime_get(),
-				HL_HARD_RESET_MAX_TIMEOUT * 1000 * 1000);
+	timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000);
 	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
 	while (rc) {
 		usleep_range(50, 200);