aboutsummaryrefslogtreecommitdiff
path: root/drivers/misc/habanalabs/common/device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/habanalabs/common/device.c')
-rw-r--r--drivers/misc/habanalabs/common/device.c82
1 files changed, 67 insertions, 15 deletions
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 00e92b678828..ff4cbde289c0 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -51,6 +51,8 @@ bool hl_device_operational(struct hl_device *hdev,
static void hpriv_release(struct kref *ref)
{
+ u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
+ bool device_is_idle = true;
struct hl_fpriv *hpriv;
struct hl_device *hdev;
@@ -71,8 +73,20 @@ static void hpriv_release(struct kref *ref)
kfree(hpriv);
- if (hdev->reset_upon_device_release)
- hl_device_reset(hdev, 0);
+ if ((!hdev->pldm) && (hdev->pdev) &&
+ (!hdev->asic_funcs->is_device_idle(hdev,
+ idle_mask,
+ HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) {
+ dev_err(hdev->dev,
+ "device not idle after user context is closed (0x%llx_%llx)\n",
+ idle_mask[1], idle_mask[0]);
+
+ device_is_idle = false;
+ }
+
+ if ((hdev->reset_if_device_not_idle && !device_is_idle)
+ || hdev->reset_upon_device_release)
+ hl_device_reset(hdev, HL_RESET_DEVICE_RELEASE);
}
void hl_hpriv_get(struct hl_fpriv *hpriv)
@@ -118,6 +132,9 @@ static int hl_device_release(struct inode *inode, struct file *filp)
dev_warn(hdev->dev,
"Device is still in use because there are live CS and/or memory mappings\n");
+ hdev->last_open_session_duration_jif =
+ jiffies - hdev->last_successful_open_jif;
+
return 0;
}
@@ -868,7 +885,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
int hl_device_reset(struct hl_device *hdev, u32 flags)
{
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
- bool hard_reset, from_hard_reset_thread;
+ bool hard_reset, from_hard_reset_thread, hard_instead_soft = false;
int i, rc;
if (!hdev->init_done) {
@@ -880,11 +897,28 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hard_reset = (flags & HL_RESET_HARD) != 0;
from_hard_reset_thread = (flags & HL_RESET_FROM_RESET_THREAD) != 0;
- if ((!hard_reset) && (!hdev->supports_soft_reset)) {
- dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
+ if (!hard_reset && !hdev->supports_soft_reset) {
+ hard_instead_soft = true;
+ hard_reset = true;
+ }
+
+ if (hdev->reset_upon_device_release &&
+ (flags & HL_RESET_DEVICE_RELEASE)) {
+ dev_dbg(hdev->dev,
+ "Perform %s-reset upon device release\n",
+ hard_reset ? "hard" : "soft");
+ goto do_reset;
+ }
+
+ if (!hard_reset && !hdev->allow_external_soft_reset) {
+ hard_instead_soft = true;
hard_reset = true;
}
+ if (hard_instead_soft)
+ dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
+
+do_reset:
/* Re-entry of reset thread */
if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
goto kill_processes;
@@ -901,6 +935,19 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
return 0;
/*
+ * 'reset cause' is being updated here, because getting here
+ * means that it's the 1st time and the last time we're here
+ * ('in_reset' makes sure of it). This makes sure that
+ * 'reset_cause' will continue holding its 1st recorded reason!
+ */
+ if (flags & HL_RESET_HEARTBEAT)
+ hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
+ else if (flags & HL_RESET_TDR)
+ hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
+ else
+ hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+
+ /*
* if reset is due to heartbeat, device CPU is no responsive in
* which case no point sending PCI disable message to it
*/
@@ -943,9 +990,8 @@ again:
hdev->process_kill_trial_cnt = 0;
/*
- * Because the reset function can't run from interrupt or
- * from heartbeat work, we need to call the reset function
- * from a dedicated work
+ * Because the reset function can't run from heartbeat work,
+ * we need to call the reset function from a dedicated work.
*/
queue_delayed_work(hdev->device_reset_work.wq,
&hdev->device_reset_work.reset_work, 0);
@@ -1096,8 +1142,8 @@ kill_processes:
if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
dev_err(hdev->dev,
- "device is not idle (mask %#llx %#llx) after reset\n",
- idle_mask[0], idle_mask[1]);
+ "device is not idle (mask 0x%llx_%llx) after reset\n",
+ idle_mask[1], idle_mask[0]);
rc = -EIO;
goto out_err;
}
@@ -1334,8 +1380,9 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
}
/*
- * From this point, in case of an error, add char devices and create
- * sysfs nodes as part of the error flow, to allow debugging.
+ * From this point, override rc (=0) in case of an error to allow
+ * debugging (by adding char devices and create sysfs nodes as part of
+ * the error flow).
*/
add_cdev_sysfs_on_err = true;
@@ -1369,7 +1416,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
hdev->asic_name,
- hdev->asic_prop.dram_size / 1024 / 1024 / 1024);
+ hdev->asic_prop.dram_size / SZ_1G);
rc = hl_vm_init(hdev);
if (rc) {
@@ -1475,6 +1522,7 @@ out_disabled:
void hl_device_fini(struct hl_device *hdev)
{
ktime_t timeout;
+ u64 reset_sec;
int i, rc;
dev_info(hdev->dev, "Removing device\n");
@@ -1482,6 +1530,11 @@ void hl_device_fini(struct hl_device *hdev)
hdev->device_fini_pending = 1;
flush_delayed_work(&hdev->device_reset_work.reset_work);
+ if (hdev->pldm)
+ reset_sec = HL_PLDM_HARD_RESET_MAX_TIMEOUT;
+ else
+ reset_sec = HL_HARD_RESET_MAX_TIMEOUT;
+
/*
* This function is competing with the reset function, so try to
* take the reset atomic and if we are already in middle of reset,
@@ -1490,8 +1543,7 @@ void hl_device_fini(struct hl_device *hdev)
* ports, the hard reset could take between 10-30 seconds
*/
- timeout = ktime_add_us(ktime_get(),
- HL_HARD_RESET_MAX_TIMEOUT * 1000 * 1000);
+ timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000);
rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
while (rc) {
usleep_range(50, 200);