aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/misc/habanalabs/command_submission.c1
-rw-r--r--drivers/misc/habanalabs/device.c23
-rw-r--r--drivers/misc/habanalabs/gaudi/gaudi.c74
-rw-r--r--drivers/misc/habanalabs/goya/goya.c35
-rw-r--r--drivers/misc/habanalabs/habanalabs.h10
-rw-r--r--drivers/misc/habanalabs/memory.c35
-rw-r--r--drivers/misc/habanalabs/sysfs.c5
7 files changed, 126 insertions, 57 deletions
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index 7da9847eb9d6..75d8302352e5 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -99,7 +99,6 @@ static const struct dma_fence_ops hl_fence_ops = {
.get_driver_name = hl_fence_get_driver_name,
.get_timeline_name = hl_fence_get_timeline_name,
.enable_signaling = hl_fence_enable_signaling,
- .wait = dma_fence_default_wait,
.release = hl_fence_release
};
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 4b6c8de46dd8..2b38a119704c 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -726,7 +726,7 @@ disable_device:
return rc;
}
-static void device_kill_open_processes(struct hl_device *hdev)
+static int device_kill_open_processes(struct hl_device *hdev)
{
u16 pending_total, pending_cnt;
struct hl_fpriv *hpriv;
@@ -779,9 +779,7 @@ static void device_kill_open_processes(struct hl_device *hdev)
ssleep(1);
}
- if (!list_empty(&hdev->fpriv_list))
- dev_crit(hdev->dev,
- "Going to hard reset with open user contexts\n");
+ return list_empty(&hdev->fpriv_list) ? 0 : -EBUSY;
}
static void device_hard_reset_pending(struct work_struct *work)
@@ -801,6 +799,7 @@ static void device_hard_reset_pending(struct work_struct *work)
* @hdev: pointer to habanalabs device structure
* @hard_reset: should we do hard reset to all engines or just reset the
* compute/dma engines
+ * @from_hard_reset_thread: is the caller the hard-reset thread
*
* Block future CS and wait for pending CS to be enqueued
* Call ASIC H/W fini
@@ -823,6 +822,11 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
return 0;
}
+ if ((!hard_reset) && (!hdev->supports_soft_reset)) {
+ dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
+ hard_reset = true;
+ }
+
/*
* Prevent concurrency in this function - only one reset should be
* done at any given time. Only need to perform this if we didn't
@@ -902,7 +906,12 @@ again:
* process can't really exit until all its CSs are done, which
* is what we do in cs rollback
*/
- device_kill_open_processes(hdev);
+ rc = device_kill_open_processes(hdev);
+ if (rc) {
+ dev_crit(hdev->dev,
+ "Failed to kill all open processes, stopping hard reset\n");
+ goto out_err;
+ }
/* Flush the Event queue workers to make sure no other thread is
* reading or writing to registers during the reset
@@ -1385,7 +1394,9 @@ void hl_device_fini(struct hl_device *hdev)
* can't really exit until all its CSs are done, which is what we
* do in cs rollback
*/
- device_kill_open_processes(hdev);
+ rc = device_kill_open_processes(hdev);
+ if (rc)
+ dev_crit(hdev->dev, "Failed to kill all open processes\n");
hl_cb_pool_fini(hdev);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4cb1f71dd4f1..61f88e9884ce 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -5774,7 +5774,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
>> EQ_CTL_EVENT_TYPE_SHIFT);
u8 cause;
- bool soft_reset_required;
+ bool reset_required;
gaudi->events_stat[event_type]++;
gaudi->events_stat_aggregate[event_type]++;
@@ -5840,12 +5840,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_TPC6_DEC:
case GAUDI_EVENT_TPC7_DEC:
gaudi_print_irq_info(hdev, event_type, true);
- soft_reset_required = gaudi_tpc_read_interrupts(hdev,
+ reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_dec_event_to_tpc_id(event_type),
"AXI_SLV_DEC_Error");
- if (soft_reset_required)
- hl_device_reset(hdev, false, false);
- hl_fw_unmask_irq(hdev, event_type);
+ if (reset_required) {
+ dev_err(hdev->dev, "hard reset required due to %s\n",
+ gaudi_irq_map_table[event_type].name);
+
+ if (hdev->hard_reset_on_fw_events)
+ hl_device_reset(hdev, true, false);
+ } else {
+ hl_fw_unmask_irq(hdev, event_type);
+ }
break;
case GAUDI_EVENT_TPC0_KRN_ERR:
@@ -5857,12 +5863,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_TPC6_KRN_ERR:
case GAUDI_EVENT_TPC7_KRN_ERR:
gaudi_print_irq_info(hdev, event_type, true);
- soft_reset_required = gaudi_tpc_read_interrupts(hdev,
+ reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_krn_event_to_tpc_id(event_type),
"KRN_ERR");
- if (soft_reset_required)
- hl_device_reset(hdev, false, false);
- hl_fw_unmask_irq(hdev, event_type);
+ if (reset_required) {
+ dev_err(hdev->dev, "hard reset required due to %s\n",
+ gaudi_irq_map_table[event_type].name);
+
+ if (hdev->hard_reset_on_fw_events)
+ hl_device_reset(hdev, true, false);
+ } else {
+ hl_fw_unmask_irq(hdev, event_type);
+ }
break;
case GAUDI_EVENT_PCIE_CORE_SERR:
@@ -5913,8 +5925,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_RAZWI_OR_ADC_SW:
gaudi_print_irq_info(hdev, event_type, true);
- hl_device_reset(hdev, false, false);
- hl_fw_unmask_irq(hdev, event_type);
+ if (hdev->hard_reset_on_fw_events)
+ hl_device_reset(hdev, true, false);
break;
case GAUDI_EVENT_TPC0_BMON_SPMU:
@@ -5963,7 +5975,7 @@ static void *gaudi_get_events_stat(struct hl_device *hdev, bool aggregate,
return gaudi->events_stat;
}
-static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
+static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
u32 flags)
{
struct gaudi_device *gaudi = hdev->asic_specific;
@@ -5972,34 +5984,40 @@ static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
- return;
-
- mutex_lock(&hdev->mmu_cache_lock);
+ return 0;
if (hdev->pldm)
timeout_usec = GAUDI_PLDM_MMU_TIMEOUT_USEC;
else
timeout_usec = MMU_CONFIG_TIMEOUT_USEC;
+ mutex_lock(&hdev->mmu_cache_lock);
+
/* L0 & L1 invalidation */
- WREG32(mmSTLB_INV_ALL_START, 1);
+ WREG32(mmSTLB_INV_PS, 2);
rc = hl_poll_timeout(
hdev,
- mmSTLB_INV_ALL_START,
+ mmSTLB_INV_PS,
status,
!status,
1000,
timeout_usec);
- if (rc)
- dev_notice_ratelimited(hdev->dev,
- "Timeout when waiting for MMU cache invalidation\n");
+ WREG32(mmSTLB_INV_SET, 0);
mutex_unlock(&hdev->mmu_cache_lock);
+
+ if (rc) {
+ dev_err_ratelimited(hdev->dev,
+ "MMU cache invalidation timeout\n");
+ hl_device_reset(hdev, true, false);
+ }
+
+ return rc;
}
-static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
+static int gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
bool is_hard, u32 asid, u64 va, u64 size)
{
struct gaudi_device *gaudi = hdev->asic_specific;
@@ -6010,7 +6028,7 @@ static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
- return;
+ return 0;
mutex_lock(&hdev->mmu_cache_lock);
@@ -6041,11 +6059,15 @@ static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
1000,
timeout_usec);
- if (rc)
- dev_notice_ratelimited(hdev->dev,
- "Timeout when waiting for MMU cache invalidation\n");
-
mutex_unlock(&hdev->mmu_cache_lock);
+
+ if (rc) {
+ dev_err_ratelimited(hdev->dev,
+ "MMU cache invalidation timeout\n");
+ hl_device_reset(hdev, true, false);
+ }
+
+ return rc;
}
static int gaudi_mmu_update_asid_hop0_addr(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 15b6c3228e37..0d2952bb58df 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -752,6 +752,7 @@ static int goya_sw_init(struct hl_device *hdev)
spin_lock_init(&goya->hw_queues_lock);
hdev->supports_coresight = true;
+ hdev->supports_soft_reset = true;
return 0;
@@ -4883,7 +4884,7 @@ static void goya_mmu_prepare(struct hl_device *hdev, u32 asid)
goya_mmu_prepare_reg(hdev, goya_mmu_regs[i], asid);
}
-static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
+static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
u32 flags)
{
struct goya_device *goya = hdev->asic_specific;
@@ -4892,11 +4893,11 @@ static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
- return;
+ return 0;
/* no need in L1 only invalidation in Goya */
if (!is_hard)
- return;
+ return 0;
if (hdev->pldm)
timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
@@ -4918,13 +4919,17 @@ static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
mutex_unlock(&hdev->mmu_cache_lock);
- if (rc)
- dev_notice_ratelimited(hdev->dev,
- "Timeout when waiting for MMU cache invalidation\n");
+ if (rc) {
+ dev_err_ratelimited(hdev->dev,
+ "MMU cache invalidation timeout\n");
+ hl_device_reset(hdev, true, false);
+ }
+
+ return rc;
}
-static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
- bool is_hard, u32 asid, u64 va, u64 size)
+static int goya_mmu_invalidate_cache_range(struct hl_device *hdev,
+ bool is_hard, u32 asid, u64 va, u64 size)
{
struct goya_device *goya = hdev->asic_specific;
u32 status, timeout_usec, inv_data, pi;
@@ -4932,11 +4937,11 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
hdev->hard_reset_pending)
- return;
+ return 0;
/* no need in L1 only invalidation in Goya */
if (!is_hard)
- return;
+ return 0;
if (hdev->pldm)
timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
@@ -4969,9 +4974,13 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
mutex_unlock(&hdev->mmu_cache_lock);
- if (rc)
- dev_notice_ratelimited(hdev->dev,
- "Timeout when waiting for MMU cache invalidation\n");
+ if (rc) {
+ dev_err_ratelimited(hdev->dev,
+ "MMU cache invalidation timeout\n");
+ hl_device_reset(hdev, true, false);
+ }
+
+ return rc;
}
int goya_send_heartbeat(struct hl_device *hdev)
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 5a855b7edf43..1ecdcf8b763a 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -675,9 +675,9 @@ struct hl_asic_funcs {
u32 *size);
u64 (*read_pte)(struct hl_device *hdev, u64 addr);
void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val);
- void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
+ int (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
u32 flags);
- void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
+ int (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
u32 asid, u64 va, u64 size);
int (*send_heartbeat)(struct hl_device *hdev);
void (*enable_clock_gating)(struct hl_device *hdev);
@@ -755,8 +755,8 @@ struct hl_va_range {
* with huge pages.
* @dram_va_range: holds available virtual addresses for DRAM mappings.
* @mem_hash_lock: protects the mem_hash.
- * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifing the
- * MMU hash or walking the PGT requires talking this lock
+ * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
+ * MMU hash or walking the PGT requires talking this lock.
* @debugfs_list: node in debugfs list of contexts.
* @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
* to user so user could inquire about CS. It is used as
@@ -1436,6 +1436,7 @@ struct hl_device_idle_busy_ts {
* @stop_on_err: true if engines should stop on error.
* @supports_sync_stream: is sync stream supported.
* @supports_coresight: is CoreSight supported.
+ * @supports_soft_reset: is soft reset supported.
*/
struct hl_device {
struct pci_dev *pdev;
@@ -1522,6 +1523,7 @@ struct hl_device {
u8 stop_on_err;
u8 supports_sync_stream;
u8 supports_coresight;
+ u8 supports_soft_reset;
/* Parameters for bring-up */
u8 mmu_enable;
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index a72f766ca470..4b8eed1ca513 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -886,6 +886,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
vm_type = (enum vm_type_t *) userptr;
hint_addr = args->map_host.hint_addr;
+ handle = phys_pg_pack->handle;
} else {
handle = lower_32_bits(args->map_device.handle);
@@ -954,10 +955,17 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
goto map_err;
}
- hdev->asic_funcs->mmu_invalidate_cache(hdev, false, *vm_type);
+ rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, false, *vm_type);
mutex_unlock(&ctx->mmu_lock);
+ if (rc) {
+ dev_err(hdev->dev,
+ "mapping handle %u failed due to MMU cache invalidation\n",
+ handle);
+ goto map_err;
+ }
+
ret_vaddr += phys_pg_pack->offset;
hnode->ptr = vm_type;
@@ -1083,21 +1091,34 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
* at the loop end rather than for each iteration
*/
if (!ctx_free)
- hdev->asic_funcs->mmu_invalidate_cache(hdev, true, *vm_type);
+ rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, true,
+ *vm_type);
mutex_unlock(&ctx->mmu_lock);
/*
- * No point in maintaining the free VA block list if the context is
- * closing as the list will be freed anyway
+ * If the context is closing we don't need to check for the MMU cache
+ * invalidation return code and update the VA free list as in this flow
+ * we invalidate the MMU cache outside of this unmap function and the VA
+ * free list will be freed anyway.
*/
if (!ctx_free) {
- rc = add_va_block(hdev, va_range, vaddr,
- vaddr + phys_pg_pack->total_size - 1);
+ int tmp_rc;
+
if (rc)
+ dev_err(hdev->dev,
+ "unmapping vaddr 0x%llx failed due to MMU cache invalidation\n",
+ vaddr);
+
+ tmp_rc = add_va_block(hdev, va_range, vaddr,
+ vaddr + phys_pg_pack->total_size - 1);
+ if (tmp_rc) {
dev_warn(hdev->dev,
"add va block failed for vaddr: 0x%llx\n",
vaddr);
+ if (!rc)
+ rc = tmp_rc;
+ }
}
atomic_dec(&phys_pg_pack->mapping_cnt);
@@ -1108,7 +1129,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
dma_unmap_host_va(hdev, userptr);
}
- return 0;
+ return rc;
mapping_cnt_err:
if (is_userptr)
diff --git a/drivers/misc/habanalabs/sysfs.c b/drivers/misc/habanalabs/sysfs.c
index e4454414d0e1..5d78d5e1c782 100644
--- a/drivers/misc/habanalabs/sysfs.c
+++ b/drivers/misc/habanalabs/sysfs.c
@@ -183,6 +183,11 @@ static ssize_t soft_reset_store(struct device *dev,
goto out;
}
+ if (!hdev->supports_soft_reset) {
+ dev_err(hdev->dev, "Device does not support soft-reset\n");
+ goto out;
+ }
+
dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n");
hl_device_reset(hdev, false, false);