diff options
Diffstat (limited to 'drivers/misc/habanalabs/common/firmware_if.c')
-rw-r--r-- | drivers/misc/habanalabs/common/firmware_if.c | 208 |
1 files changed, 139 insertions, 69 deletions
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 0e1c629e9800..ba6920f2b4ab 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -279,8 +279,74 @@ int hl_fw_send_heartbeat(struct hl_device *hdev) return rc; } +static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg, + u32 cpu_security_boot_status_reg) +{ + u32 err_val, security_val; + + /* Some of the firmware status codes are deprecated in newer f/w + * versions. In those versions, the errors are reported + * in different registers. Therefore, we need to check those + * registers and print the exact errors. Moreover, there + * may be multiple errors, so we need to report on each error + * separately. Some of the error codes might indicate a state + * that is not an error per-se, but it is an error in production + * environment + */ + err_val = RREG32(boot_err0_reg); + if (!(err_val & CPU_BOOT_ERR0_ENABLED)) + return 0; + + if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) + dev_err(hdev->dev, + "Device boot error - DRAM initialization failed\n"); + if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) + dev_err(hdev->dev, "Device boot error - FIT image corrupted\n"); + if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) + dev_err(hdev->dev, + "Device boot error - Thermal Sensor initialization failed\n"); + if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) + dev_warn(hdev->dev, + "Device boot warning - Skipped DRAM initialization\n"); + + if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) { + if (hdev->bmc_enable) + dev_warn(hdev->dev, + "Device boot error - Skipped waiting for BMC\n"); + else + err_val &= ~CPU_BOOT_ERR0_BMC_WAIT_SKIPPED; + } + + if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) + dev_err(hdev->dev, + "Device boot error - Serdes data from BMC not available\n"); + if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) + dev_err(hdev->dev, + "Device boot error - NIC F/W initialization failed\n"); + if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) + dev_warn(hdev->dev, + "Device boot warning - security not ready\n"); + if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) + dev_err(hdev->dev, "Device boot error - security failure\n"); + if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) + dev_err(hdev->dev, "Device boot error - eFuse failure\n"); + if (err_val & CPU_BOOT_ERR0_PLL_FAIL) + dev_err(hdev->dev, "Device boot error - PLL failure\n"); + + security_val = RREG32(cpu_security_boot_status_reg); + if (security_val & CPU_BOOT_DEV_STS0_ENABLED) + dev_dbg(hdev->dev, "Device security status %#x\n", + security_val); + + if (err_val & ~CPU_BOOT_ERR0_ENABLED) + return -EIO; + + return 0; +} + int hl_fw_cpucp_info_get(struct hl_device *hdev, - u32 cpu_security_boot_status_reg) + u32 cpu_security_boot_status_reg, + u32 boot_err0_reg) { struct asic_fixed_properties *prop = &hdev->asic_prop; struct cpucp_packet pkt = {}; @@ -314,6 +380,12 @@ int hl_fw_cpucp_info_get(struct hl_device *hdev, goto out; } + rc = fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg); + if (rc) { + dev_err(hdev->dev, "Errors in device boot\n"); + goto out; + } + memcpy(&prop->cpucp_info, cpucp_info_cpu_addr, sizeof(prop->cpucp_info)); @@ -402,6 +474,10 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, } counters->rx_throughput = result; + memset(&pkt, 0, sizeof(pkt)); + pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_THROUGHPUT_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + /* Fetch PCI tx counter */ pkt.index = cpu_to_le32(cpucp_pcie_throughput_tx); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), @@ -414,6 +490,7 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, counters->tx_throughput = result; /* Fetch PCI replay counter */ + memset(&pkt, 0, sizeof(pkt)); pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_REPLAY_CNT_GET << CPUCP_PKT_CTL_OPCODE_SHIFT); @@ -478,58 +555,6 @@ int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u16 pll_index, return rc; } -static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg, - u32 cpu_security_boot_status_reg) -{ - u32 err_val, security_val; - - /* Some of the firmware status codes are deprecated in newer f/w - * versions. In those versions, the errors are reported - * in different registers. Therefore, we need to check those - * registers and print the exact errors. Moreover, there - * may be multiple errors, so we need to report on each error - * separately. Some of the error codes might indicate a state - * that is not an error per-se, but it is an error in production - * environment - */ - err_val = RREG32(boot_err0_reg); - if (!(err_val & CPU_BOOT_ERR0_ENABLED)) - return; - - if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) - dev_err(hdev->dev, - "Device boot error - DRAM initialization failed\n"); - if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) - dev_err(hdev->dev, "Device boot error - FIT image corrupted\n"); - if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) - dev_err(hdev->dev, - "Device boot error - Thermal Sensor initialization failed\n"); - if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) - dev_warn(hdev->dev, - "Device boot warning - Skipped DRAM initialization\n"); - if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) - dev_warn(hdev->dev, - "Device boot error - Skipped waiting for BMC\n"); - if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) - dev_err(hdev->dev, - "Device boot error - Serdes data from BMC not available\n"); - if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) - dev_err(hdev->dev, - "Device boot error - NIC F/W initialization failed\n"); - if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) - dev_warn(hdev->dev, - "Device boot warning - security not ready\n"); - if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) - dev_err(hdev->dev, "Device boot error - security failure\n"); - if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) - dev_err(hdev->dev, "Device boot error - eFuse failure\n"); - - security_val = RREG32(cpu_security_boot_status_reg); - if (security_val & CPU_BOOT_DEV_STS0_ENABLED) - dev_dbg(hdev->dev, "Device security status %#x\n", - security_val); -} - static void detect_cpu_boot_status(struct hl_device *hdev, u32 status) { /* Some of the status codes below are deprecated in newer f/w @@ -627,25 +652,41 @@ int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg, security_status = RREG32(cpu_security_boot_status_reg); /* We read security status multiple times during boot: - * 1. preboot - we check if fw security feature is supported - * 2. boot cpu - we get boot cpu security status - * 3. FW application - we get FW application security status + * 1. preboot - a. Check whether the security status bits are valid + * b. Check whether fw security is enabled + * c. Check whether hard reset is done by preboot + * 2. boot cpu - a. Fetch boot cpu security status + * b. Check whether hard reset is done by boot cpu + * 3. FW application - a. Fetch fw application security status + * b. Check whether hard reset is done by fw app * * Preboot: * Check security status bit (CPU_BOOT_DEV_STS0_ENABLED), if it is set * check security enabled bit (CPU_BOOT_DEV_STS0_SECURITY_EN) */ if (security_status & CPU_BOOT_DEV_STS0_ENABLED) { - hdev->asic_prop.fw_security_status_valid = 1; - prop->fw_security_disabled = - !(security_status & CPU_BOOT_DEV_STS0_SECURITY_EN); + prop->fw_security_status_valid = 1; + + if (security_status & CPU_BOOT_DEV_STS0_SECURITY_EN) + prop->fw_security_disabled = false; + else + prop->fw_security_disabled = true; + + if (security_status & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) + prop->hard_reset_done_by_fw = true; } else { - hdev->asic_prop.fw_security_status_valid = 0; + prop->fw_security_status_valid = 0; prop->fw_security_disabled = true; } + dev_dbg(hdev->dev, "Firmware preboot security status %#x\n", + security_status); + + dev_dbg(hdev->dev, "Firmware preboot hard-reset is %s\n", + prop->hard_reset_done_by_fw ? "enabled" : "disabled"); + dev_info(hdev->dev, "firmware-level security is %s\n", - prop->fw_security_disabled ? "disabled" : "enabled"); + prop->fw_security_disabled ? "disabled" : "enabled"); return 0; } @@ -655,6 +696,7 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, u32 cpu_security_boot_status_reg, u32 boot_err0_reg, bool skip_bmc, u32 cpu_timeout, u32 boot_fit_timeout) { + struct asic_fixed_properties *prop = &hdev->asic_prop; u32 status; int rc; @@ -723,11 +765,26 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, /* Read U-Boot version now in case we will later fail */ hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_UBOOT); + /* Clear reset status since we need to read it again from boot CPU */ + prop->hard_reset_done_by_fw = false; + /* Read boot_cpu security bits */ - if (hdev->asic_prop.fw_security_status_valid) - hdev->asic_prop.fw_boot_cpu_security_map = + if (prop->fw_security_status_valid) { + prop->fw_boot_cpu_security_map = RREG32(cpu_security_boot_status_reg); + if (prop->fw_boot_cpu_security_map & + CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) + prop->hard_reset_done_by_fw = true; + + dev_dbg(hdev->dev, + "Firmware boot CPU security status %#x\n", + prop->fw_boot_cpu_security_map); + } + + dev_dbg(hdev->dev, "Firmware boot CPU hard-reset is %s\n", + prop->hard_reset_done_by_fw ? "enabled" : "disabled"); + if (rc) { detect_cpu_boot_status(hdev, status); rc = -EIO; @@ -796,21 +853,34 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, goto out; } + rc = fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg); + if (rc) + return rc; + + /* Clear reset status since we need to read again from app */ + prop->hard_reset_done_by_fw = false; + /* Read FW application security bits */ - if (hdev->asic_prop.fw_security_status_valid) { - hdev->asic_prop.fw_app_security_map = + if (prop->fw_security_status_valid) { + prop->fw_app_security_map = RREG32(cpu_security_boot_status_reg); - if (hdev->asic_prop.fw_app_security_map & + if (prop->fw_app_security_map & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) - hdev->asic_prop.hard_reset_done_by_fw = true; + prop->hard_reset_done_by_fw = true; + + dev_dbg(hdev->dev, + "Firmware application CPU security status %#x\n", + prop->fw_app_security_map); } - dev_dbg(hdev->dev, "Firmware hard-reset is %s\n", - hdev->asic_prop.hard_reset_done_by_fw ? "enabled" : "disabled"); + dev_dbg(hdev->dev, "Firmware application CPU hard-reset is %s\n", + prop->hard_reset_done_by_fw ? "enabled" : "disabled"); dev_info(hdev->dev, "Successfully loaded firmware to device\n"); + return 0; + out: fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg); |