author		Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>	2023-11-17 14:51:47 -0800
committer	Rodrigo Vivi <rodrigo.vivi@intel.com>				2023-12-21 11:45:06 -0500
commit		dd0e89e5edc20d3875ed7ded48e7e97118cdfbc8 (patch)
tree		eabea4b033dbcaa883308b551b395ec4142e3b1d
parent		985d5a49e8454d64a01ab362e9091788eeed1839 (diff)
drm/xe/gsc: GSC FW load
The GSC FW must be copied into a 4MB stolen memory allocation, whose GGTT
address is then passed as a parameter to a dedicated load instruction
submitted via the GSC engine.
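Concretely, the load request is a four-dword GSCCS batch (see
emit_gsc_upload() in the diff below): the GSC_FW_LOAD header, the lower and
upper halves of the buffer's GGTT offset, and the allocation size in 4K
pages with the limit-valid bit set:

	bb->cs[bb->len++] = GSC_FW_LOAD;
	bb->cs[bb->len++] = lower_32_bits(offset);
	bb->cs[bb->len++] = upper_32_bits(offset);
	bb->cs[bb->len++] = (gsc->private->size / SZ_4K) | GSC_FW_LOAD_LIMIT_VALID;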
Since the GSC load is relatively slow (up to 250ms), we perform it
asynchronously via a worker. This requires us to make sure that the
worker has stopped before suspending/unloading.
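The async pattern here is a standard ordered-workqueue one:
xe_gsc_load_start() queues gsc->work on gsc->wq, and
xe_gsc_wait_for_worker_completion() flushes it on the suspend/unload paths:

	/* xe_gsc_load_start() */
	queue_work(gsc->wq, &gsc->work);

	/* xe_gsc_wait_for_worker_completion() */
	if (xe_uc_fw_is_loadable(&gsc->fw) && gsc->wq)
		flush_work(&gsc->work);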
Note that we can't yet use xe_migrate_copy for the copy because it
doesn't work with stolen memory right now, so we do a memcpy from the
CPU side instead.
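The interim CPU path (memcpy_fw() in the diff) stages the blob through a
kmalloc'd bounce buffer using the xe_map helpers and zeroes the remainder of
the 4MB allocation:

	xe_map_memcpy_from(xe, storage, &gsc->fw.bo->vmap, 0, fw_size);
	xe_map_memcpy_to(xe, &gsc->private->vmap, 0, storage, fw_size);
	xe_map_memset(xe, &gsc->private->vmap, fw_size, 0, gsc->private->size - fw_size);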
v2: add comment about timeout value, fix GSC status checking
before load (John)
Bspec: 65306, 65346
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Alan Previn <alan.previn.teres.alexis@intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
 drivers/gpu/drm/xe/instructions/xe_gsc_commands.h |  34
 drivers/gpu/drm/xe/instructions/xe_instr_defs.h   |   1
 drivers/gpu/drm/xe/regs/xe_gsc_regs.h             |  29
 drivers/gpu/drm/xe/xe_gsc.c                       | 250
 drivers/gpu/drm/xe/xe_gsc.h                       |   3
 drivers/gpu/drm/xe/xe_gsc_types.h                 |  17
 drivers/gpu/drm/xe/xe_uc.c                        |  12
 7 files changed, 345 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/xe/instructions/xe_gsc_commands.h b/drivers/gpu/drm/xe/instructions/xe_gsc_commands.h
new file mode 100644
index 000000000000..c7a833d7f965
--- /dev/null
+++ b/drivers/gpu/drm/xe/instructions/xe_gsc_commands.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+
+#ifndef _XE_GSC_COMMANDS_H_
+#define _XE_GSC_COMMANDS_H_
+
+#include "instructions/xe_instr_defs.h"
+
+/*
+ * All GSCCS-specific commands have fixed length, so we can include it in the
+ * defines. Note that the generic GSC command header structure includes an
+ * optional data field in bits 9-21, but there are no commands that actually use
+ * it; some of the commands are instead defined as having an extended length
+ * field spanning bits 0-15, even if the extra bits are not required because the
+ * longest GSCCS command is only 8 dwords. To handle this, the defines below use
+ * a single field for both data and len. If we ever get a command that does
+ * actually have data and this approach doesn't work for it we can re-work it
+ * at that point.
+ */
+
+#define GSC_OPCODE			REG_GENMASK(28, 22)
+#define GSC_CMD_DATA_AND_LEN		REG_GENMASK(21, 0)
+
+#define __GSC_INSTR(op, dl) \
+	(XE_INSTR_GSC | \
+	 REG_FIELD_PREP(GSC_OPCODE, op) | \
+	 REG_FIELD_PREP(GSC_CMD_DATA_AND_LEN, dl))
+
+#define GSC_FW_LOAD			__GSC_INSTR(1, 2)
+#define   GSC_FW_LOAD_LIMIT_VALID	REG_BIT(31)
+
+#endif
diff --git a/drivers/gpu/drm/xe/instructions/xe_instr_defs.h b/drivers/gpu/drm/xe/instructions/xe_instr_defs.h
index e403b4fcc20a..04179b2a48e1 100644
--- a/drivers/gpu/drm/xe/instructions/xe_instr_defs.h
+++ b/drivers/gpu/drm/xe/instructions/xe_instr_defs.h
@@ -15,6 +15,7 @@
  */
 #define XE_INSTR_CMD_TYPE		GENMASK(31, 29)
 #define XE_INSTR_MI			REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x0)
+#define XE_INSTR_GSC			REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x2)
 #define XE_INSTR_GFXPIPE		REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x3)
 
 /*
diff --git a/drivers/gpu/drm/xe/regs/xe_gsc_regs.h b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h
new file mode 100644
index 000000000000..22d2ad9cb64d
--- /dev/null
+++ b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+
+#ifndef _XE_GSC_REGS_H_
+#define _XE_GSC_REGS_H_
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+#include "regs/xe_reg_defs.h"
+
+/* Definitions of GSC H/W registers, bits, etc */
+
+#define MTL_GSC_HECI1_BASE		0x00116000
+#define MTL_GSC_HECI2_BASE		0x00117000
+
+/*
+ * The FWSTS register values are FW defined and can be different between
+ * HECI1 and HECI2
+ */
+#define HECI_FWSTS1(base)		XE_REG((base) + 0xc40)
+#define   HECI1_FWSTS1_CURRENT_STATE		REG_GENMASK(3, 0)
+#define   HECI1_FWSTS1_CURRENT_STATE_RESET	0
+#define   HECI1_FWSTS1_PROXY_STATE_NORMAL	5
+#define   HECI1_FWSTS1_INIT_COMPLETE		REG_BIT(9)
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_gsc.c b/drivers/gpu/drm/xe/xe_gsc.c
index 216f36cee0f7..e014b829bc8a 100644
--- a/drivers/gpu/drm/xe/xe_gsc.c
+++ b/drivers/gpu/drm/xe/xe_gsc.c
@@ -5,10 +5,20 @@
 
 #include "xe_gsc.h"
 
+#include <drm/drm_managed.h>
+
+#include "xe_bb.h"
+#include "xe_bo.h"
 #include "xe_device.h"
+#include "xe_exec_queue.h"
 #include "xe_gt.h"
 #include "xe_gt_printk.h"
+#include "xe_map.h"
+#include "xe_mmio.h"
+#include "xe_sched_job.h"
 #include "xe_uc_fw.h"
+#include "instructions/xe_gsc_commands.h"
+#include "regs/xe_gsc_regs.h"
 
 static struct xe_gt *
 gsc_to_gt(struct xe_gsc *gsc)
@@ -16,6 +26,145 @@ gsc_to_gt(struct xe_gsc *gsc)
 	return container_of(gsc, struct xe_gt, uc.gsc);
 }
 
+static int memcpy_fw(struct xe_gsc *gsc)
+{
+	struct xe_gt *gt = gsc_to_gt(gsc);
+	struct xe_device *xe = gt_to_xe(gt);
+	u32 fw_size = gsc->fw.size;
+	void *storage;
+
+	/*
+	 * FIXME: xe_migrate_copy does not work with stolen mem yet, so we use
+	 * a memcpy for now.
+	 */
+	storage = kmalloc(fw_size, GFP_KERNEL);
+	if (!storage)
+		return -ENOMEM;
+
+	xe_map_memcpy_from(xe, storage, &gsc->fw.bo->vmap, 0, fw_size);
+	xe_map_memcpy_to(xe, &gsc->private->vmap, 0, storage, fw_size);
+	xe_map_memset(xe, &gsc->private->vmap, fw_size, 0, gsc->private->size - fw_size);
+
+	kfree(storage);
+
+	return 0;
+}
+
+static int emit_gsc_upload(struct xe_gsc *gsc)
+{
+	struct xe_gt *gt = gsc_to_gt(gsc);
+	u64 offset = xe_bo_ggtt_addr(gsc->private);
+	struct xe_bb *bb;
+	struct xe_sched_job *job;
+	struct dma_fence *fence;
+	long timeout;
+
+	bb = xe_bb_new(gt, 4, false);
+	if (IS_ERR(bb))
+		return PTR_ERR(bb);
+
+	bb->cs[bb->len++] = GSC_FW_LOAD;
+	bb->cs[bb->len++] = lower_32_bits(offset);
+	bb->cs[bb->len++] = upper_32_bits(offset);
+	bb->cs[bb->len++] = (gsc->private->size / SZ_4K) | GSC_FW_LOAD_LIMIT_VALID;
+
+	job = xe_bb_create_job(gsc->q, bb);
+	if (IS_ERR(job)) {
+		xe_bb_free(bb, NULL);
+		return PTR_ERR(job);
+	}
+
+	xe_sched_job_arm(job);
+	fence = dma_fence_get(&job->drm.s_fence->finished);
+	xe_sched_job_push(job);
+
+	timeout = dma_fence_wait_timeout(fence, false, HZ);
+	dma_fence_put(fence);
+	xe_bb_free(bb, NULL);
+	if (timeout < 0)
+		return timeout;
+	else if (!timeout)
+		return -ETIME;
+
+	return 0;
+}
+
+static int gsc_fw_is_loaded(struct xe_gt *gt)
+{
+	return xe_mmio_read32(gt, HECI_FWSTS1(MTL_GSC_HECI1_BASE)) &
+	       HECI1_FWSTS1_INIT_COMPLETE;
+}
+
+static int gsc_fw_wait(struct xe_gt *gt)
+{
+	/*
+	 * GSC load can take up to 250ms from the moment the instruction is
+	 * executed by the GSCCS. To account for possible submission delays or
+	 * other issues, we use a 500ms timeout in the wait here.
+	 */
+	return xe_mmio_wait32(gt, HECI_FWSTS1(MTL_GSC_HECI1_BASE),
+			      HECI1_FWSTS1_INIT_COMPLETE,
+			      HECI1_FWSTS1_INIT_COMPLETE,
+			      500 * USEC_PER_MSEC, NULL, false);
+}
+
+static int gsc_upload(struct xe_gsc *gsc)
+{
+	struct xe_gt *gt = gsc_to_gt(gsc);
+	struct xe_device *xe = gt_to_xe(gt);
+	int err;
+
+	/* we should only be here if the init step was successful */
+	xe_assert(xe, xe_uc_fw_is_loadable(&gsc->fw) && gsc->q);
+
+	if (gsc_fw_is_loaded(gt)) {
+		xe_gt_err(gt, "GSC already loaded at upload time\n");
+		return -EEXIST;
+	}
+
+	err = memcpy_fw(gsc);
+	if (err) {
+		xe_gt_err(gt, "Failed to memcpy GSC FW\n");
+		return err;
+	}
+
+	err = emit_gsc_upload(gsc);
+	if (err) {
+		xe_gt_err(gt, "Failed to emit GSC FW upload (%pe)\n", ERR_PTR(err));
+		return err;
+	}
+
+	err = gsc_fw_wait(gt);
+	if (err) {
+		xe_gt_err(gt, "Failed to wait for GSC load (%pe)\n", ERR_PTR(err));
+		return err;
+	}
+
+	xe_gt_dbg(gt, "GSC FW async load completed\n");
+
+	return 0;
+}
+
+static void gsc_work(struct work_struct *work)
+{
+	struct xe_gsc *gsc = container_of(work, typeof(*gsc), work);
+	struct xe_gt *gt = gsc_to_gt(gsc);
+	struct xe_device *xe = gt_to_xe(gt);
+	int ret;
+
+	xe_device_mem_access_get(xe);
+	xe_force_wake_get(gt_to_fw(gt), XE_FW_GSC);
+
+	ret = gsc_upload(gsc);
+	if (ret && ret != -EEXIST)
+		xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_LOAD_FAIL);
+	else
+		xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_TRANSFERRED);
+
+	xe_force_wake_put(gt_to_fw(gt), XE_FW_GSC);
+	xe_device_mem_access_put(xe);
+}
+
 int xe_gsc_init(struct xe_gsc *gsc)
 {
 	struct xe_gt *gt = gsc_to_gt(gsc);
@@ -23,6 +172,7 @@ int xe_gsc_init(struct xe_gsc *gsc)
 	int ret;
 
 	gsc->fw.type = XE_UC_FW_TYPE_GSC;
+	INIT_WORK(&gsc->work, gsc_work);
 
 	/* The GSC uC is only available on the media GT */
 	if (tile->media_gt && (gt != tile->media_gt)) {
@@ -50,3 +200,103 @@ out:
 
 	return ret;
 }
+
+static void free_resources(struct drm_device *drm, void *arg)
+{
+	struct xe_gsc *gsc = arg;
+
+	if (gsc->wq) {
+		destroy_workqueue(gsc->wq);
+		gsc->wq = NULL;
+	}
+
+	if (gsc->q) {
+		xe_exec_queue_put(gsc->q);
+		gsc->q = NULL;
+	}
+
+	if (gsc->private) {
+		xe_bo_unpin_map_no_vm(gsc->private);
+		gsc->private = NULL;
+	}
+}
+
+int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc)
+{
+	struct xe_gt *gt = gsc_to_gt(gsc);
+	struct xe_tile *tile = gt_to_tile(gt);
+	struct xe_device *xe = gt_to_xe(gt);
+	struct xe_hw_engine *hwe = xe_gt_hw_engine(gt, XE_ENGINE_CLASS_OTHER, 0, true);
+	struct xe_exec_queue *q;
+	struct workqueue_struct *wq;
+	struct xe_bo *bo;
+	int err;
+
+	if (!xe_uc_fw_is_available(&gsc->fw))
+		return 0;
+
+	if (!hwe)
+		return -ENODEV;
+
+	bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4M,
+				  ttm_bo_type_kernel,
+				  XE_BO_CREATE_STOLEN_BIT |
+				  XE_BO_CREATE_GGTT_BIT);
+	if (IS_ERR(bo))
+		return PTR_ERR(bo);
+
+	q = xe_exec_queue_create(xe, NULL,
+				 BIT(hwe->logical_instance), 1, hwe,
+				 EXEC_QUEUE_FLAG_KERNEL |
+				 EXEC_QUEUE_FLAG_PERMANENT);
+	if (IS_ERR(q)) {
+		xe_gt_err(gt, "Failed to create queue for GSC submission\n");
+		err = PTR_ERR(q);
+		goto out_bo;
+	}
+
+	wq = alloc_ordered_workqueue("gsc-ordered-wq", 0);
+	if (!wq) {
+		err = -ENOMEM;
+		goto out_q;
+	}
+
+	gsc->private = bo;
+	gsc->q = q;
+	gsc->wq = wq;
+
+	err = drmm_add_action_or_reset(&xe->drm, free_resources, gsc);
+	if (err)
+		return err;
+
+	xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_LOADABLE);
+
+	return 0;
+
+out_q:
+	xe_exec_queue_put(q);
+out_bo:
+	xe_bo_unpin_map_no_vm(bo);
+	return err;
+}
+
+void xe_gsc_load_start(struct xe_gsc *gsc)
+{
+	struct xe_gt *gt = gsc_to_gt(gsc);
+
+	if (!xe_uc_fw_is_loadable(&gsc->fw) || !gsc->q)
+		return;
+
+	/* GSC FW survives GT reset and D3Hot */
+	if (gsc_fw_is_loaded(gt)) {
+		xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_TRANSFERRED);
+		return;
+	}
+
+	queue_work(gsc->wq, &gsc->work);
+}
+
+void xe_gsc_wait_for_worker_completion(struct xe_gsc *gsc)
+{
+	if (xe_uc_fw_is_loadable(&gsc->fw) && gsc->wq)
+		flush_work(&gsc->work);
+}
diff --git a/drivers/gpu/drm/xe/xe_gsc.h b/drivers/gpu/drm/xe/xe_gsc.h
index baa7f21f4204..f870eddc77d4 100644
--- a/drivers/gpu/drm/xe/xe_gsc.h
+++ b/drivers/gpu/drm/xe/xe_gsc.h
@@ -9,5 +9,8 @@
 #include "xe_gsc_types.h"
 
 int xe_gsc_init(struct xe_gsc *gsc);
+int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc);
+void xe_gsc_wait_for_worker_completion(struct xe_gsc *gsc);
+void xe_gsc_load_start(struct xe_gsc *gsc);
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_gsc_types.h b/drivers/gpu/drm/xe/xe_gsc_types.h
index 1bc50583fe58..57fefd66a7ea 100644
--- a/drivers/gpu/drm/xe/xe_gsc_types.h
+++ b/drivers/gpu/drm/xe/xe_gsc_types.h
@@ -6,8 +6,13 @@
 #ifndef _XE_GSC_TYPES_H_
 #define _XE_GSC_TYPES_H_
 
+#include <linux/workqueue.h>
+
 #include "xe_uc_fw_types.h"
 
+struct xe_bo;
+struct xe_exec_queue;
+
 /**
  * struct xe_gsc - GSC
  */
@@ -17,6 +22,18 @@ struct xe_gsc {
 
 	/** @security_version: SVN found in the fetched blob */
 	u32 security_version;
+
+	/** @private: Private data for use by the GSC FW */
+	struct xe_bo *private;
+
+	/** @q: Default queue used for submissions to GSC FW */
+	struct xe_exec_queue *q;
+
+	/** @wq: workqueue to handle jobs for delayed load and proxy handling */
+	struct workqueue_struct *wq;
+
+	/** @work: delayed load and proxy handling work */
+	struct work_struct work;
 };
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c
index b67154c78dff..15dcd1f91e9c 100644
--- a/drivers/gpu/drm/xe/xe_uc.c
+++ b/drivers/gpu/drm/xe/xe_uc.c
@@ -74,11 +74,17 @@ err:
  */
 int xe_uc_init_post_hwconfig(struct xe_uc *uc)
 {
+	int err;
+
 	/* GuC submission not enabled, nothing to do */
 	if (!xe_device_uc_enabled(uc_to_xe(uc)))
 		return 0;
 
-	return xe_guc_init_post_hwconfig(&uc->guc);
+	err = xe_guc_init_post_hwconfig(&uc->guc);
+	if (err)
+		return err;
+
+	return xe_gsc_init_post_hwconfig(&uc->gsc);
 }
 
 static int uc_reset(struct xe_uc *uc)
@@ -173,6 +179,9 @@ int xe_uc_init_hw(struct xe_uc *uc)
 	ret = xe_huc_auth(&uc->huc);
 	xe_gt_assert(uc_to_gt(uc), !ret);
 
+	/* GSC load is async */
+	xe_gsc_load_start(&uc->gsc);
+
 	return 0;
 }
 
@@ -197,6 +206,7 @@ void xe_uc_gucrc_disable(struct xe_uc *uc)
 
 void xe_uc_stop_prepare(struct xe_uc *uc)
 {
+	xe_gsc_wait_for_worker_completion(&uc->gsc);
 	xe_guc_stop_prepare(&uc->guc);
 }