From e89bd3615bc0883adc90209c1aac6d4bac7d221f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 30 Oct 2024 22:54:38 -0400 Subject: drm/amdgpu/mes: fetch fw version from firmware header We need this prior to the firmware being loaded so fetch from the header. v2: fetch directly from the firmware v3: store both fw versions Reviewed-by: Srinivasan Shanmugam Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 96788c0f42f1..0684e482a204 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -75,6 +75,7 @@ struct amdgpu_mes { uint32_t sched_version; uint32_t kiq_version; + uint32_t fw_version[AMDGPU_MAX_MES_PIPES]; bool enable_legacy_queue_map; uint32_t total_max_queue; -- cgit From f4a3246a2c7a595161f1ba11db53639b7f580104 Mon Sep 17 00:00:00 2001 From: chongli2 Date: Wed, 6 Nov 2024 11:43:09 +0800 Subject: drm/amdgpu: fix return random value when multiple threads read registers via mes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The currect code use the address "adev->mes.read_val_ptr" to store the value read from register via mes. So when multiple threads read register, multiple threads have to share the one address, and overwrite the value each other. Assign an address by "amdgpu_device_wb_get" to store register value. each thread will has an address to store register value. Signed-off-by: chongli2 Reviewed-by: Emily Deng Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 30 +++++++++++++----------------- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 3 --- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index b10383f83d73..f702b8b9c318 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -192,17 +192,6 @@ int amdgpu_mes_init(struct amdgpu_device *adev) (uint64_t *)&adev->wb.wb[adev->mes.query_status_fence_offs[i]]; } - r = amdgpu_device_wb_get(adev, &adev->mes.read_val_offs); - if (r) { - dev_err(adev->dev, - "(%d) read_val_offs alloc failed\n", r); - goto error; - } - adev->mes.read_val_gpu_addr = - adev->wb.gpu_addr + (adev->mes.read_val_offs * 4); - adev->mes.read_val_ptr = - (uint32_t *)&adev->wb.wb[adev->mes.read_val_offs]; - r = amdgpu_mes_doorbell_init(adev); if (r) goto error; @@ -223,8 +212,6 @@ error: amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs[i]); } - if (adev->mes.read_val_ptr) - amdgpu_device_wb_free(adev, adev->mes.read_val_offs); idr_destroy(&adev->mes.pasid_idr); idr_destroy(&adev->mes.gang_id_idr); @@ -249,8 +236,6 @@ void amdgpu_mes_fini(struct amdgpu_device *adev) amdgpu_device_wb_free(adev, adev->mes.query_status_fence_offs[i]); } - if (adev->mes.read_val_ptr) - amdgpu_device_wb_free(adev, adev->mes.read_val_offs); amdgpu_mes_doorbell_free(adev); @@ -921,10 +906,19 @@ uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg) { struct mes_misc_op_input op_input; int r, val = 0; + uint32_t addr_offset = 0; + uint64_t read_val_gpu_addr; + uint32_t *read_val_ptr; + if (amdgpu_device_wb_get(adev, &addr_offset)) { + DRM_ERROR("critical bug! too many mes readers\n"); + goto error; + } + read_val_gpu_addr = adev->wb.gpu_addr + (addr_offset * 4); + read_val_ptr = (uint32_t *)&adev->wb.wb[addr_offset]; op_input.op = MES_MISC_OP_READ_REG; op_input.read_reg.reg_offset = reg; - op_input.read_reg.buffer_addr = adev->mes.read_val_gpu_addr; + op_input.read_reg.buffer_addr = read_val_gpu_addr; if (!adev->mes.funcs->misc_op) { DRM_ERROR("mes rreg is not supported!\n"); @@ -935,9 +929,11 @@ uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg) if (r) DRM_ERROR("failed to read reg (0x%x)\n", reg); else - val = *(adev->mes.read_val_ptr); + val = *(read_val_ptr); error: + if (addr_offset) + amdgpu_device_wb_free(adev, addr_offset); return val; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 0684e482a204..129b8d1cacef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -120,9 +120,6 @@ struct amdgpu_mes { uint32_t query_status_fence_offs[AMDGPU_MAX_MES_PIPES]; uint64_t query_status_fence_gpu_addr[AMDGPU_MAX_MES_PIPES]; uint64_t *query_status_fence_ptr[AMDGPU_MAX_MES_PIPES]; - uint32_t read_val_offs; - uint64_t read_val_gpu_addr; - uint32_t *read_val_ptr; uint32_t saved_flags; -- cgit From 92fd1714ee3cef8ad9c466ced354ab0581ee3782 Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Thu, 17 Oct 2024 10:35:02 -0400 Subject: drm/amd/amdgpu: Increase MES log buffer to dump mes scratch data MES internal scratch data is useful for mes debug, it can only located in VRAM, change the allocation type and increase size for mes 11 Signed-off-by: shaoyunl Acked-by: Feifei Xu Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 12 +++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index f702b8b9c318..c9ec5c6cad2e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -104,7 +104,7 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev) return 0; r = amdgpu_bo_create_kernel(adev, adev->mes.event_log_size, PAGE_SIZE, - AMDGPU_GEM_DOMAIN_GTT, + AMDGPU_GEM_DOMAIN_VRAM, &adev->mes.event_log_gpu_obj, &adev->mes.event_log_gpu_addr, &adev->mes.event_log_cpu_addr); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 129b8d1cacef..0666ba91be15 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -40,6 +40,7 @@ #define AMDGPU_MES_VERSION_MASK 0x00000fff #define AMDGPU_MES_API_VERSION_MASK 0x00fff000 #define AMDGPU_MES_FEAT_VERSION_MASK 0xff000000 +#define AMDGPU_MES_MSCRATCH_SIZE 0x8000 enum amdgpu_mes_priority_level { AMDGPU_MES_PRIORITY_LEVEL_LOW = 0, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 0e758ebf2372..6f6133496944 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -908,6 +908,16 @@ static void mes_v11_0_enable(struct amdgpu_device *adev, bool enable) uint32_t pipe, data = 0; if (enable) { + if (amdgpu_mes_log_enable) { + WREG32_SOC15(GC, 0, regCP_MES_MSCRATCH_LO, + lower_32_bits(adev->mes.event_log_gpu_addr + AMDGPU_MES_LOG_BUFFER_SIZE)); + WREG32_SOC15(GC, 0, regCP_MES_MSCRATCH_HI, + upper_32_bits(adev->mes.event_log_gpu_addr + AMDGPU_MES_LOG_BUFFER_SIZE)); + dev_info(adev->dev, "Setup CP MES MSCRATCH address : 0x%x. 0x%x\n", + RREG32_SOC15(GC, 0, regCP_MES_MSCRATCH_HI), + RREG32_SOC15(GC, 0, regCP_MES_MSCRATCH_LO)); + } + data = RREG32_SOC15(GC, 0, regCP_MES_CNTL); data = REG_SET_FIELD(data, CP_MES_CNTL, MES_PIPE0_RESET, 1); data = REG_SET_FIELD(data, CP_MES_CNTL, @@ -1370,7 +1380,7 @@ static int mes_v11_0_sw_init(struct amdgpu_ip_block *ip_block) adev->mes.kiq_hw_init = &mes_v11_0_kiq_hw_init; adev->mes.kiq_hw_fini = &mes_v11_0_kiq_hw_fini; - adev->mes.event_log_size = AMDGPU_MES_LOG_BUFFER_SIZE; + adev->mes.event_log_size = AMDGPU_MES_LOG_BUFFER_SIZE + AMDGPU_MES_MSCRATCH_SIZE; r = amdgpu_mes_init(adev); if (r) -- cgit From 8521e3c5f0585cad3e73e4ba73535dc274e7eba6 Mon Sep 17 00:00:00 2001 From: Shaoyun Liu Date: Wed, 23 Oct 2024 11:12:00 -0400 Subject: drm/amd/amdgpu: limit single process inside MES This is for MES to limit only one process for the user queues Signed-off-by: Shaoyun Liu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 23 +++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 19 +++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 15 +++++++++++++++ drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 11 +++++++++++ 5 files changed, 70 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 76093793839e..f57cc72c43cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1598,9 +1598,11 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, if (adev->enforce_isolation[i] && !partition_values[i]) { /* Going from enabled to disabled */ amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(i)); + amdgpu_mes_set_enforce_isolation(adev, i, false); } else if (!adev->enforce_isolation[i] && partition_values[i]) { /* Going from disabled to enabled */ amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i)); + amdgpu_mes_set_enforce_isolation(adev, i, true); } adev->enforce_isolation[i] = partition_values[i]; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index c9ec5c6cad2e..59ec20b07a6a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -1678,6 +1678,29 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev) return is_supported; } +/* Fix me -- node_id is used to identify the correct MES instances in the future */ +int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable) +{ + struct mes_misc_op_input op_input = {0}; + int r; + + op_input.op = MES_MISC_OP_CHANGE_CONFIG; + op_input.change_config.option.limit_single_process = enable ? 1 : 0; + + if (!adev->mes.funcs->misc_op) { + dev_err(adev->dev, "mes change config is not supported!\n"); + r = -EINVAL; + goto error; + } + + r = adev->mes.funcs->misc_op(&adev->mes, &op_input); + if (r) + dev_err(adev->dev, "failed to change_config.\n"); + +error: + return r; +} + #if defined(CONFIG_DEBUG_FS) static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 0666ba91be15..c6f93cbd6739 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -309,6 +309,7 @@ enum mes_misc_opcode { MES_MISC_OP_WRM_REG_WAIT, MES_MISC_OP_WRM_REG_WR_WAIT, MES_MISC_OP_SET_SHADER_DEBUGGER, + MES_MISC_OP_CHANGE_CONFIG, }; struct mes_misc_op_input { @@ -347,6 +348,21 @@ struct mes_misc_op_input { uint32_t tcp_watch_cntl[4]; uint32_t trap_en; } set_shader_debugger; + + struct { + union { + struct { + uint32_t limit_single_process : 1; + uint32_t enable_hws_logging_buffer : 1; + uint32_t reserved : 30; + }; + uint32_t all; + } option; + struct { + uint32_t tdr_level; + uint32_t tdr_delay; + } tdr_config; + } change_config; }; }; @@ -517,4 +533,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes) } bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev); + +int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable); + #endif /* __AMDGPU_MES_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 6f6133496944..9c905b9e9376 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -644,6 +644,18 @@ static int mes_v11_0_misc_op(struct amdgpu_mes *mes, sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl)); misc_pkt.set_shader_debugger.trap_en = input->set_shader_debugger.trap_en; break; + case MES_MISC_OP_CHANGE_CONFIG: + if ((mes->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) < 0x63) { + dev_err(mes->adev->dev, "MES FW versoin must be larger than 0x63 to support limit single process feature.\n"); + return -EINVAL; + } + misc_pkt.opcode = MESAPI_MISC__CHANGE_CONFIG; + misc_pkt.change_config.opcode = + MESAPI_MISC__CHANGE_CONFIG_OPTION_LIMIT_SINGLE_PROCESS; + misc_pkt.change_config.option.bits.limit_single_process = + input->change_config.option.limit_single_process; + break; + default: DRM_ERROR("unsupported misc op (%d) \n", input->op); return -EINVAL; @@ -708,6 +720,9 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) mes->event_log_gpu_addr; } + if (enforce_isolation) + mes_set_hw_res_pkt.limit_single_process = 1; + return mes_v11_0_submit_pkt_and_poll_completion(mes, &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), offsetof(union MESAPI_SET_HW_RESOURCES, api_status)); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 0c401a5e2fb8..9ecc5d61e49b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -531,6 +531,14 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes, sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl)); misc_pkt.set_shader_debugger.trap_en = input->set_shader_debugger.trap_en; break; + case MES_MISC_OP_CHANGE_CONFIG: + misc_pkt.opcode = MESAPI_MISC__CHANGE_CONFIG; + misc_pkt.change_config.opcode = + MESAPI_MISC__CHANGE_CONFIG_OPTION_LIMIT_SINGLE_PROCESS; + misc_pkt.change_config.option.bits.limit_single_process = + input->change_config.option.limit_single_process; + break; + default: DRM_ERROR("unsupported misc op (%d) \n", input->op); return -EINVAL; @@ -624,6 +632,9 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe) mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr + pipe * AMDGPU_MES_LOG_BUFFER_SIZE; } + if (enforce_isolation) + mes_set_hw_res_pkt.limit_single_process = 1; + return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), offsetof(union MESAPI_SET_HW_RESOURCES, api_status)); -- cgit