diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
25 files changed, 4264 insertions, 599 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 7bb0bc0ca3d6..a317e76ffb5e 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -1,4 +1,24 @@ -# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2017 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# # # Makefile for Heterogenous System Architecture support for AMD GPU devices # @@ -15,6 +35,8 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ kfd_process_queue_manager.o kfd_device_queue_manager.o \ kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ - kfd_dbgdev.o kfd_dbgmgr.o + kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o + +amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o obj-$(CONFIG_HSA_AMD) += amdkfd.o diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm new file mode 100644 index 000000000000..997a383dcb8b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm @@ -0,0 +1,1384 @@ +/* + * Copyright 2015-2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#if 0 +HW (VI) source code for CWSR trap handler +#Version 18 + multiple trap handler + +// this performance-optimal version was originally from Seven Xu at SRDC + +// Revison #18 --... +/* Rev History +** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) +** #4. SR Memory Layout: +** 1. VGPR-SGPR-HWREG-{LDS} +** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. +** #5. Update: 1. Accurate g8sr_ts_save_d timestamp +** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) +** #7. Update: 1. don't barrier if noLDS +** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version +** 2. Fix SQ issue by s_sleep 2 +** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last +** 2. optimize s_buffer save by burst 16sgprs... +** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. +** #11. Update 1. Add 2 more timestamp for debug version +** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance +** #13. Integ 1. Always use MUBUF for PV trap shader... +** #14. Update 1. s_buffer_store soft clause... +** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. +** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree +** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] +** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... +** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 +** 2. FUNC - Handle non-CWSR traps +*/ + +var G8SR_WDMEM_HWREG_OFFSET = 0 +var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes + +// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. + +var G8SR_DEBUG_TIMESTAMP = 0 +var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset +var s_g8sr_ts_save_s = s[34:35] // save start +var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi +var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ +var s_g8sr_ts_save_d = s[40:41] // save end +var s_g8sr_ts_restore_s = s[42:43] // restore start +var s_g8sr_ts_restore_d = s[44:45] // restore end + +var G8SR_VGPR_SR_IN_DWX4 = 0 +var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes +var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 + + +/*************************************************************************/ +/* control on how to run the shader */ +/*************************************************************************/ +//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) +var EMU_RUN_HACK = 0 +var EMU_RUN_HACK_RESTORE_NORMAL = 0 +var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 +var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var SAVE_LDS = 1 +var WG_BASE_ADDR_LO = 0x9000a000 +var WG_BASE_ADDR_HI = 0x0 +var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem +var CTX_SAVE_CONTROL = 0x0 +var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) +var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes +var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing + +/**************************************************************************/ +/* variables */ +/**************************************************************************/ +var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 +var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits + +var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask +var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 +var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 + +var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME +var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME + +var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 + + +/* Save */ +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE + +var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi + + //tba_lo and tba_hi need to be saved/restored +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3??h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_status = ttmp4 +var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_save_xnack_mask_lo = ttmp6 +var s_save_xnack_mask_hi = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 + +var s_save_mem_offset = tma_lo +var s_save_alloc_size = s_save_trapsts //conflict +var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) +var s_save_m0 = tma_hi + +/* Restore */ +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT +var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi + +var s_restore_mem_offset = ttmp2 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored +var s_restore_mem_offset_save = s_restore_tmp //no conflict + +var s_restore_m0 = s_restore_alloc_size //no conflict + +var s_restore_mode = ttmp7 + +var s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = tma_lo //no conflict +var s_restore_exec_hi = tma_hi //no conflict +var s_restore_status = ttmp4 +var s_restore_trapsts = ttmp5 +var s_restore_xnack_mask_lo = xnack_mask_lo +var s_restore_xnack_mask_hi = xnack_mask_hi +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 + +/**************************************************************************/ +/* trap handler entry points */ +/**************************************************************************/ +/* Shader Main*/ + +shader main + asic(VI) + type(CS) + + + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore + //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC + s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. + s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE + //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE + s_branch L_SKIP_RESTORE //NOT restore, SAVE actually + else + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + end + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE //restore + +L_SKIP_RESTORE: + + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_cbranch_scc1 L_SAVE //this is the operation for save + + // ********* Handle non-CWSR traps ******************* +if (!EMU_RUN_HACK) + /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ + s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 + s_waitcnt lgkmcnt(0) + s_or_b32 ttmp7, ttmp8, ttmp9 + s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) + s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler + +L_NO_NEXT_TRAP: + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. + s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 + s_addc_u32 ttmp1, ttmp1, 0 +L_EXCP_CASE: + s_and_b32 ttmp1, ttmp1, 0xFFFF + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) + s_rfe_b64 [ttmp0, ttmp1] +end + // ********* End handling of non-CWSR traps ******************* + +/**************************************************************************/ +/* save routine */ +/**************************************************************************/ + +L_SAVE: + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? +end + + //check whether there is mem_viol + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_NO_PC_REWIND + + //if so, need rewind PC assuming GDS operation gets NACKed + s_mov_b32 s_save_tmp, 0 //clear mem_viol bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 + s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc + +L_NO_PC_REWIND: + s_mov_b32 s_save_tmp, 0 //clear saveCtx bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + + s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK + s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG + + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_sq_save_msg + s_waitcnt lgkmcnt(0) +end + + if (EMU_RUN_HACK) + + else + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC + end + + L_SLEEP: + s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 + + if (EMU_RUN_HACK) + + else + s_cbranch_execz L_SLEEP + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_spi_wrexec + s_waitcnt lgkmcnt(0) +end + + /* setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_save_tmp, v9, 0 + s_lshr_b32 s_save_tmp, s_save_tmp, 6 + s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + + + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE + + //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) + s_mov_b32 s_save_m0, m0 //save M0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 + + + + + /* save HW registers */ + ////////////////////////////// + + L_SAVE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + + + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO + s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI + end + + write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC + write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC + write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS + + write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO + write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO + write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI + + + + /* the first wave in the threadgroup */ + // save fist_wave bits in tba_hi unused bit.26 + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit + //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] + s_mov_b32 s_save_exec_hi, 0x0 + s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] + + + /* save SGPRs */ + // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 + //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 + s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 + s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset + s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + L_SAVE_SGPR_LOOP: + // SGPR is allocated in 16 SGPR granularity + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] + s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] + + write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 16 //next sgpr index + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? + // restore s_save_buf_rsrc0,1 + //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo + s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo + + + + + /* save first 4 VGPR, then LDS save could use */ + // each wave will alloc 4 vgprs at least... + ///////////////////////////////////////////////////////////////////////////////////// + + s_mov_b32 s_save_mem_offset, 0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +end + + + + /* save LDS */ + ////////////////////////////// + + L_SAVE_LDS: + + // Change EXEC to all threads... + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE + + s_barrier //LDS is used? wait for other waves in the same TG + //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_cbranch_scc0 L_SAVE_LDS_DONE + + // first wave do LDS save; + + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() + + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + +var LDS_DMA_ENABLE = 0 +var UNROLL = 0 +if UNROLL==0 && LDS_DMA_ENABLE==1 + s_mov_b32 s3, 256*2 + s_nop 0 + s_nop 0 + s_nop 0 + L_SAVE_LDS_LOOP: + //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? + if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + end + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? + +elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss + // store from higest LDS address to lowest + s_mov_b32 s3, 256*2 + s_sub_u32 m0, s_save_alloc_size, s3 + s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 + s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... + s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest + s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction + s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc + s_nop 0 + s_nop 0 + s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes + s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved + s_add_u32 s0, s0,s_save_alloc_size + s_addc_u32 s1, s1, 0 + s_setpc_b64 s[0:1] + + + for var i =0; i< 128; i++ + // be careful to make here a 64Byte aligned address, which could improve performance... + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + + if i!=127 + s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline + s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 + end + end + +else // BUFFER_STORE + v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 + v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid + v_mul_i32_i24 v2, v3, 8 // tid*8 + v_mov_b32 v3, 256*2 + s_mov_b32 m0, 0x10000 + s_mov_b32 s0, s_save_buf_rsrc3 + s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT + +L_SAVE_LDS_LOOP_VECTOR: + ds_read_b64 v[0:1], v2 //x =LDS[a], byte address + s_waitcnt lgkmcnt(0) + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 +// s_waitcnt vmcnt(0) + v_add_u32 v2, vcc[0:1], v2, v3 + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size + s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR + + // restore rsrc3 + s_mov_b32 s_save_buf_rsrc3, s0 + +end + +L_SAVE_LDS_DONE: + + + /* save VGPRs - set the Rest VGPRs */ + ////////////////////////////////////////////////////////////////////////////////////// + L_SAVE_VGPR: + // VGPR SR memory offset: 0 + // TODO rearrange the RSRC words to use swizzle for VGPR save... + + s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, 4 // skip first 4 VGPRs + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs + + s_set_gpr_idx_on m0, 0x1 // This will change M0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 +L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 // v0 = v[0+m0] + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 m0, m0, 4 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +L_SAVE_VGPR_LOOP_END: + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =0 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_END + + + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later + + L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 //v0 = v[0+m0] + v_mov_b32 v1, v1 //v0 = v[0+m0] + v_mov_b32 v2, v2 //v0 = v[0+m0] + v_mov_b32 v3, v3 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + end + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +end + +L_SAVE_VGPR_END: + + + + + + + /* S_PGM_END_SAVED */ //FIXME graphics ONLY + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_rfe_b64 s_save_pc_lo //Return to the main shader program + else + end + +// Save Done timestamp +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_d + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // Need reset rsrc2?? + s_mov_b32 m0, s_save_mem_offset + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 +end + + + s_branch L_END_PGM + + + +/**************************************************************************/ +/* restore routine */ +/**************************************************************************/ + +L_RESTORE: + /* Setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_restore_tmp, v9, 0 + s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 + s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE + s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL + else + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... + s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] + s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. +end + + + + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE + + /* global mem offset */ +// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 + + /* the first wave in the threadgroup */ + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_RESTORE_VGPR + + /* restore LDS */ + ////////////////////////////// + L_RESTORE_LDS: + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? + + + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + L_RESTORE_LDS_LOOP: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW + end + s_add_u32 m0, m0, 256*2 // 128 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? + + + /* restore VGPRs */ + ////////////////////////////// + L_RESTORE_VGPR: + // VGPR SR memory offset : 0 + s_mov_b32 s_restore_mem_offset, 0x0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + +if G8SR_VGPR_SR_IN_DWX4 + get_vgpr_size_bytes(s_restore_mem_offset) + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, s_restore_alloc_size + s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 + +L_RESTORE_VGPR_LOOP: + buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + s_waitcnt vmcnt(0) + s_sub_u32 m0, m0, 4 + v_mov_b32 v0, v0 // v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_cmp_eq_u32 m0, 0x8000 + s_cbranch_scc0 L_RESTORE_VGPR_LOOP + s_set_gpr_idx_off + + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes + +else + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_mov_b32 m0, 4 //VGPR initial index value = 1 + s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later + + L_RESTORE_VGPR_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 + end + s_waitcnt vmcnt(0) //ensure data ready + v_mov_b32 v0, v0 //v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? + s_set_gpr_idx_off + /* VGPR restore on v0 */ + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 + end + +end + + /* restore SGPRs */ + ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111), + However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG + */ + s_mov_b32 m0, s_restore_alloc_size + + L_RESTORE_SGPR_LOOP: + read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made + s_waitcnt lgkmcnt(0) //ensure data ready + + s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + s_movreld_b64 s4, s4 + s_movreld_b64 s6, s6 + s_movreld_b64 s8, s8 + s_movreld_b64 s10, s10 + s_movreld_b64 s12, s12 + s_movreld_b64 s14, s14 + + s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? + + /* restore HW registers */ + ////////////////////////////// + L_RESTORE_HWREG: + + +if G8SR_DEBUG_TIMESTAMP + s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo + s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi +end + + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + + + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS + read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS + read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO + read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE + read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO + read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI + + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode + //reuse s_restore_m0 as a temp register + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT + s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp + + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu + + s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_d + s_waitcnt lgkmcnt(0) +end + +// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc + + +/**************************************************************************/ +/* the END */ +/**************************************************************************/ +L_END_PGM: + s_endpgm + +end + + +/**************************************************************************/ +/* the helper functions */ +/**************************************************************************/ + +//Only for save hwreg to mem +function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo +end + + +// HWREG are saved before SGPRs, so all HWREG could be use. +function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) + + s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 + s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 + s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 + s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 + s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc +end + + +function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 +end + +function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 +end + + + +function get_lds_size_bytes(s_lds_size_byte) + // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW + s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size + s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW +end + +function get_vgpr_size_bytes(s_vgpr_size_byte) + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible +end + +function get_sgpr_size_bytes(s_sgpr_size_byte) + s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 + s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) +end + +function get_hwreg_size_bytes + return 128 //HWREG size 128 bytes +end + + +#endif + +static const uint32_t cwsr_trap_gfx8_hex[] = { + 0xbf820001, 0xbf820123, + 0xb8f4f802, 0x89748674, + 0xb8f5f803, 0x8675ff75, + 0x00000400, 0xbf850011, + 0xc00a1e37, 0x00000000, + 0xbf8c007f, 0x87777978, + 0xbf840002, 0xb974f802, + 0xbe801d78, 0xb8f5f803, + 0x8675ff75, 0x000001ff, + 0xbf850002, 0x80708470, + 0x82718071, 0x8671ff71, + 0x0000ffff, 0xb974f802, + 0xbe801f70, 0xb8f5f803, + 0x8675ff75, 0x00000100, + 0xbf840006, 0xbefa0080, + 0xb97a0203, 0x8671ff71, + 0x0000ffff, 0x80f08870, + 0x82f18071, 0xbefa0080, + 0xb97a0283, 0xbef60068, + 0xbef70069, 0xb8fa1c07, + 0x8e7a9c7a, 0x87717a71, + 0xb8fa03c7, 0x8e7a9b7a, + 0x87717a71, 0xb8faf807, + 0x867aff7a, 0x00007fff, + 0xb97af807, 0xbef2007e, + 0xbef3007f, 0xbefe0180, + 0xbf900004, 0xbf8e0002, + 0xbf88fffe, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x867aff7f, + 0x08000000, 0x8f7a837a, + 0x877b7a7b, 0x867aff7f, + 0x70000000, 0x8f7a817a, + 0x877b7a7b, 0xbeef007c, + 0xbeee0080, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cbc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611d3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xb8f5f803, + 0xbefe007c, 0xbefc006e, + 0xc0611d7c, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dbc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dfc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xb8eff801, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0x867aff7f, + 0x04000000, 0xbef30080, + 0x8773737a, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8f51605, 0x80758175, + 0x8e758475, 0x8e7a8275, + 0xbefa00ff, 0x01000000, + 0xbef60178, 0x80786e78, + 0x82798079, 0xbefc0080, + 0xbe802b00, 0xbe822b02, + 0xbe842b04, 0xbe862b06, + 0xbe882b08, 0xbe8a2b0a, + 0xbe8c2b0c, 0xbe8e2b0e, + 0xc06b003c, 0x00000000, + 0xc06b013c, 0x00000010, + 0xc06b023c, 0x00000020, + 0xc06b033c, 0x00000030, + 0x8078c078, 0x82798079, + 0x807c907c, 0xbf0a757c, + 0xbf85ffeb, 0xbef80176, + 0xbeee0080, 0xbefe00c1, + 0xbeff00c1, 0xbefa00ff, + 0x01000000, 0xe0724000, + 0x6e1e0000, 0xe0724100, + 0x6e1e0100, 0xe0724200, + 0x6e1e0200, 0xe0724300, + 0x6e1e0300, 0xbefe00c1, + 0xbeff00c1, 0xb8f54306, + 0x8675c175, 0xbf84002c, + 0xbf8a0000, 0x867aff73, + 0x04000000, 0xbf840028, + 0x8e758675, 0x8e758275, + 0xbefa0075, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0x806eff6e, 0x00000080, + 0xbefa00ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 0x00010000, 0xbe80007b, + 0x867bff7b, 0xff7fffff, + 0x877bff7b, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8c007f, 0xe0765000, + 0x6e1e0002, 0x32040702, + 0xd0c9006a, 0x0000eb02, + 0xbf87fff7, 0xbefb0000, + 0xbeee00ff, 0x00000400, + 0xbefe00c1, 0xbeff00c1, + 0xb8f52a05, 0x80758175, + 0x8e758275, 0x8e7a8875, + 0xbefa00ff, 0x01000000, + 0xbefc0084, 0xbf0a757c, + 0xbf840015, 0xbf11017c, + 0x8075ff75, 0x00001000, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x6e1e0000, + 0xe0724100, 0x6e1e0100, + 0xe0724200, 0x6e1e0200, + 0xe0724300, 0x6e1e0300, + 0x807c847c, 0x806eff6e, + 0x00000400, 0xbf0a757c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200ca, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x8676ff7f, + 0x08000000, 0x8f768376, + 0x877b767b, 0x8676ff7f, + 0x70000000, 0x8f768176, + 0x877b767b, 0x8676ff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8f34306, 0x8673c173, + 0xbf840019, 0x8e738673, + 0x8e738273, 0xbefa0073, + 0xb8f22a05, 0x80728172, + 0x8e728a72, 0xb8f61605, + 0x80768176, 0x8e768676, + 0x80727672, 0x8072ff72, + 0x00000080, 0xbefa00ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x721e0000, + 0xe0510100, 0x721e0000, + 0x807cff7c, 0x00000200, + 0x8072ff72, 0x00000200, + 0xbf0a737c, 0xbf85fff6, + 0xbef20080, 0xbefe00c1, + 0xbeff00c1, 0xb8f32a05, + 0x80738173, 0x8e738273, + 0x8e7a8873, 0xbefa00ff, + 0x01000000, 0xbef60072, + 0x8072ff72, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0x8073ff73, 0x00008000, + 0xe0524000, 0x721e0000, + 0xe0524100, 0x721e0100, + 0xe0524200, 0x721e0200, + 0xe0524300, 0x721e0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8072ff72, 0x00000400, + 0xbf0a737c, 0xbf85ffee, + 0xbf9c0000, 0xe0524000, + 0x761e0000, 0xe0524100, + 0x761e0100, 0xe0524200, + 0x761e0200, 0xe0524300, + 0x761e0300, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0x80f2c072, 0xb8f31605, + 0x80738173, 0x8e738473, + 0x8e7a8273, 0xbefa00ff, + 0x01000000, 0xbefc0073, + 0xc031003c, 0x00000072, + 0x80f2c072, 0xbf8c007f, + 0x80fc907c, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff1, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xc0211cfc, + 0x00000072, 0x80728472, + 0xc0211c3c, 0x00000072, + 0x80728472, 0xc0211c7c, + 0x00000072, 0x80728472, + 0xc0211bbc, 0x00000072, + 0x80728472, 0xc0211bfc, + 0x00000072, 0x80728472, + 0xc0211d3c, 0x00000072, + 0x80728472, 0xc0211d7c, + 0x00000072, 0x80728472, + 0xc0211a3c, 0x00000072, + 0x80728472, 0xc0211a7c, + 0x00000072, 0x80728472, + 0xc0211dfc, 0x00000072, + 0x80728472, 0xc0211b3c, + 0x00000072, 0x80728472, + 0xc0211b7c, 0x00000072, + 0x80728472, 0xbf8c007f, + 0x8671ff71, 0x0000ffff, + 0xbefc0073, 0xbefe006e, + 0xbeff006f, 0x867375ff, + 0x000003ff, 0xb9734803, + 0x867375ff, 0xfffff800, + 0x8f738b73, 0xb973a2c3, + 0xb977f801, 0x8673ff71, + 0xf0000000, 0x8f739c73, + 0x8e739073, 0xbef60080, + 0x87767376, 0x8673ff71, + 0x08000000, 0x8f739b73, + 0x8e738f73, 0x87767376, + 0x8673ff74, 0x00800000, + 0x8f739773, 0xb976f807, + 0x86fe7e7e, 0x86ea6a6a, + 0xb974f802, 0xbf8a0000, + 0x95807370, 0xbf810000, +}; + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 505d39156acd..62c3d9cd6ef1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -117,7 +117,7 @@ static int kfd_open(struct inode *inode, struct file *filep) return -EPERM; } - process = kfd_create_process(current); + process = kfd_create_process(filep); if (IS_ERR(process)) return PTR_ERR(process); @@ -206,6 +206,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->ctx_save_restore_area_address = args->ctx_save_restore_address; q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; + q_properties->ctl_stack_size = args->ctl_stack_size; if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) q_properties->type = KFD_QUEUE_TYPE_COMPUTE; @@ -431,6 +432,38 @@ out: return err; } +static int kfd_ioctl_set_trap_handler(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_set_trap_handler_args *args = data; + struct kfd_dev *dev; + int err = 0; + struct kfd_process_device *pdd; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = -ESRCH; + goto out; + } + + if (dev->dqm->ops.set_trap_handler(dev->dqm, + &pdd->qpd, + args->tba_addr, + args->tma_addr)) + err = -EINVAL; + +out: + mutex_unlock(&p->mutex); + + return err; +} + static int kfd_ioctl_dbg_register(struct file *filep, struct kfd_process *p, void *data) { @@ -493,7 +526,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, long status; dev = kfd_device_by_id(args->gpu_id); - if (!dev) + if (!dev || !dev->dbgmgr) return -EINVAL; if (dev->device_info->asic_family == CHIP_CARRIZO) { @@ -979,7 +1012,10 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { kfd_ioctl_set_scratch_backing_va, 0), AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG, - kfd_ioctl_get_tile_config, 0) + kfd_ioctl_get_tile_config, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, + kfd_ioctl_set_trap_handler, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) @@ -1088,6 +1124,10 @@ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) KFD_MMAP_EVENTS_MASK) { vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; return kfd_event_mmap(process, vma); + } else if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) == + KFD_MMAP_RESERVED_MEM_MASK) { + vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_RESERVED_MEM_MASK; + return kfd_reserved_mem_mmap(process, vma); } return -EFAULT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c new file mode 100644 index 000000000000..2bc2816767a7 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -0,0 +1,1267 @@ +/* + * Copyright 2015-2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/amd-iommu.h> +#include "kfd_crat.h" +#include "kfd_priv.h" +#include "kfd_topology.h" + +/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. + * GPU processor ID are expressed with Bit[31]=1. + * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs + * used in the CRAT. + */ +static uint32_t gpu_processor_id_low = 0x80001000; + +/* Return the next available gpu_processor_id and increment it for next GPU + * @total_cu_count - Total CUs present in the GPU including ones + * masked off + */ +static inline unsigned int get_and_inc_gpu_processor_id( + unsigned int total_cu_count) +{ + int current_id = gpu_processor_id_low; + + gpu_processor_id_low += total_cu_count; + return current_id; +} + +/* Static table to describe GPU Cache information */ +struct kfd_gpu_cache_info { + uint32_t cache_size; + uint32_t cache_level; + uint32_t flags; + /* Indicates how many Compute Units share this cache + * Value = 1 indicates the cache is not shared + */ + uint32_t num_cu_shared; +}; + +static struct kfd_gpu_cache_info kaveri_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + + }, + { + /* Scalar L1 Instruction Cache (in SQC module) per bank */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + { + /* Scalar L1 Data Cache (in SQC module) per bank */ + .cache_size = 8, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + + /* TODO: Add L2 Cache information */ +}; + + +static struct kfd_gpu_cache_info carrizo_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + }, + { + /* Scalar L1 Instruction Cache (in SQC module) per bank */ + .cache_size = 8, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 4, + }, + { + /* Scalar L1 Data Cache (in SQC module) per bank. */ + .cache_size = 4, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 4, + }, + + /* TODO: Add L2 Cache information */ +}; + +/* NOTE: In future if more information is added to struct kfd_gpu_cache_info + * the following ASICs may need a separate table. + */ +#define hawaii_cache_info kaveri_cache_info +#define tonga_cache_info carrizo_cache_info +#define fiji_cache_info carrizo_cache_info +#define polaris10_cache_info carrizo_cache_info +#define polaris11_cache_info carrizo_cache_info + +static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + dev->node_props.cpu_cores_count = cu->num_cpu_cores; + dev->node_props.cpu_core_id_base = cu->processor_id_low; + if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + + pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, + cu->processor_id_low); +} + +static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + dev->node_props.simd_id_base = cu->processor_id_low; + dev->node_props.simd_count = cu->num_simd_cores; + dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; + dev->node_props.max_waves_per_simd = cu->max_waves_simd; + dev->node_props.wave_front_size = cu->wave_front_size; + dev->node_props.array_count = cu->array_count; + dev->node_props.cu_per_simd_array = cu->num_cu_per_array; + dev->node_props.simd_per_cu = cu->num_simd_per_cu; + dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; + if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) + dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; + pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); +} + +/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, + struct list_head *device_list) +{ + struct kfd_topology_device *dev; + + pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", + cu->proximity_domain, cu->hsa_capability); + list_for_each_entry(dev, device_list, list) { + if (cu->proximity_domain == dev->proximity_domain) { + if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) + kfd_populated_cu_info_cpu(dev, cu); + + if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) + kfd_populated_cu_info_gpu(dev, cu); + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, + struct list_head *device_list) +{ + struct kfd_mem_properties *props; + struct kfd_topology_device *dev; + + pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", + mem->proximity_domain); + list_for_each_entry(dev, device_list, list) { + if (mem->proximity_domain == dev->proximity_domain) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + /* We're on GPU node */ + if (dev->node_props.cpu_cores_count == 0) { + /* APU */ + if (mem->visibility_type == 0) + props->heap_type = + HSA_MEM_HEAP_TYPE_FB_PRIVATE; + /* dGPU */ + else + props->heap_type = mem->visibility_type; + } else + props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; + + if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) + props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; + if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) + props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; + + props->size_in_bytes = + ((uint64_t)mem->length_high << 32) + + mem->length_low; + props->width = mem->width; + + dev->node_props.mem_banks_count++; + list_add_tail(&props->list, &dev->mem_props); + + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, + struct list_head *device_list) +{ + struct kfd_cache_properties *props; + struct kfd_topology_device *dev; + uint32_t id; + uint32_t total_num_of_cu; + + id = cache->processor_id_low; + + pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id); + list_for_each_entry(dev, device_list, list) { + total_num_of_cu = (dev->node_props.array_count * + dev->node_props.cu_per_simd_array); + + /* Cache infomration in CRAT doesn't have proximity_domain + * information as it is associated with a CPU core or GPU + * Compute Unit. So map the cache using CPU core Id or SIMD + * (GPU) ID. + * TODO: This works because currently we can safely assume that + * Compute Units are parsed before caches are parsed. In + * future, remove this dependency + */ + if ((id >= dev->node_props.cpu_core_id_base && + id <= dev->node_props.cpu_core_id_base + + dev->node_props.cpu_cores_count) || + (id >= dev->node_props.simd_id_base && + id < dev->node_props.simd_id_base + + total_num_of_cu)) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + props->processor_id_low = id; + props->cache_level = cache->cache_level; + props->cache_size = cache->cache_size; + props->cacheline_size = cache->cache_line_size; + props->cachelines_per_tag = cache->lines_per_tag; + props->cache_assoc = cache->associativity; + props->cache_latency = cache->cache_latency; + memcpy(props->sibling_map, cache->sibling_map, + sizeof(props->sibling_map)); + + if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) + props->cache_type |= HSA_CACHE_TYPE_DATA; + if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) + props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; + if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) + props->cache_type |= HSA_CACHE_TYPE_CPU; + if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) + props->cache_type |= HSA_CACHE_TYPE_HSACU; + + dev->cache_count++; + dev->node_props.caches_count++; + list_add_tail(&props->list, &dev->cache_props); + + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, + struct list_head *device_list) +{ + struct kfd_iolink_properties *props = NULL, *props2; + struct kfd_topology_device *dev, *cpu_dev; + uint32_t id_from; + uint32_t id_to; + + id_from = iolink->proximity_domain_from; + id_to = iolink->proximity_domain_to; + + pr_debug("Found IO link entry in CRAT table with id_from=%d\n", + id_from); + list_for_each_entry(dev, device_list, list) { + if (id_from == dev->proximity_domain) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + props->node_from = id_from; + props->node_to = id_to; + props->ver_maj = iolink->version_major; + props->ver_min = iolink->version_minor; + props->iolink_type = iolink->io_interface_type; + + if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) + props->weight = 20; + else + props->weight = node_distance(id_from, id_to); + + props->min_latency = iolink->minimum_latency; + props->max_latency = iolink->maximum_latency; + props->min_bandwidth = iolink->minimum_bandwidth_mbs; + props->max_bandwidth = iolink->maximum_bandwidth_mbs; + props->rec_transfer_size = + iolink->recommended_transfer_size; + + dev->io_link_count++; + dev->node_props.io_links_count++; + list_add_tail(&props->list, &dev->io_link_props); + break; + } + } + + /* CPU topology is created before GPUs are detected, so CPU->GPU + * links are not built at that time. If a PCIe type is discovered, it + * means a GPU is detected and we are adding GPU->CPU to the topology. + * At this time, also add the corresponded CPU->GPU link. + */ + if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) { + cpu_dev = kfd_topology_device_by_proximity_domain(id_to); + if (!cpu_dev) + return -ENODEV; + /* same everything but the other direction */ + props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL); + props2->node_from = id_to; + props2->node_to = id_from; + props2->kobj = NULL; + cpu_dev->io_link_count++; + cpu_dev->node_props.io_links_count++; + list_add_tail(&props2->list, &cpu_dev->io_link_props); + } + + return 0; +} + +/* kfd_parse_subtype - parse subtypes and attach it to correct topology device + * present in the device_list + * @sub_type_hdr - subtype section of crat_image + * @device_list - list of topology devices present in this crat_image + */ +static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, + struct list_head *device_list) +{ + struct crat_subtype_computeunit *cu; + struct crat_subtype_memory *mem; + struct crat_subtype_cache *cache; + struct crat_subtype_iolink *iolink; + int ret = 0; + + switch (sub_type_hdr->type) { + case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + ret = kfd_parse_subtype_cu(cu, device_list); + break; + case CRAT_SUBTYPE_MEMORY_AFFINITY: + mem = (struct crat_subtype_memory *)sub_type_hdr; + ret = kfd_parse_subtype_mem(mem, device_list); + break; + case CRAT_SUBTYPE_CACHE_AFFINITY: + cache = (struct crat_subtype_cache *)sub_type_hdr; + ret = kfd_parse_subtype_cache(cache, device_list); + break; + case CRAT_SUBTYPE_TLB_AFFINITY: + /* + * For now, nothing to do here + */ + pr_debug("Found TLB entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: + /* + * For now, nothing to do here + */ + pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_IOLINK_AFFINITY: + iolink = (struct crat_subtype_iolink *)sub_type_hdr; + ret = kfd_parse_subtype_iolink(iolink, device_list); + break; + default: + pr_warn("Unknown subtype %d in CRAT\n", + sub_type_hdr->type); + } + + return ret; +} + +/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT + * create a kfd_topology_device and add in to device_list. Also parse + * CRAT subtypes and attach it to appropriate kfd_topology_device + * @crat_image - input image containing CRAT + * @device_list - [OUT] list of kfd_topology_device generated after + * parsing crat_image + * @proximity_domain - Proximity domain of the first device in the table + * + * Return - 0 if successful else -ve value + */ +int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, + uint32_t proximity_domain) +{ + struct kfd_topology_device *top_dev = NULL; + struct crat_subtype_generic *sub_type_hdr; + uint16_t node_id; + int ret = 0; + struct crat_header *crat_table = (struct crat_header *)crat_image; + uint16_t num_nodes; + uint32_t image_len; + + if (!crat_image) + return -EINVAL; + + if (!list_empty(device_list)) { + pr_warn("Error device list should be empty\n"); + return -EINVAL; + } + + num_nodes = crat_table->num_domains; + image_len = crat_table->length; + + pr_info("Parsing CRAT table with %d nodes\n", num_nodes); + + for (node_id = 0; node_id < num_nodes; node_id++) { + top_dev = kfd_create_topology_device(device_list); + if (!top_dev) + break; + top_dev->proximity_domain = proximity_domain++; + } + + if (!top_dev) { + ret = -ENOMEM; + goto err; + } + + memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH); + memcpy(top_dev->oem_table_id, crat_table->oem_table_id, + CRAT_OEMTABLEID_LENGTH); + top_dev->oem_revision = crat_table->oem_revision; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < + ((char *)crat_image) + image_len) { + if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { + ret = kfd_parse_subtype(sub_type_hdr, device_list); + if (ret) + break; + } + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + } + +err: + if (ret) + kfd_release_topology_device_list(device_list); + + return ret; +} + +/* Helper function. See kfd_fill_gpu_cache_info for parameter description */ +static int fill_in_pcache(struct crat_subtype_cache *pcache, + struct kfd_gpu_cache_info *pcache_info, + struct kfd_cu_info *cu_info, + int mem_available, + int cu_bitmask, + int cache_type, unsigned int cu_processor_id, + int cu_block) +{ + unsigned int cu_sibling_map_mask; + int first_active_cu; + + /* First check if enough memory is available */ + if (sizeof(struct crat_subtype_cache) > mem_available) + return -ENOMEM; + + cu_sibling_map_mask = cu_bitmask; + cu_sibling_map_mask >>= cu_block; + cu_sibling_map_mask &= + ((1 << pcache_info[cache_type].num_cu_shared) - 1); + first_active_cu = ffs(cu_sibling_map_mask); + + /* CU could be inactive. In case of shared cache find the first active + * CU. and incase of non-shared cache check if the CU is inactive. If + * inactive active skip it + */ + if (first_active_cu) { + memset(pcache, 0, sizeof(struct crat_subtype_cache)); + pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY; + pcache->length = sizeof(struct crat_subtype_cache); + pcache->flags = pcache_info[cache_type].flags; + pcache->processor_id_low = cu_processor_id + + (first_active_cu - 1); + pcache->cache_level = pcache_info[cache_type].cache_level; + pcache->cache_size = pcache_info[cache_type].cache_size; + + /* Sibling map is w.r.t processor_id_low, so shift out + * inactive CU + */ + cu_sibling_map_mask = + cu_sibling_map_mask >> (first_active_cu - 1); + + pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF); + pcache->sibling_map[1] = + (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); + pcache->sibling_map[2] = + (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); + pcache->sibling_map[3] = + (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); + return 0; + } + return 1; +} + +/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info + * tables + * + * @kdev - [IN] GPU device + * @gpu_processor_id - [IN] GPU processor ID to which these caches + * associate + * @available_size - [IN] Amount of memory available in pcache + * @cu_info - [IN] Compute Unit info obtained from KGD + * @pcache - [OUT] memory into which cache data is to be filled in. + * @size_filled - [OUT] amount of data used up in pcache. + * @num_of_entries - [OUT] number of caches added + */ +static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, + int gpu_processor_id, + int available_size, + struct kfd_cu_info *cu_info, + struct crat_subtype_cache *pcache, + int *size_filled, + int *num_of_entries) +{ + struct kfd_gpu_cache_info *pcache_info; + int num_of_cache_types = 0; + int i, j, k; + int ct = 0; + int mem_available = available_size; + unsigned int cu_processor_id; + int ret; + + switch (kdev->device_info->asic_family) { + case CHIP_KAVERI: + pcache_info = kaveri_cache_info; + num_of_cache_types = ARRAY_SIZE(kaveri_cache_info); + break; + case CHIP_HAWAII: + pcache_info = hawaii_cache_info; + num_of_cache_types = ARRAY_SIZE(hawaii_cache_info); + break; + case CHIP_CARRIZO: + pcache_info = carrizo_cache_info; + num_of_cache_types = ARRAY_SIZE(carrizo_cache_info); + break; + case CHIP_TONGA: + pcache_info = tonga_cache_info; + num_of_cache_types = ARRAY_SIZE(tonga_cache_info); + break; + case CHIP_FIJI: + pcache_info = fiji_cache_info; + num_of_cache_types = ARRAY_SIZE(fiji_cache_info); + break; + case CHIP_POLARIS10: + pcache_info = polaris10_cache_info; + num_of_cache_types = ARRAY_SIZE(polaris10_cache_info); + break; + case CHIP_POLARIS11: + pcache_info = polaris11_cache_info; + num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); + break; + default: + return -EINVAL; + } + + *size_filled = 0; + *num_of_entries = 0; + + /* For each type of cache listed in the kfd_gpu_cache_info table, + * go through all available Compute Units. + * The [i,j,k] loop will + * if kfd_gpu_cache_info.num_cu_shared = 1 + * will parse through all available CU + * If (kfd_gpu_cache_info.num_cu_shared != 1) + * then it will consider only one CU from + * the shared unit + */ + + for (ct = 0; ct < num_of_cache_types; ct++) { + cu_processor_id = gpu_processor_id; + for (i = 0; i < cu_info->num_shader_engines; i++) { + for (j = 0; j < cu_info->num_shader_arrays_per_engine; + j++) { + for (k = 0; k < cu_info->num_cu_per_sh; + k += pcache_info[ct].num_cu_shared) { + + ret = fill_in_pcache(pcache, + pcache_info, + cu_info, + mem_available, + cu_info->cu_bitmap[i][j], + ct, + cu_processor_id, + k); + + if (ret < 0) + break; + + if (!ret) { + pcache++; + (*num_of_entries)++; + mem_available -= + sizeof(*pcache); + (*size_filled) += + sizeof(*pcache); + } + + /* Move to next CU block */ + cu_processor_id += + pcache_info[ct].num_cu_shared; + } + } + } + } + + pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); + + return 0; +} + +/* + * kfd_create_crat_image_acpi - Allocates memory for CRAT image and + * copies CRAT from ACPI (if available). + * NOTE: Call kfd_destroy_crat_image to free CRAT image memory + * + * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then + * crat_image will be NULL + * @size: [OUT] size of crat_image + * + * Return 0 if successful else return error code + */ +int kfd_create_crat_image_acpi(void **crat_image, size_t *size) +{ + struct acpi_table_header *crat_table; + acpi_status status; + void *pcrat_image; + + if (!crat_image) + return -EINVAL; + + *crat_image = NULL; + + /* Fetch the CRAT table from ACPI */ + status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); + if (status == AE_NOT_FOUND) { + pr_warn("CRAT table not found\n"); + return -ENODATA; + } else if (ACPI_FAILURE(status)) { + const char *err = acpi_format_exception(status); + + pr_err("CRAT table error: %s\n", err); + return -EINVAL; + } + + if (ignore_crat) { + pr_info("CRAT table disabled by module option\n"); + return -ENODATA; + } + + pcrat_image = kmalloc(crat_table->length, GFP_KERNEL); + if (!pcrat_image) + return -ENOMEM; + + memcpy(pcrat_image, crat_table, crat_table->length); + + *crat_image = pcrat_image; + *size = crat_table->length; + + return 0; +} + +/* Memory required to create Virtual CRAT. + * Since there is no easy way to predict the amount of memory required, the + * following amount are allocated for CPU and GPU Virtual CRAT. This is + * expected to cover all known conditions. But to be safe additional check + * is put in the code to ensure we don't overwrite. + */ +#define VCRAT_SIZE_FOR_CPU (2 * PAGE_SIZE) +#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) + +/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node + * + * @numa_node_id: CPU NUMA node id + * @avail_size: Available size in the memory + * @sub_type_hdr: Memory into which compute info will be filled in + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, + int proximity_domain, + struct crat_subtype_computeunit *sub_type_hdr) +{ + const struct cpumask *cpumask; + + *avail_size -= sizeof(struct crat_subtype_computeunit); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + cpumask = cpumask_of_node(numa_node_id); + + /* Fill in CU data */ + sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; + sub_type_hdr->proximity_domain = proximity_domain; + sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); + if (sub_type_hdr->processor_id_low == -1) + return -EINVAL; + + sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); + + return 0; +} + +/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node + * + * @numa_node_id: CPU NUMA node id + * @avail_size: Available size in the memory + * @sub_type_hdr: Memory into which compute info will be filled in + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, + int proximity_domain, + struct crat_subtype_memory *sub_type_hdr) +{ + uint64_t mem_in_bytes = 0; + pg_data_t *pgdat; + int zone_type; + + *avail_size -= sizeof(struct crat_subtype_memory); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_memory); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in Memory Subunit data */ + + /* Unlike si_meminfo, si_meminfo_node is not exported. So + * the following lines are duplicated from si_meminfo_node + * function + */ + pgdat = NODE_DATA(numa_node_id); + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; + mem_in_bytes <<= PAGE_SHIFT; + + sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); + sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); + sub_type_hdr->proximity_domain = proximity_domain; + + return 0; +} + +static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, + uint32_t *num_entries, + struct crat_subtype_iolink *sub_type_hdr) +{ + int nid; + struct cpuinfo_x86 *c = &cpu_data(0); + uint8_t link_type; + + if (c->x86_vendor == X86_VENDOR_AMD) + link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT; + else + link_type = CRAT_IOLINK_TYPE_QPI_1_1; + + *num_entries = 0; + + /* Create IO links from this node to other CPU nodes */ + for_each_online_node(nid) { + if (nid == numa_node_id) /* node itself */ + continue; + + *avail_size -= sizeof(struct crat_subtype_iolink); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_iolink); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in IO link data */ + sub_type_hdr->proximity_domain_from = numa_node_id; + sub_type_hdr->proximity_domain_to = nid; + sub_type_hdr->io_interface_type = link_type; + + (*num_entries)++; + sub_type_hdr++; + } + + return 0; +} + +/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU + * + * @pcrat_image: Fill in VCRAT for CPU + * @size: [IN] allocated size of crat_image. + * [OUT] actual size of data filled in crat_image + */ +static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) +{ + struct crat_header *crat_table = (struct crat_header *)pcrat_image; + struct acpi_table_header *acpi_table; + acpi_status status; + struct crat_subtype_generic *sub_type_hdr; + int avail_size = *size; + int numa_node_id; + uint32_t entries = 0; + int ret = 0; + + if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU) + return -EINVAL; + + /* Fill in CRAT Header. + * Modify length and total_entries as subunits are added. + */ + avail_size -= sizeof(struct crat_header); + if (avail_size < 0) + return -ENOMEM; + + memset(crat_table, 0, sizeof(struct crat_header)); + memcpy(&crat_table->signature, CRAT_SIGNATURE, + sizeof(crat_table->signature)); + crat_table->length = sizeof(struct crat_header); + + status = acpi_get_table("DSDT", 0, &acpi_table); + if (status == AE_NOT_FOUND) + pr_warn("DSDT table not found for OEM information\n"); + else { + crat_table->oem_revision = acpi_table->revision; + memcpy(crat_table->oem_id, acpi_table->oem_id, + CRAT_OEMID_LENGTH); + memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, + CRAT_OEMTABLEID_LENGTH); + } + crat_table->total_entries = 0; + crat_table->num_domains = 0; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + + for_each_online_node(numa_node_id) { + if (kfd_numa_node_to_apic_id(numa_node_id) == -1) + continue; + + /* Fill in Subtype: Compute Unit */ + ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, + crat_table->num_domains, + (struct crat_subtype_computeunit *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + /* Fill in Subtype: Memory */ + ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, + crat_table->num_domains, + (struct crat_subtype_memory *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + /* Fill in Subtype: IO Link */ + ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size, + &entries, + (struct crat_subtype_iolink *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += (sub_type_hdr->length * entries); + crat_table->total_entries += entries; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length * entries); + + crat_table->num_domains++; + } + + /* TODO: Add cache Subtype for CPU. + * Currently, CPU cache information is available in function + * detect_cache_attributes(cpu) defined in the file + * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not + * exported and to get the same information the code needs to be + * duplicated. + */ + + *size = crat_table->length; + pr_info("Virtual CRAT table created for CPU\n"); + + return 0; +} + +static int kfd_fill_gpu_memory_affinity(int *avail_size, + struct kfd_dev *kdev, uint8_t type, uint64_t size, + struct crat_subtype_memory *sub_type_hdr, + uint32_t proximity_domain, + const struct kfd_local_mem_info *local_mem_info) +{ + *avail_size -= sizeof(struct crat_subtype_memory); + if (*avail_size < 0) + return -ENOMEM; + + memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); + sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_memory); + sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; + + sub_type_hdr->proximity_domain = proximity_domain; + + pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n", + type, size); + + sub_type_hdr->length_low = lower_32_bits(size); + sub_type_hdr->length_high = upper_32_bits(size); + + sub_type_hdr->width = local_mem_info->vram_width; + sub_type_hdr->visibility_type = type; + + return 0; +} + +/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU + * to its NUMA node + * @avail_size: Available size in the memory + * @kdev - [IN] GPU device + * @sub_type_hdr: Memory into which io link info will be filled in + * @proximity_domain - proximity domain of the GPU node + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_gpu_direct_io_link(int *avail_size, + struct kfd_dev *kdev, + struct crat_subtype_iolink *sub_type_hdr, + uint32_t proximity_domain) +{ + *avail_size -= sizeof(struct crat_subtype_iolink); + if (*avail_size < 0) + return -ENOMEM; + + memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_iolink); + sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in IOLINK subtype. + * TODO: Fill-in other fields of iolink subtype + */ + sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; + sub_type_hdr->proximity_domain_from = proximity_domain; +#ifdef CONFIG_NUMA + if (kdev->pdev->dev.numa_node == NUMA_NO_NODE) + sub_type_hdr->proximity_domain_to = 0; + else + sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node; +#else + sub_type_hdr->proximity_domain_to = 0; +#endif + return 0; +} + +/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU + * + * @pcrat_image: Fill in VCRAT for GPU + * @size: [IN] allocated size of crat_image. + * [OUT] actual size of data filled in crat_image + */ +static int kfd_create_vcrat_image_gpu(void *pcrat_image, + size_t *size, struct kfd_dev *kdev, + uint32_t proximity_domain) +{ + struct crat_header *crat_table = (struct crat_header *)pcrat_image; + struct crat_subtype_generic *sub_type_hdr; + struct crat_subtype_computeunit *cu; + struct kfd_cu_info cu_info; + struct amd_iommu_device_info iommu_info; + int avail_size = *size; + uint32_t total_num_of_cu; + int num_of_cache_entries = 0; + int cache_mem_filled = 0; + int ret = 0; + const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | + AMD_IOMMU_DEVICE_FLAG_PRI_SUP | + AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + struct kfd_local_mem_info local_mem_info; + + if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU) + return -EINVAL; + + /* Fill the CRAT Header. + * Modify length and total_entries as subunits are added. + */ + avail_size -= sizeof(struct crat_header); + if (avail_size < 0) + return -ENOMEM; + + memset(crat_table, 0, sizeof(struct crat_header)); + + memcpy(&crat_table->signature, CRAT_SIGNATURE, + sizeof(crat_table->signature)); + /* Change length as we add more subtypes*/ + crat_table->length = sizeof(struct crat_header); + crat_table->num_domains = 1; + crat_table->total_entries = 0; + + /* Fill in Subtype: Compute Unit + * First fill in the sub type header and then sub type data + */ + avail_size -= sizeof(struct crat_subtype_computeunit); + if (avail_size < 0) + return -ENOMEM; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); + + sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill CU subtype data */ + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; + cu->proximity_domain = proximity_domain; + + kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); + cu->num_simd_per_cu = cu_info.simd_per_cu; + cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; + cu->max_waves_simd = cu_info.max_waves_per_simd; + + cu->wave_front_size = cu_info.wave_front_size; + cu->array_count = cu_info.num_shader_arrays_per_engine * + cu_info.num_shader_engines; + total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); + cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); + cu->num_cu_per_array = cu_info.num_cu_per_sh; + cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; + cu->num_banks = cu_info.num_shader_engines; + cu->lds_size_in_kb = cu_info.lds_size; + + cu->hsa_capability = 0; + + /* Check if this node supports IOMMU. During parsing this flag will + * translate to HSA_CAP_ATS_PRESENT + */ + iommu_info.flags = 0; + if (amd_iommu_device_info(kdev->pdev, &iommu_info) == 0) { + if ((iommu_info.flags & required_iommu_flags) == + required_iommu_flags) + cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; + } + + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + /* Fill in Subtype: Memory. Only on systems with large BAR (no + * private FB), report memory as public. On other systems + * report the total FB size (public+private) as a single + * private heap. + */ + kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + if (local_mem_info.local_mem_size_private == 0) + ret = kfd_fill_gpu_memory_affinity(&avail_size, + kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, + local_mem_info.local_mem_size_public, + (struct crat_subtype_memory *)sub_type_hdr, + proximity_domain, + &local_mem_info); + else + ret = kfd_fill_gpu_memory_affinity(&avail_size, + kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, + local_mem_info.local_mem_size_public + + local_mem_info.local_mem_size_private, + (struct crat_subtype_memory *)sub_type_hdr, + proximity_domain, + &local_mem_info); + if (ret < 0) + return ret; + + crat_table->length += sizeof(struct crat_subtype_memory); + crat_table->total_entries++; + + /* TODO: Fill in cache information. This information is NOT readily + * available in KGD + */ + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low, + avail_size, + &cu_info, + (struct crat_subtype_cache *)sub_type_hdr, + &cache_mem_filled, + &num_of_cache_entries); + + if (ret < 0) + return ret; + + crat_table->length += cache_mem_filled; + crat_table->total_entries += num_of_cache_entries; + avail_size -= cache_mem_filled; + + /* Fill in Subtype: IO_LINKS + * Only direct links are added here which is Link from GPU to + * to its NUMA node. Indirect links are added by userspace. + */ + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + cache_mem_filled); + ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev, + (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain); + + if (ret < 0) + return ret; + + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + *size = crat_table->length; + pr_info("Virtual CRAT table created for GPU\n"); + + return ret; +} + +/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and + * creates a Virtual CRAT (VCRAT) image + * + * NOTE: Call kfd_destroy_crat_image to free CRAT image memory + * + * @crat_image: VCRAT image created because ACPI does not have a + * CRAT for this device + * @size: [OUT] size of virtual crat_image + * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device + * COMPUTE_UNIT_GPU - Create VCRAT for GPU + * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU + * -- this option is not currently implemented. + * The assumption is that all AMD APUs will have CRAT + * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU + * + * Return 0 if successful else return -ve value + */ +int kfd_create_crat_image_virtual(void **crat_image, size_t *size, + int flags, struct kfd_dev *kdev, + uint32_t proximity_domain) +{ + void *pcrat_image = NULL; + int ret = 0; + + if (!crat_image) + return -EINVAL; + + *crat_image = NULL; + + /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and + * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover + * all the current conditions. A check is put not to overwrite beyond + * allocated size + */ + switch (flags) { + case COMPUTE_UNIT_CPU: + pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL); + if (!pcrat_image) + return -ENOMEM; + *size = VCRAT_SIZE_FOR_CPU; + ret = kfd_create_vcrat_image_cpu(pcrat_image, size); + break; + case COMPUTE_UNIT_GPU: + if (!kdev) + return -EINVAL; + pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); + if (!pcrat_image) + return -ENOMEM; + *size = VCRAT_SIZE_FOR_GPU; + ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev, + proximity_domain); + break; + case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU): + /* TODO: */ + ret = -EINVAL; + pr_err("VCRAT not implemented for APU\n"); + break; + default: + ret = -EINVAL; + } + + if (!ret) + *crat_image = pcrat_image; + else + kfree(pcrat_image); + + return ret; +} + + +/* kfd_destroy_crat_image + * + * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..) + * + */ +void kfd_destroy_crat_image(void *crat_image) +{ + kfree(crat_image); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h index a374fa3d3ee6..b5cd182b9edd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -44,6 +44,10 @@ #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) +/* Compute Unit flags */ +#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ +#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ + struct crat_header { uint32_t signature; uint32_t length; @@ -105,7 +109,7 @@ struct crat_subtype_computeunit { uint8_t wave_front_size; uint8_t num_banks; uint16_t micro_engine_id; - uint8_t num_arrays; + uint8_t array_count; uint8_t num_cu_per_array; uint8_t num_simd_per_cu; uint8_t max_slots_scatch_cu; @@ -127,13 +131,14 @@ struct crat_subtype_memory { uint8_t length; uint16_t reserved; uint32_t flags; - uint32_t promixity_domain; + uint32_t proximity_domain; uint32_t base_addr_low; uint32_t base_addr_high; uint32_t length_low; uint32_t length_high; uint32_t width; - uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; + uint8_t visibility_type; /* for virtual (dGPU) CRAT */ + uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; }; /* @@ -222,9 +227,12 @@ struct crat_subtype_ccompute { /* * HSA IO Link Affinity structure and definitions */ -#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001 -#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002 -#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc +#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) +#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) +#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) +#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 /* * IO interface types @@ -232,10 +240,18 @@ struct crat_subtype_ccompute { #define CRAT_IOLINK_TYPE_UNDEFINED 0 #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 #define CRAT_IOLINK_TYPE_PCIEXPRESS 2 -#define CRAT_IOLINK_TYPE_OTHER 3 +#define CRAT_IOLINK_TYPE_AMBA 3 +#define CRAT_IOLINK_TYPE_MIPI 4 +#define CRAT_IOLINK_TYPE_QPI_1_1 5 +#define CRAT_IOLINK_TYPE_RESERVED1 6 +#define CRAT_IOLINK_TYPE_RESERVED2 7 +#define CRAT_IOLINK_TYPE_RAPID_IO 8 +#define CRAT_IOLINK_TYPE_INFINIBAND 9 +#define CRAT_IOLINK_TYPE_RESERVED3 10 +#define CRAT_IOLINK_TYPE_OTHER 11 #define CRAT_IOLINK_TYPE_MAX 255 -#define CRAT_IOLINK_RESERVED_LENGTH 24 +#define CRAT_IOLINK_RESERVED_LENGTH 24 struct crat_subtype_iolink { uint8_t type; @@ -291,4 +307,14 @@ struct cdit_header { #pragma pack() +struct kfd_dev; + +int kfd_create_crat_image_acpi(void **crat_image, size_t *size); +void kfd_destroy_crat_image(void *crat_image); +int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, + uint32_t proximity_domain); +int kfd_create_crat_image_virtual(void **crat_image, size_t *size, + int flags, struct kfd_dev *kdev, + uint32_t proximity_domain); + #endif /* KFD_CRAT_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c index c407f6bd9956..afb26f205d29 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c @@ -95,7 +95,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, ib_packet->bitfields3.ib_base_hi = largep->u.high_part; ib_packet->control = (1 << 23) | (1 << 31) | - ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); + ((size_in_bytes / 4) & 0xfffff); ib_packet->bitfields5.pasid = pasid; @@ -126,8 +126,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, rm_packet->header.opcode = IT_RELEASE_MEM; rm_packet->header.type = PM4_TYPE_3; - rm_packet->header.count = sizeof(struct pm4__release_mem) / - sizeof(unsigned int) - 2; + rm_packet->header.count = sizeof(struct pm4__release_mem) / 4 - 2; rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; rm_packet->bitfields2.event_index = @@ -652,8 +651,7 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; packets_vec[0].header.type = PM4_TYPE_3; packets_vec[0].bitfields2.reg_offset = - GRBM_GFX_INDEX / (sizeof(uint32_t)) - - USERCONFIG_REG_BASE; + GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; packets_vec[0].bitfields2.insert_vmid = 0; packets_vec[0].reg_data[0] = reg_gfx_index.u32All; @@ -661,8 +659,7 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, packets_vec[1].header.count = 1; packets_vec[1].header.opcode = IT_SET_CONFIG_REG; packets_vec[1].header.type = PM4_TYPE_3; - packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - - AMD_CONFIG_REG_BASE; + packets_vec[1].bitfields2.reg_offset = SQ_CMD / 4 - AMD_CONFIG_REG_BASE; packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; packets_vec[1].bitfields2.insert_vmid = 1; @@ -678,8 +675,7 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, packets_vec[2].ordinal1 = packets_vec[0].ordinal1; packets_vec[2].bitfields2.reg_offset = - GRBM_GFX_INDEX / (sizeof(uint32_t)) - - USERCONFIG_REG_BASE; + GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; packets_vec[2].bitfields2.insert_vmid = 0; packets_vec[2].reg_data[0] = reg_gfx_index.u32All; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c new file mode 100644 index 000000000000..4bd6ebfaf425 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c @@ -0,0 +1,75 @@ +/* + * Copyright 2016-2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/debugfs.h> +#include "kfd_priv.h" + +static struct dentry *debugfs_root; + +static int kfd_debugfs_open(struct inode *inode, struct file *file) +{ + int (*show)(struct seq_file *, void *) = inode->i_private; + + return single_open(file, show, NULL); +} + +static const struct file_operations kfd_debugfs_fops = { + .owner = THIS_MODULE, + .open = kfd_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void kfd_debugfs_init(void) +{ + struct dentry *ent; + + debugfs_root = debugfs_create_dir("kfd", NULL); + if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) { + pr_warn("Failed to create kfd debugfs dir\n"); + return; + } + + ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_mqds_by_process, + &kfd_debugfs_fops); + if (!ent) + pr_warn("Failed to create mqds in kfd debugfs\n"); + + ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_hqds_by_device, + &kfd_debugfs_fops); + if (!ent) + pr_warn("Failed to create hqds in kfd debugfs\n"); + + ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, + kfd_debugfs_rls_by_device, + &kfd_debugfs_fops); + if (!ent) + pr_warn("Failed to create rls in kfd debugfs\n"); +} + +void kfd_debugfs_fini(void) +{ + debugfs_remove_recursive(debugfs_root); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 621a3b53a038..a8fa33a08de3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -27,6 +27,7 @@ #include "kfd_priv.h" #include "kfd_device_queue_manager.h" #include "kfd_pm4_headers_vi.h" +#include "cwsr_trap_handler_gfx8.asm" #define MQD_SIZE_ALIGNED 768 @@ -38,7 +39,8 @@ static const struct kfd_device_info kaveri_device_info = { .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = false, }; static const struct kfd_device_info carrizo_device_info = { @@ -49,7 +51,8 @@ static const struct kfd_device_info carrizo_device_info = { .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, }; struct kfd_deviceid { @@ -212,6 +215,17 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, return AMD_IOMMU_INV_PRI_RSP_INVALID; } +static void kfd_cwsr_init(struct kfd_dev *kfd) +{ + if (cwsr_enable && kfd->device_info->supports_cwsr) { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); + + kfd->cwsr_isa = cwsr_trap_gfx8_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); + kfd->cwsr_enabled = true; + } +} + bool kgd2kfd_device_init(struct kfd_dev *kfd, const struct kgd2kfd_shared_resources *gpu_resources) { @@ -224,6 +238,17 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd - kfd->vm_info.first_vmid_kfd + 1; + /* Verify module parameters regarding mapped process number*/ + if ((hws_max_conc_proc < 0) + || (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) { + dev_err(kfd_device, + "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n", + hws_max_conc_proc, kfd->vm_info.vmid_num_kfd, + kfd->vm_info.vmid_num_kfd); + kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd; + } else + kfd->max_proc_per_quantum = hws_max_conc_proc; + /* calculate max size of mqds needed for queues */ size = max_num_of_queues_per_device * kfd->device_info->mqd_size_aligned; @@ -286,6 +311,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, goto device_iommu_pasid_error; } + kfd_cwsr_init(kfd); + if (kfd_resume(kfd)) goto kfd_resume_error; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index e202921c150e..d0693fd8cbf8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -149,8 +149,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm, static int create_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd, - int *allocated_vmid) + struct qcm_process_device *qpd) { int retval; @@ -170,9 +169,11 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, if (retval) goto out_unlock; } - *allocated_vmid = qpd->vmid; q->properties.vmid = qpd->vmid; + q->properties.tba_addr = qpd->tba_addr; + q->properties.tma_addr = qpd->tma_addr; + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) retval = create_compute_queue_nocpsch(dqm, q, qpd); else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) @@ -181,10 +182,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, retval = -EINVAL; if (retval) { - if (list_empty(&qpd->queues_list)) { + if (list_empty(&qpd->queues_list)) deallocate_vmid(dqm, qpd, q); - *allocated_vmid = 0; - } goto out_unlock; } @@ -809,16 +808,13 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, } static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd, int *allocate_vmid) + struct qcm_process_device *qpd) { int retval; struct mqd_manager *mqd; retval = 0; - if (allocate_vmid) - *allocate_vmid = 0; - mutex_lock(&dqm->lock); if (dqm->total_queue_count >= max_num_of_queues_per_device) { @@ -846,6 +842,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, } dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + + q->properties.tba_addr = qpd->tba_addr; + q->properties.tma_addr = qpd->tma_addr; retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); if (retval) @@ -1110,6 +1109,26 @@ out: return retval; } +static int set_trap_handler(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + uint64_t tba_addr, + uint64_t tma_addr) +{ + uint64_t *tma; + + if (dqm->dev->cwsr_enabled) { + /* Jump from CWSR trap handler to user trap */ + tma = (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET); + tma[0] = tba_addr; + tma[1] = tma_addr; + } else { + qpd->tba_addr = tba_addr; + qpd->tma_addr = tma_addr; + } + + return 0; +} + static int process_termination_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { @@ -1241,6 +1260,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; dqm->ops.process_termination = process_termination_cpsch; break; case KFD_SCHED_POLICY_NO_HWS: @@ -1256,6 +1276,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.initialize = initialize_nocpsch; dqm->ops.uninitialize = uninitialize; dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; dqm->ops.process_termination = process_termination_nocpsch; break; default: @@ -1290,3 +1311,74 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) dqm->ops.uninitialize(dqm); kfree(dqm); } + +#if defined(CONFIG_DEBUG_FS) + +static void seq_reg_dump(struct seq_file *m, + uint32_t (*dump)[2], uint32_t n_regs) +{ + uint32_t i, count; + + for (i = 0, count = 0; i < n_regs; i++) { + if (count == 0 || + dump[i-1][0] + sizeof(uint32_t) != dump[i][0]) { + seq_printf(m, "%s %08x: %08x", + i ? "\n" : "", + dump[i][0], dump[i][1]); + count = 7; + } else { + seq_printf(m, " %08x", dump[i][1]); + count--; + } + } + + seq_puts(m, "\n"); +} + +int dqm_debugfs_hqds(struct seq_file *m, void *data) +{ + struct device_queue_manager *dqm = data; + uint32_t (*dump)[2], n_regs; + int pipe, queue; + int r = 0; + + for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { + int pipe_offset = pipe * get_queues_per_pipe(dqm); + + for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) { + if (!test_bit(pipe_offset + queue, + dqm->dev->shared_resources.queue_bitmap)) + continue; + + r = dqm->dev->kfd2kgd->hqd_dump( + dqm->dev->kgd, pipe, queue, &dump, &n_regs); + if (r) + break; + + seq_printf(m, " CP Pipe %d, Queue %d\n", + pipe, queue); + seq_reg_dump(m, dump, n_regs); + + kfree(dump); + } + } + + for (pipe = 0; pipe < CIK_SDMA_ENGINE_NUM; pipe++) { + for (queue = 0; queue < CIK_SDMA_QUEUES_PER_ENGINE; queue++) { + r = dqm->dev->kfd2kgd->hqd_sdma_dump( + dqm->dev->kgd, pipe, queue, &dump, &n_regs); + if (r) + break; + + seq_printf(m, " SDMA Engine %d, RLC %d\n", + pipe, queue); + seq_reg_dump(m, dump, n_regs); + + kfree(dump); + } + } + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 5b77cb69f732..c61b693bfa8c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -84,8 +84,7 @@ struct device_process_node { struct device_queue_manager_ops { int (*create_queue)(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd, - int *allocate_vmid); + struct qcm_process_device *qpd); int (*destroy_queue)(struct device_queue_manager *dqm, struct qcm_process_device *qpd, @@ -123,6 +122,11 @@ struct device_queue_manager_ops { void __user *alternate_aperture_base, uint64_t alternate_aperture_size); + int (*set_trap_handler)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + uint64_t tba_addr, + uint64_t tma_addr); + int (*process_termination)(struct device_queue_manager *dqm, struct qcm_process_device *qpd); }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index feb76c235b1a..ebb4da14e3df 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -116,8 +116,7 @@ int kfd_doorbell_init(struct kfd_dev *kfd) pr_debug("doorbell aperture size == 0x%08lX\n", kfd->shared_resources.doorbell_aperture_size); - pr_debug("doorbell kernel address == 0x%08lX\n", - (uintptr_t)kfd->doorbell_kernel_ptr); + pr_debug("doorbell kernel address == %p\n", kfd->doorbell_kernel_ptr); return 0; } @@ -194,8 +193,8 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, pr_debug("Get kernel queue doorbell\n" " doorbell offset == 0x%08X\n" - " kernel address == 0x%08lX\n", - *doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx)); + " kernel address == %p\n", + *doorbell_off, (kfd->doorbell_kernel_ptr + inx)); return kfd->doorbell_kernel_ptr + inx; } @@ -215,7 +214,7 @@ inline void write_kernel_doorbell(u32 __iomem *db, u32 value) { if (db) { writel(value, db); - pr_debug("Writing %d to doorbell address 0x%p\n", value, db); + pr_debug("Writing %d to doorbell address %p\n", value, db); } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index cb92d4b72400..93aae5c1e78b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -441,7 +441,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function returns a locked process. + * running so the lookup function increments the process ref count. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -493,7 +493,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, } mutex_unlock(&p->event_mutex); - mutex_unlock(&p->mutex); + kfd_unref_process(p); } static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) @@ -847,7 +847,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function returns a locked process. + * running so the lookup function increments the process ref count. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct mm_struct *mm; @@ -860,7 +860,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, */ mm = get_task_mm(p->lead_thread); if (!mm) { - mutex_unlock(&p->mutex); + kfd_unref_process(p); return; /* Process is exiting */ } @@ -903,7 +903,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, &memory_exception_data); mutex_unlock(&p->event_mutex); - mutex_unlock(&p->mutex); + kfd_unref_process(p); } void kfd_signal_hw_exception_event(unsigned int pasid) @@ -911,7 +911,7 @@ void kfd_signal_hw_exception_event(unsigned int pasid) /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function returns a locked process. + * running so the lookup function increments the process ref count. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -924,5 +924,5 @@ void kfd_signal_hw_exception_event(unsigned int pasid) lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); mutex_unlock(&p->event_mutex); - mutex_unlock(&p->mutex); + kfd_unref_process(p); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index c59384bbbc5f..7377513050e6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -300,9 +300,14 @@ int kfd_init_apertures(struct kfd_process *process) struct kfd_process_device *pdd; /*Iterating over all devices*/ - while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && + while (kfd_topology_enum_kfd_devices(id, &dev) == 0 && id < NUM_OF_SUPPORTED_GPUS) { + if (!dev) { + id++; /* Skip non GPU devices */ + continue; + } + pdd = kfd_create_process_device_data(dev, process); if (!pdd) { pr_err("Failed to create process device data\n"); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 8b0c0645d7c0..5dc6567d4a13 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -218,7 +218,7 @@ static int acquire_packet_buffer(struct kernel_queue *kq, rptr = *kq->rptr_kernel; wptr = *kq->wptr_kernel; queue_address = (unsigned int *)kq->pq_kernel_addr; - queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); + queue_size_dwords = kq->queue->properties.queue_size / 4; pr_debug("rptr: %d\n", rptr); pr_debug("wptr: %d\n", wptr); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 6c5a9cab55de..3ac72bed4f31 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -24,6 +24,7 @@ #include <linux/sched.h> #include <linux/moduleparam.h> #include <linux/device.h> +#include <linux/printk.h> #include "kfd_priv.h" #define KFD_DRIVER_AUTHOR "AMD Inc. and others" @@ -49,6 +50,15 @@ module_param(sched_policy, int, 0444); MODULE_PARM_DESC(sched_policy, "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); +int hws_max_conc_proc = 8; +module_param(hws_max_conc_proc, int, 0444); +MODULE_PARM_DESC(hws_max_conc_proc, + "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); + +int cwsr_enable = 1; +module_param(cwsr_enable, int, 0444); +MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); + int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; module_param(max_num_of_queues_per_device, int, 0444); MODULE_PARM_DESC(max_num_of_queues_per_device, @@ -59,6 +69,11 @@ module_param(send_sigterm, int, 0444); MODULE_PARM_DESC(send_sigterm, "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)"); +int ignore_crat; +module_param(ignore_crat, int, 0444); +MODULE_PARM_DESC(ignore_crat, + "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); + static int amdkfd_init_completed; int kgd2kfd_init(unsigned int interface_version, @@ -113,6 +128,8 @@ static int __init kfd_module_init(void) kfd_process_create_wq(); + kfd_debugfs_init(); + amdkfd_init_completed = 1; dev_info(kfd_device, "Initialized module\n"); @@ -129,10 +146,11 @@ static void __exit kfd_module_exit(void) { amdkfd_init_completed = 0; + kfd_debugfs_fini(); kfd_process_destroy_wq(); kfd_topology_shutdown(); kfd_chardev_exit(); - dev_info(kfd_device, "Removed module\n"); + pr_info("amdkfd: Removed module\n"); } module_init(kfd_module_init); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index 1f3a6ba7eed2..8972bcfbf701 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -85,6 +85,10 @@ struct mqd_manager { uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); +#if defined(CONFIG_DEBUG_FS) + int (*debugfs_show_mqd)(struct seq_file *m, void *data); +#endif + struct mutex mqd_mutex; struct kfd_dev *dev; }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 4859d263fa2a..f8ef4a051e08 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -36,6 +36,11 @@ static inline struct cik_mqd *get_mqd(void *mqd) return (struct cik_mqd *)mqd; } +static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +{ + return (struct cik_sdma_rlc_registers *)mqd; +} + static int init_mqd(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -149,7 +154,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, { /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); - uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); + uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1); return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, (uint32_t __user *)p->write_ptr, @@ -160,7 +165,9 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, struct queue_properties *p, struct mm_struct *mms) { - return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); } static int update_mqd(struct mqd_manager *mm, void *mqd, @@ -176,8 +183,7 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, * Calculating queue size which is log base 2 of actual queue size -1 * dwords and another -1 for ffs */ - m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - - 1 - 1; + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); @@ -202,8 +208,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, struct cik_sdma_rlc_registers *m; m = get_sdma_mqd(mqd); - m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) << - SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + m->sdma_rlc_rb_cntl = order_base_2(q->queue_size / 4) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; @@ -343,8 +349,7 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, * Calculating queue size which is log base 2 of actual queue * size -1 dwords */ - m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - - 1 - 1; + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); @@ -360,15 +365,25 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, return 0; } -struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) -{ - struct cik_sdma_rlc_registers *m; +#if defined(CONFIG_DEBUG_FS) - m = (struct cik_sdma_rlc_registers *)mqd; +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct cik_mqd), false); + return 0; +} - return m; +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct cik_sdma_rlc_registers), false); + return 0; } +#endif + + struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_dev *dev) { @@ -392,6 +407,9 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif break; case KFD_MQD_TYPE_HIQ: mqd->init_mqd = init_mqd_hiq; @@ -400,6 +418,9 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif break; case KFD_MQD_TYPE_SDMA: mqd->init_mqd = init_mqd_sdma; @@ -408,6 +429,9 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif break; default: kfree(mqd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 4ea854f9007b..971aec0637dc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -30,7 +30,7 @@ #include "vi_structs.h" #include "gca/gfx_8_0_sh_mask.h" #include "gca/gfx_8_0_enum.h" - +#include "oss/oss_3_0_sh_mask.h" #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 static inline struct vi_mqd *get_mqd(void *mqd) @@ -38,6 +38,11 @@ static inline struct vi_mqd *get_mqd(void *mqd) return (struct vi_mqd *)mqd; } +static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct vi_sdma_mqd *)mqd; +} + static int init_mqd(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -84,6 +89,28 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_iq_rptr = 1; + if (q->tba_addr) { + m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); + m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); + m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); + m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); + m->compute_pgm_rsrc2 |= + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); + } + + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + *mqd = m; if (gart_addr) *gart_addr = addr; @@ -98,7 +125,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, { /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); - uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); + uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1); return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, (uint32_t __user *)p->write_ptr, @@ -116,8 +143,7 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT | atc_bit << CP_HQD_PQ_CONTROL__PQ_ATC__SHIFT | mtype << CP_HQD_PQ_CONTROL__MTYPE__SHIFT; - m->cp_hqd_pq_control |= - ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); @@ -147,7 +173,7 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, * is safe, giving a maximum field value of 0xA. */ m->cp_hqd_eop_control |= min(0xA, - ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); + order_base_2(q->eop_ring_buffer_size / 4) - 1); m->cp_hqd_eop_base_addr_lo = lower_32_bits(q->eop_ring_buffer_address >> 8); m->cp_hqd_eop_base_addr_hi = @@ -163,6 +189,11 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; } + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) + m->cp_hqd_ctx_save_control = + atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | + mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; + q->is_active = (q->queue_size > 0 && q->queue_address != 0 && q->queue_percent > 0); @@ -234,6 +265,117 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, return retval; } +static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct vi_sdma_mqd *m; + + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct vi_sdma_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct vi_sdma_mqd)); + + *mqd = m; + if (gart_addr != NULL) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct vi_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_doorbell = + q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; + + m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0); + + return 0; +} + +/* + * * preempt type here is ignored because there is only one way + * * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct vi_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct vi_sdma_mqd), false); + return 0; +} + +#endif + struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, struct kfd_dev *dev) { @@ -257,6 +399,9 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif break; case KFD_MQD_TYPE_HIQ: mqd->init_mqd = init_mqd_hiq; @@ -265,8 +410,20 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif break; case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif break; default: kfree(mqd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 16da8ad02d8b..0ecbd1f9b606 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -45,7 +45,7 @@ static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) header.u32All = 0; header.opcode = opcode; - header.count = packet_size/sizeof(uint32_t) - 2; + header.count = packet_size / 4 - 2; header.type = PM4_TYPE_3; return header.u32All; @@ -55,15 +55,27 @@ static void pm_calc_rlib_size(struct packet_manager *pm, unsigned int *rlib_size, bool *over_subscription) { - unsigned int process_count, queue_count; + unsigned int process_count, queue_count, compute_queue_count; unsigned int map_queue_size; + unsigned int max_proc_per_quantum = 1; + struct kfd_dev *dev = pm->dqm->dev; process_count = pm->dqm->processes_count; queue_count = pm->dqm->queue_count; + compute_queue_count = queue_count - pm->dqm->sdma_queue_count; - /* check if there is over subscription*/ + /* check if there is over subscription + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). + */ *over_subscription = false; - if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) { + + if (dev->max_proc_per_quantum > 1) + max_proc_per_quantum = dev->max_proc_per_quantum; + + if ((process_count > max_proc_per_quantum) || + compute_queue_count > get_queues_num(pm->dqm)) { *over_subscription = true; pr_debug("Over subscribed runlist\n"); } @@ -116,10 +128,24 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, uint64_t ib, size_t ib_size_in_dwords, bool chain) { struct pm4_mes_runlist *packet; + int concurrent_proc_cnt = 0; + struct kfd_dev *kfd = pm->dqm->dev; if (WARN_ON(!ib)) return -EFAULT; + /* Determine the number of processes to map together to HW: + * it can not exceed the number of VMIDs available to the + * scheduler, and it is determined by the smaller of the number + * of processes in the runlist and kfd module parameter + * hws_max_conc_proc. + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). + */ + concurrent_proc_cnt = min(pm->dqm->processes_count, + kfd->max_proc_per_quantum); + packet = (struct pm4_mes_runlist *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_runlist)); @@ -130,6 +156,7 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, packet->bitfields4.chain = chain ? 1 : 0; packet->bitfields4.offload_polling = 0; packet->bitfields4.valid = 1; + packet->bitfields4.process_cnt = concurrent_proc_cnt; packet->ordinal2 = lower_32_bits(ib); packet->bitfields3.ib_base_hi = upper_32_bits(ib); @@ -251,6 +278,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, return retval; *rl_size_bytes = alloc_size_bytes; + pm->ib_size_bytes = alloc_size_bytes; pr_debug("Building runlist ib process count: %d queues count %d\n", pm->dqm->processes_count, pm->dqm->queue_count); @@ -564,3 +592,26 @@ void pm_release_ib(struct packet_manager *pm) } mutex_unlock(&pm->lock); } + +#if defined(CONFIG_DEBUG_FS) + +int pm_debugfs_runlist(struct seq_file *m, void *data) +{ + struct packet_manager *pm = data; + + mutex_lock(&pm->lock); + + if (!pm->allocated) { + seq_puts(m, " No active runlist\n"); + goto out; + } + + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); + +out: + mutex_unlock(&pm->lock); + return 0; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c index d6a796144269..15fff4420e53 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c @@ -59,7 +59,7 @@ unsigned int kfd_pasid_alloc(void) struct kfd_dev *dev = NULL; unsigned int i = 0; - while ((dev = kfd_topology_enum_kfd_devices(i)) != NULL) { + while ((kfd_topology_enum_kfd_devices(i, &dev)) == 0) { if (dev && dev->kfd2kgd) { kfd2kgd = dev->kfd2kgd; break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 9e4134c5b481..6a48d29ada47 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -33,6 +33,8 @@ #include <linux/kfd_ioctl.h> #include <linux/idr.h> #include <linux/kfifo.h> +#include <linux/seq_file.h> +#include <linux/kref.h> #include <kgd_kfd_interface.h> #include "amd_shared.h" @@ -41,6 +43,7 @@ #define KFD_MMAP_DOORBELL_MASK 0x8000000000000 #define KFD_MMAP_EVENTS_MASK 0x4000000000000 +#define KFD_MMAP_RESERVED_MEM_MASK 0x2000000000000 /* * When working with cp scheduler we should assign the HIQ manually or via @@ -63,6 +66,15 @@ #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024 /* + * Size of the per-process TBA+TMA buffer: 2 pages + * + * The first page is the TBA used for the CWSR ISA code. The second + * page is used as TMA for daisy changing a user-mode trap handler. + */ +#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2) +#define KFD_CWSR_TMA_OFFSET PAGE_SIZE + +/* * Kernel module parameter to specify maximum number of supported queues per * device */ @@ -79,11 +91,25 @@ extern int max_num_of_queues_per_device; extern int sched_policy; /* + * Kernel module parameter to specify the maximum process + * number per HW scheduler + */ +extern int hws_max_conc_proc; + +extern int cwsr_enable; + +/* * Kernel module parameter to specify whether to send sigterm to HSA process on * unhandled exception */ extern int send_sigterm; +/* + * Ignore CRAT table during KFD initialization, can be used to work around + * broken CRAT tables on some AMD systems + */ +extern int ignore_crat; + /** * enum kfd_sched_policy * @@ -131,6 +157,7 @@ struct kfd_device_info { size_t ih_ring_entry_size; uint8_t num_of_watch_points; uint16_t mqd_size_aligned; + bool supports_cwsr; }; struct kfd_mem_obj { @@ -200,6 +227,14 @@ struct kfd_dev { /* Debug manager */ struct kfd_dbgmgr *dbgmgr; + + /* Maximum process number mapped to HW scheduler */ + unsigned int max_proc_per_quantum; + + /* CWSR */ + bool cwsr_enabled; + const void *cwsr_isa; + unsigned int cwsr_isa_size; }; /* KGD2KFD callbacks */ @@ -332,6 +367,9 @@ struct queue_properties { uint32_t eop_ring_buffer_size; uint64_t ctx_save_restore_area_address; uint32_t ctx_save_restore_area_size; + uint32_t ctl_stack_size; + uint64_t tba_addr; + uint64_t tma_addr; }; /** @@ -439,6 +477,11 @@ struct qcm_process_device { uint32_t num_gws; uint32_t num_oac; uint32_t sh_hidden_private_base; + + /* CWSR memory */ + void *cwsr_kaddr; + uint64_t tba_addr; + uint64_t tma_addr; }; @@ -501,6 +544,9 @@ struct kfd_process { */ void *mm; + struct kref ref; + struct work_struct release_work; + struct mutex mutex; /* @@ -563,9 +609,10 @@ struct amdkfd_ioctl_desc { void kfd_process_create_wq(void); void kfd_process_destroy_wq(void); -struct kfd_process *kfd_create_process(const struct task_struct *); +struct kfd_process *kfd_create_process(struct file *filep); struct kfd_process *kfd_get_process(const struct task_struct *); struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); +void kfd_unref_process(struct kfd_process *p); struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, struct kfd_process *p); @@ -577,6 +624,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, struct kfd_process *p); +int kfd_reserved_mem_mmap(struct kfd_process *process, + struct vm_area_struct *vma); + /* Process device data iterator */ struct kfd_process_device *kfd_get_first_process_device_data( struct kfd_process *p); @@ -624,9 +674,12 @@ int kfd_topology_init(void); void kfd_topology_shutdown(void); int kfd_topology_add_device(struct kfd_dev *gpu); int kfd_topology_remove_device(struct kfd_dev *gpu); +struct kfd_topology_device *kfd_topology_device_by_proximity_domain( + uint32_t proximity_domain); struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); -struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); +int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); +int kfd_numa_node_to_apic_id(int numa_node_id); /* Interrupts */ int kfd_interrupt_init(struct kfd_dev *dev); @@ -643,8 +696,6 @@ int kgd2kfd_resume(struct kfd_dev *kfd); int kfd_init_apertures(struct kfd_process *process); /* Queue Context Management */ -struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); - int init_queue(struct queue **q, const struct queue_properties *properties); void uninit_queue(struct queue *q); void print_queue_properties(struct queue_properties *q); @@ -699,6 +750,7 @@ struct packet_manager { struct mutex lock; bool allocated; struct kfd_mem_obj *ib_buffer_obj; + unsigned int ib_size_bytes; }; int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); @@ -745,4 +797,23 @@ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); +/* Debugfs */ +#if defined(CONFIG_DEBUG_FS) + +void kfd_debugfs_init(void); +void kfd_debugfs_fini(void); +int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data); +int pqm_debugfs_mqds(struct seq_file *m, void *data); +int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data); +int dqm_debugfs_hqds(struct seq_file *m, void *data); +int kfd_debugfs_rls_by_device(struct seq_file *m, void *data); +int pm_debugfs_runlist(struct seq_file *m, void *data); + +#else + +static inline void kfd_debugfs_init(void) {} +static inline void kfd_debugfs_fini(void) {} + +#endif + #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 1f5ccd28bd41..a22fb0710f15 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -24,10 +24,12 @@ #include <linux/log2.h> #include <linux/sched.h> #include <linux/sched/mm.h> +#include <linux/sched/task.h> #include <linux/slab.h> #include <linux/amd-iommu.h> #include <linux/notifier.h> #include <linux/compat.h> +#include <linux/mman.h> struct mm_struct; @@ -46,13 +48,12 @@ DEFINE_STATIC_SRCU(kfd_processes_srcu); static struct workqueue_struct *kfd_process_wq; -struct kfd_process_release_work { - struct work_struct kfd_work; - struct kfd_process *p; -}; - static struct kfd_process *find_process(const struct task_struct *thread); -static struct kfd_process *create_process(const struct task_struct *thread); +static void kfd_process_ref_release(struct kref *ref); +static struct kfd_process *create_process(const struct task_struct *thread, + struct file *filep); +static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); + void kfd_process_create_wq(void) { @@ -68,9 +69,10 @@ void kfd_process_destroy_wq(void) } } -struct kfd_process *kfd_create_process(const struct task_struct *thread) +struct kfd_process *kfd_create_process(struct file *filep) { struct kfd_process *process; + struct task_struct *thread = current; if (!thread->mm) return ERR_PTR(-EINVAL); @@ -79,9 +81,6 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) if (thread->group_leader->mm != thread->mm) return ERR_PTR(-EINVAL); - /* Take mmap_sem because we call __mmu_notifier_register inside */ - down_write(&thread->mm->mmap_sem); - /* * take kfd processes mutex before starting of process creation * so there won't be a case where two threads of the same process @@ -93,14 +92,11 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) process = find_process(thread); if (process) pr_debug("Process already found\n"); - - if (!process) - process = create_process(thread); + else + process = create_process(thread, filep); mutex_unlock(&kfd_processes_mutex); - up_write(&thread->mm->mmap_sem); - return process; } @@ -144,63 +140,75 @@ static struct kfd_process *find_process(const struct task_struct *thread) return p; } -static void kfd_process_wq_release(struct work_struct *work) +void kfd_unref_process(struct kfd_process *p) +{ + kref_put(&p->ref, kfd_process_ref_release); +} + +static void kfd_process_destroy_pdds(struct kfd_process *p) { - struct kfd_process_release_work *my_work; struct kfd_process_device *pdd, *temp; - struct kfd_process *p; - my_work = (struct kfd_process_release_work *) work; + list_for_each_entry_safe(pdd, temp, &p->per_device_data, + per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n", + pdd->dev->id, p->pasid); - p = my_work->p; + list_del(&pdd->per_device_list); - pr_debug("Releasing process (pasid %d) in workqueue\n", - p->pasid); + if (pdd->qpd.cwsr_kaddr) + free_pages((unsigned long)pdd->qpd.cwsr_kaddr, + get_order(KFD_CWSR_TBA_TMA_SIZE)); - mutex_lock(&p->mutex); + kfree(pdd); + } +} - list_for_each_entry_safe(pdd, temp, &p->per_device_data, - per_device_list) { - pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", - pdd->dev->id, p->pasid); +/* No process locking is needed in this function, because the process + * is not findable any more. We must assume that no other thread is + * using it any more, otherwise we couldn't safely free the process + * structure in the end. + */ +static void kfd_process_wq_release(struct work_struct *work) +{ + struct kfd_process *p = container_of(work, struct kfd_process, + release_work); + struct kfd_process_device *pdd; + + pr_debug("Releasing process (pasid %d) in workqueue\n", p->pasid); + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { if (pdd->bound == PDD_BOUND) amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); - - list_del(&pdd->per_device_list); - kfree(pdd); } + kfd_process_destroy_pdds(p); + kfd_event_free_process(p); kfd_pasid_free(p->pasid); kfd_free_process_doorbells(p); - mutex_unlock(&p->mutex); - mutex_destroy(&p->mutex); - kfree(p); + put_task_struct(p->lead_thread); - kfree(work); + kfree(p); } -static void kfd_process_destroy_delayed(struct rcu_head *rcu) +static void kfd_process_ref_release(struct kref *ref) { - struct kfd_process_release_work *work; - struct kfd_process *p; + struct kfd_process *p = container_of(ref, struct kfd_process, ref); - p = container_of(rcu, struct kfd_process, rcu); - - mmdrop(p->mm); + INIT_WORK(&p->release_work, kfd_process_wq_release); + queue_work(kfd_process_wq, &p->release_work); +} - work = kmalloc(sizeof(struct kfd_process_release_work), GFP_ATOMIC); +static void kfd_process_destroy_delayed(struct rcu_head *rcu) +{ + struct kfd_process *p = container_of(rcu, struct kfd_process, rcu); - if (work) { - INIT_WORK((struct work_struct *) work, kfd_process_wq_release); - work->p = p; - queue_work(kfd_process_wq, (struct work_struct *) work); - } + kfd_unref_process(p); } static void kfd_process_notifier_release(struct mmu_notifier *mn, @@ -244,15 +252,12 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, kfd_process_dequeue_from_all_devices(p); pqm_uninit(&p->pqm); + /* Indicate to other users that MM is no longer valid */ + p->mm = NULL; + mutex_unlock(&p->mutex); - /* - * Because we drop mm_count inside kfd_process_destroy_delayed - * and because the mmu_notifier_unregister function also drop - * mm_count we need to take an extra count here. - */ - mmgrab(p->mm); - mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm); + mmu_notifier_unregister_no_release(&p->mmu_notifier, mm); mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); } @@ -260,7 +265,44 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { .release = kfd_process_notifier_release, }; -static struct kfd_process *create_process(const struct task_struct *thread) +static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep) +{ + unsigned long offset; + struct kfd_process_device *pdd = NULL; + struct kfd_dev *dev = NULL; + struct qcm_process_device *qpd = NULL; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + dev = pdd->dev; + qpd = &pdd->qpd; + if (!dev->cwsr_enabled || qpd->cwsr_kaddr) + continue; + offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT; + qpd->tba_addr = (int64_t)vm_mmap(filep, 0, + KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, + MAP_SHARED, offset); + + if (IS_ERR_VALUE(qpd->tba_addr)) { + int err = qpd->tba_addr; + + pr_err("Failure to set tba address. error %d.\n", err); + qpd->tba_addr = 0; + qpd->cwsr_kaddr = NULL; + return err; + } + + memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size); + + qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET; + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n", + qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr); + } + + return 0; +} + +static struct kfd_process *create_process(const struct task_struct *thread, + struct file *filep) { struct kfd_process *process; int err = -ENOMEM; @@ -277,13 +319,15 @@ static struct kfd_process *create_process(const struct task_struct *thread) if (kfd_alloc_process_doorbells(process) < 0) goto err_alloc_doorbells; + kref_init(&process->ref); + mutex_init(&process->mutex); process->mm = thread->mm; /* register notifier */ process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; - err = __mmu_notifier_register(&process->mmu_notifier, process->mm); + err = mmu_notifier_register(&process->mmu_notifier, process->mm); if (err) goto err_mmu_notifier; @@ -291,6 +335,7 @@ static struct kfd_process *create_process(const struct task_struct *thread) (uintptr_t)process->mm); process->lead_thread = thread->group_leader; + get_task_struct(process->lead_thread); INIT_LIST_HEAD(&process->per_device_data); @@ -306,8 +351,14 @@ static struct kfd_process *create_process(const struct task_struct *thread) if (err != 0) goto err_init_apertures; + err = kfd_process_init_cwsr(process, filep); + if (err) + goto err_init_cwsr; + return process; +err_init_cwsr: + kfd_process_destroy_pdds(process); err_init_apertures: pqm_uninit(&process->pqm); err_process_pqm_init: @@ -343,16 +394,18 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, struct kfd_process_device *pdd = NULL; pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); - if (pdd != NULL) { - pdd->dev = dev; - INIT_LIST_HEAD(&pdd->qpd.queues_list); - INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); - pdd->qpd.dqm = dev->dqm; - pdd->process = p; - pdd->bound = PDD_UNBOUND; - pdd->already_dequeued = false; - list_add(&pdd->per_device_list, &p->per_device_data); - } + if (!pdd) + return NULL; + + pdd->dev = dev; + INIT_LIST_HEAD(&pdd->qpd.queues_list); + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; + pdd->qpd.pqm = &p->pqm; + pdd->process = p; + pdd->bound = PDD_UNBOUND; + pdd->already_dequeued = false; + list_add(&pdd->per_device_list, &p->per_device_data); return pdd; } @@ -483,6 +536,8 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) mutex_unlock(kfd_get_dbgmgr_mutex()); + mutex_lock(&p->mutex); + pdd = kfd_get_process_device_data(dev, p); if (pdd) /* For GPU relying on IOMMU, we need to dequeue here @@ -491,6 +546,8 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) kfd_process_dequeue_from_device(pdd); mutex_unlock(&p->mutex); + + kfd_unref_process(p); } struct kfd_process_device *kfd_get_first_process_device_data( @@ -515,22 +572,86 @@ bool kfd_has_process_device_data(struct kfd_process *p) return !(list_empty(&p->per_device_data)); } -/* This returns with process->mutex locked. */ +/* This increments the process->ref counter. */ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) { - struct kfd_process *p; + struct kfd_process *p, *ret_p = NULL; unsigned int temp; int idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { if (p->pasid == pasid) { - mutex_lock(&p->mutex); + kref_get(&p->ref); + ret_p = p; break; } } srcu_read_unlock(&kfd_processes_srcu, idx); - return p; + return ret_p; } + +int kfd_reserved_mem_mmap(struct kfd_process *process, + struct vm_area_struct *vma) +{ + struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); + struct kfd_process_device *pdd; + struct qcm_process_device *qpd; + + if (!dev) + return -EINVAL; + if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { + pr_err("Incorrect CWSR mapping size.\n"); + return -EINVAL; + } + + pdd = kfd_get_process_device_data(dev, process); + if (!pdd) + return -EINVAL; + qpd = &pdd->qpd; + + qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(KFD_CWSR_TBA_TMA_SIZE)); + if (!qpd->cwsr_kaddr) { + pr_err("Error allocating per process CWSR buffer.\n"); + return -ENOMEM; + } + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND + | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; + /* Mapping pages to user process */ + return remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(__pa(qpd->cwsr_kaddr)), + KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot); +} + +#if defined(CONFIG_DEBUG_FS) + +int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) +{ + struct kfd_process *p; + unsigned int temp; + int r = 0; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + seq_printf(m, "Process %d PASID %d:\n", + p->lead_thread->tgid, p->pasid); + + mutex_lock(&p->mutex); + r = pqm_debugfs_mqds(m, &p->pqm); + mutex_unlock(&p->mutex); + + if (r) + break; + } + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 2bec902fc939..876380632668 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -178,10 +178,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, return retval; if (list_empty(&pdd->qpd.queues_list) && - list_empty(&pdd->qpd.priv_queue_list)) { - pdd->qpd.pqm = pqm; + list_empty(&pdd->qpd.priv_queue_list)) dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); - } pqn = kzalloc(sizeof(*pqn), GFP_KERNEL); if (!pqn) { @@ -191,6 +189,23 @@ int pqm_create_queue(struct process_queue_manager *pqm, switch (type) { case KFD_QUEUE_TYPE_SDMA: + if (dev->dqm->queue_count >= + CIK_SDMA_QUEUES_PER_ENGINE * CIK_SDMA_ENGINE_NUM) { + pr_err("Over-subscription is not allowed for SDMA.\n"); + retval = -EPERM; + goto err_create_queue; + } + + retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; + pqn->kq = NULL; + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd); + pr_debug("DQM returned %d for create_queue\n", retval); + print_queue(q); + break; + case KFD_QUEUE_TYPE_COMPUTE: /* check if there is over subscription */ if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && @@ -206,8 +221,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, goto err_create_queue; pqn->q = q; pqn->kq = NULL; - retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, - &q->properties.vmid); + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd); pr_debug("DQM returned %d for create_queue\n", retval); print_queue(q); break; @@ -297,6 +311,10 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (pqn->q) { dqm = pqn->q->device->dqm; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); + if (retval) { + pr_debug("Destroy queue failed, returned %d\n", retval); + goto err_destroy_queue; + } uninit_queue(pqn->q); } @@ -308,6 +326,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) list_empty(&pdd->qpd.priv_queue_list)) dqm->ops.unregister_process(dqm, &pdd->qpd); +err_destroy_queue: return retval; } @@ -349,4 +368,67 @@ struct kernel_queue *pqm_get_kernel_queue( return NULL; } +#if defined(CONFIG_DEBUG_FS) + +int pqm_debugfs_mqds(struct seq_file *m, void *data) +{ + struct process_queue_manager *pqm = data; + struct process_queue_node *pqn; + struct queue *q; + enum KFD_MQD_TYPE mqd_type; + struct mqd_manager *mqd_manager; + int r = 0; + + list_for_each_entry(pqn, &pqm->queues, process_queue_list) { + if (pqn->q) { + q = pqn->q; + switch (q->properties.type) { + case KFD_QUEUE_TYPE_SDMA: + seq_printf(m, " SDMA queue on device %x\n", + q->device->id); + mqd_type = KFD_MQD_TYPE_SDMA; + break; + case KFD_QUEUE_TYPE_COMPUTE: + seq_printf(m, " Compute queue on device %x\n", + q->device->id); + mqd_type = KFD_MQD_TYPE_CP; + break; + default: + seq_printf(m, + " Bad user queue type %d on device %x\n", + q->properties.type, q->device->id); + continue; + } + mqd_manager = q->device->dqm->ops.get_mqd_manager( + q->device->dqm, mqd_type); + } else if (pqn->kq) { + q = pqn->kq->queue; + mqd_manager = pqn->kq->mqd; + switch (q->properties.type) { + case KFD_QUEUE_TYPE_DIQ: + seq_printf(m, " DIQ on device %x\n", + pqn->kq->dev->id); + mqd_type = KFD_MQD_TYPE_HIQ; + break; + default: + seq_printf(m, + " Bad kernel queue type %d on device %x\n", + q->properties.type, + pqn->kq->dev->id); + continue; + } + } else { + seq_printf(m, + " Weird: Queue node with neither kernel nor user queue\n"); + continue; + } + + r = mqd_manager->debugfs_show_mqd(m, q->mqd); + if (r != 0) + break; + } + + return r; +} +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 19ce59028d6b..c6a76090a725 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -28,27 +28,32 @@ #include <linux/hash.h> #include <linux/cpufreq.h> #include <linux/log2.h> +#include <linux/dmi.h> +#include <linux/atomic.h> #include "kfd_priv.h" #include "kfd_crat.h" #include "kfd_topology.h" +#include "kfd_device_queue_manager.h" +/* topology_device_list - Master list of all topology devices */ static struct list_head topology_device_list; -static int topology_crat_parsed; static struct kfd_system_properties sys_props; static DECLARE_RWSEM(topology_lock); +static atomic_t topology_crat_proximity_domain; -struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) +struct kfd_topology_device *kfd_topology_device_by_proximity_domain( + uint32_t proximity_domain) { struct kfd_topology_device *top_dev; - struct kfd_dev *device = NULL; + struct kfd_topology_device *device = NULL; down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu_id == gpu_id) { - device = top_dev->gpu; + if (top_dev->proximity_domain == proximity_domain) { + device = top_dev; break; } @@ -57,7 +62,7 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) return device; } -struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) { struct kfd_topology_device *top_dev; struct kfd_dev *device = NULL; @@ -65,7 +70,7 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu->pdev == pdev) { + if (top_dev->gpu_id == gpu_id) { device = top_dev->gpu; break; } @@ -75,282 +80,31 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) return device; } -static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) -{ - struct acpi_table_header *crat_table; - acpi_status status; - - if (!size) - return -EINVAL; - - /* - * Fetch the CRAT table from ACPI - */ - status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); - if (status == AE_NOT_FOUND) { - pr_warn("CRAT table not found\n"); - return -ENODATA; - } else if (ACPI_FAILURE(status)) { - const char *err = acpi_format_exception(status); - - pr_err("CRAT table error: %s\n", err); - return -EINVAL; - } - - if (*size >= crat_table->length && crat_image != NULL) - memcpy(crat_image, crat_table, crat_table->length); - - *size = crat_table->length; - - return 0; -} - -static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, - struct crat_subtype_computeunit *cu) -{ - dev->node_props.cpu_cores_count = cu->num_cpu_cores; - dev->node_props.cpu_core_id_base = cu->processor_id_low; - if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) - dev->node_props.capability |= HSA_CAP_ATS_PRESENT; - - pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, - cu->processor_id_low); -} - -static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, - struct crat_subtype_computeunit *cu) -{ - dev->node_props.simd_id_base = cu->processor_id_low; - dev->node_props.simd_count = cu->num_simd_cores; - dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; - dev->node_props.max_waves_per_simd = cu->max_waves_simd; - dev->node_props.wave_front_size = cu->wave_front_size; - dev->node_props.mem_banks_count = cu->num_banks; - dev->node_props.array_count = cu->num_arrays; - dev->node_props.cu_per_simd_array = cu->num_cu_per_array; - dev->node_props.simd_per_cu = cu->num_simd_per_cu; - dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; - if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) - dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; - pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores, - cu->processor_id_low); -} - -/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ -static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) -{ - struct kfd_topology_device *dev; - int i = 0; - - pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", - cu->proximity_domain, cu->hsa_capability); - list_for_each_entry(dev, &topology_device_list, list) { - if (cu->proximity_domain == i) { - if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) - kfd_populated_cu_info_cpu(dev, cu); - - if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) - kfd_populated_cu_info_gpu(dev, cu); - break; - } - i++; - } - - return 0; -} - -/* - * kfd_parse_subtype_mem is called when the topology mutex is - * already acquired - */ -static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) -{ - struct kfd_mem_properties *props; - struct kfd_topology_device *dev; - int i = 0; - - pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", - mem->promixity_domain); - list_for_each_entry(dev, &topology_device_list, list) { - if (mem->promixity_domain == i) { - props = kfd_alloc_struct(props); - if (props == NULL) - return -ENOMEM; - - if (dev->node_props.cpu_cores_count == 0) - props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; - else - props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; - - if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) - props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; - if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) - props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; - - props->size_in_bytes = - ((uint64_t)mem->length_high << 32) + - mem->length_low; - props->width = mem->width; - - dev->mem_bank_count++; - list_add_tail(&props->list, &dev->mem_props); - - break; - } - i++; - } - - return 0; -} - -/* - * kfd_parse_subtype_cache is called when the topology mutex - * is already acquired - */ -static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) -{ - struct kfd_cache_properties *props; - struct kfd_topology_device *dev; - uint32_t id; - - id = cache->processor_id_low; - - pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); - list_for_each_entry(dev, &topology_device_list, list) - if (id == dev->node_props.cpu_core_id_base || - id == dev->node_props.simd_id_base) { - props = kfd_alloc_struct(props); - if (props == NULL) - return -ENOMEM; - - props->processor_id_low = id; - props->cache_level = cache->cache_level; - props->cache_size = cache->cache_size; - props->cacheline_size = cache->cache_line_size; - props->cachelines_per_tag = cache->lines_per_tag; - props->cache_assoc = cache->associativity; - props->cache_latency = cache->cache_latency; - - if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) - props->cache_type |= HSA_CACHE_TYPE_DATA; - if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) - props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; - if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) - props->cache_type |= HSA_CACHE_TYPE_CPU; - if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) - props->cache_type |= HSA_CACHE_TYPE_HSACU; - - dev->cache_count++; - dev->node_props.caches_count++; - list_add_tail(&props->list, &dev->cache_props); - - break; - } - - return 0; -} - -/* - * kfd_parse_subtype_iolink is called when the topology mutex - * is already acquired - */ -static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) { - struct kfd_iolink_properties *props; - struct kfd_topology_device *dev; - uint32_t i = 0; - uint32_t id_from; - uint32_t id_to; - - id_from = iolink->proximity_domain_from; - id_to = iolink->proximity_domain_to; + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; - pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); - list_for_each_entry(dev, &topology_device_list, list) { - if (id_from == i) { - props = kfd_alloc_struct(props); - if (props == NULL) - return -ENOMEM; - - props->node_from = id_from; - props->node_to = id_to; - props->ver_maj = iolink->version_major; - props->ver_min = iolink->version_minor; - - /* - * weight factor (derived from CDIR), currently always 1 - */ - props->weight = 1; - - props->min_latency = iolink->minimum_latency; - props->max_latency = iolink->maximum_latency; - props->min_bandwidth = iolink->minimum_bandwidth_mbs; - props->max_bandwidth = iolink->maximum_bandwidth_mbs; - props->rec_transfer_size = - iolink->recommended_transfer_size; - - dev->io_link_count++; - dev->node_props.io_links_count++; - list_add_tail(&props->list, &dev->io_link_props); + down_read(&topology_lock); + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu->pdev == pdev) { + device = top_dev->gpu; break; } - i++; - } - return 0; -} - -static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) -{ - struct crat_subtype_computeunit *cu; - struct crat_subtype_memory *mem; - struct crat_subtype_cache *cache; - struct crat_subtype_iolink *iolink; - int ret = 0; - - switch (sub_type_hdr->type) { - case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: - cu = (struct crat_subtype_computeunit *)sub_type_hdr; - ret = kfd_parse_subtype_cu(cu); - break; - case CRAT_SUBTYPE_MEMORY_AFFINITY: - mem = (struct crat_subtype_memory *)sub_type_hdr; - ret = kfd_parse_subtype_mem(mem); - break; - case CRAT_SUBTYPE_CACHE_AFFINITY: - cache = (struct crat_subtype_cache *)sub_type_hdr; - ret = kfd_parse_subtype_cache(cache); - break; - case CRAT_SUBTYPE_TLB_AFFINITY: - /* - * For now, nothing to do here - */ - pr_info("Found TLB entry in CRAT table (not processing)\n"); - break; - case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: - /* - * For now, nothing to do here - */ - pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n"); - break; - case CRAT_SUBTYPE_IOLINK_AFFINITY: - iolink = (struct crat_subtype_iolink *)sub_type_hdr; - ret = kfd_parse_subtype_iolink(iolink); - break; - default: - pr_warn("Unknown subtype (%d) in CRAT\n", - sub_type_hdr->type); - } + up_read(&topology_lock); - return ret; + return device; } +/* Called with write topology_lock acquired */ static void kfd_release_topology_device(struct kfd_topology_device *dev) { struct kfd_mem_properties *mem; struct kfd_cache_properties *cache; struct kfd_iolink_properties *iolink; + struct kfd_perf_properties *perf; list_del(&dev->list); @@ -375,25 +129,35 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) kfree(iolink); } - kfree(dev); + while (dev->perf_props.next != &dev->perf_props) { + perf = container_of(dev->perf_props.next, + struct kfd_perf_properties, list); + list_del(&perf->list); + kfree(perf); + } - sys_props.num_devices--; + kfree(dev); } -static void kfd_release_live_view(void) +void kfd_release_topology_device_list(struct list_head *device_list) { struct kfd_topology_device *dev; - while (topology_device_list.next != &topology_device_list) { - dev = container_of(topology_device_list.next, - struct kfd_topology_device, list); + while (!list_empty(device_list)) { + dev = list_first_entry(device_list, + struct kfd_topology_device, list); kfd_release_topology_device(dev); + } } +static void kfd_release_live_view(void) +{ + kfd_release_topology_device_list(&topology_device_list); memset(&sys_props, 0, sizeof(sys_props)); } -static struct kfd_topology_device *kfd_create_topology_device(void) +struct kfd_topology_device *kfd_create_topology_device( + struct list_head *device_list) { struct kfd_topology_device *dev; @@ -406,65 +170,13 @@ static struct kfd_topology_device *kfd_create_topology_device(void) INIT_LIST_HEAD(&dev->mem_props); INIT_LIST_HEAD(&dev->cache_props); INIT_LIST_HEAD(&dev->io_link_props); + INIT_LIST_HEAD(&dev->perf_props); - list_add_tail(&dev->list, &topology_device_list); - sys_props.num_devices++; + list_add_tail(&dev->list, device_list); return dev; } -static int kfd_parse_crat_table(void *crat_image) -{ - struct kfd_topology_device *top_dev; - struct crat_subtype_generic *sub_type_hdr; - uint16_t node_id; - int ret; - struct crat_header *crat_table = (struct crat_header *)crat_image; - uint16_t num_nodes; - uint32_t image_len; - - if (!crat_image) - return -EINVAL; - - num_nodes = crat_table->num_domains; - image_len = crat_table->length; - - pr_info("Parsing CRAT table with %d nodes\n", num_nodes); - - for (node_id = 0; node_id < num_nodes; node_id++) { - top_dev = kfd_create_topology_device(); - if (!top_dev) { - kfd_release_live_view(); - return -ENOMEM; - } - } - - sys_props.platform_id = - (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK; - sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id); - sys_props.platform_rev = crat_table->revision; - - sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); - while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < - ((char *)crat_image) + image_len) { - if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { - ret = kfd_parse_subtype(sub_type_hdr); - if (ret != 0) { - kfd_release_live_view(); - return ret; - } - } - - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length); - } - - sys_props.generation_count++; - topology_crat_parsed = 1; - - return 0; -} - #define sysfs_show_gen_prop(buffer, fmt, ...) \ snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) @@ -501,11 +213,17 @@ static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr, return ret; } +static void kfd_topology_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + static const struct sysfs_ops sysprops_ops = { .show = sysprops_show, }; static struct kobj_type sysprops_type = { + .release = kfd_topology_kobj_release, .sysfs_ops = &sysprops_ops, }; @@ -541,6 +259,7 @@ static const struct sysfs_ops iolink_ops = { }; static struct kobj_type iolink_type = { + .release = kfd_topology_kobj_release, .sysfs_ops = &iolink_ops, }; @@ -568,6 +287,7 @@ static const struct sysfs_ops mem_ops = { }; static struct kobj_type mem_type = { + .release = kfd_topology_kobj_release, .sysfs_ops = &mem_ops, }; @@ -575,7 +295,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, char *buffer) { ssize_t ret; - uint32_t i; + uint32_t i, j; struct kfd_cache_properties *cache; /* Making sure that the buffer is an empty string */ @@ -593,12 +313,18 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); sysfs_show_32bit_prop(buffer, "type", cache->cache_type); snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); - for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++) - ret = snprintf(buffer, PAGE_SIZE, "%s%d%s", - buffer, cache->sibling_map[i], - (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ? - "\n" : ","); - + for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) + for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { + /* Check each bit */ + if (cache->sibling_map[i] & (1 << j)) + ret = snprintf(buffer, PAGE_SIZE, + "%s%d%s", buffer, 1, ","); + else + ret = snprintf(buffer, PAGE_SIZE, + "%s%d%s", buffer, 0, ","); + } + /* Replace the last "," with end of line */ + *(buffer + strlen(buffer) - 1) = 0xA; return ret; } @@ -607,9 +333,43 @@ static const struct sysfs_ops cache_ops = { }; static struct kobj_type cache_type = { + .release = kfd_topology_kobj_release, .sysfs_ops = &cache_ops, }; +/****** Sysfs of Performance Counters ******/ + +struct kfd_perf_attr { + struct kobj_attribute attr; + uint32_t data; +}; + +static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs, + char *buf) +{ + struct kfd_perf_attr *attr; + + buf[0] = 0; + attr = container_of(attrs, struct kfd_perf_attr, attr); + if (!attr->data) /* invalid data for PMC */ + return 0; + else + return sysfs_show_32bit_val(buf, attr->data); +} + +#define KFD_PERF_DESC(_name, _data) \ +{ \ + .attr = __ATTR(_name, 0444, perf_show, NULL), \ + .data = _data, \ +} + +static struct kfd_perf_attr perf_attr_iommu[] = { + KFD_PERF_DESC(max_concurrent, 0), + KFD_PERF_DESC(num_counters, 0), + KFD_PERF_DESC(counter_ids, 0), +}; +/****************************************/ + static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char *buffer) { @@ -646,18 +406,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.cpu_cores_count); sysfs_show_32bit_prop(buffer, "simd_count", dev->node_props.simd_count); - - if (dev->mem_bank_count < dev->node_props.mem_banks_count) { - pr_info_once("mem_banks_count truncated from %d to %d\n", - dev->node_props.mem_banks_count, - dev->mem_bank_count); - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->mem_bank_count); - } else { - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->node_props.mem_banks_count); - } - + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); sysfs_show_32bit_prop(buffer, "caches_count", dev->node_props.caches_count); sysfs_show_32bit_prop(buffer, "io_links_count", @@ -705,9 +455,12 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); } + if (dev->gpu->device_info->asic_family == CHIP_TONGA) + dev->node_props.capability |= + HSA_CAP_AQL_QUEUE_DOUBLE_MAP; + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", - dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( - dev->gpu->kgd)); + dev->node_props.max_engine_clk_fcompute); sysfs_show_64bit_prop(buffer, "local_mem_size", (unsigned long long int) 0); @@ -729,6 +482,7 @@ static const struct sysfs_ops node_ops = { }; static struct kobj_type node_type = { + .release = kfd_topology_kobj_release, .sysfs_ops = &node_ops, }; @@ -744,6 +498,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) struct kfd_iolink_properties *iolink; struct kfd_cache_properties *cache; struct kfd_mem_properties *mem; + struct kfd_perf_properties *perf; if (dev->kobj_iolink) { list_for_each_entry(iolink, &dev->io_link_props, list) @@ -780,6 +535,16 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) dev->kobj_mem = NULL; } + if (dev->kobj_perf) { + list_for_each_entry(perf, &dev->perf_props, list) { + kfree(perf->attr_group); + perf->attr_group = NULL; + } + kobject_del(dev->kobj_perf); + kobject_put(dev->kobj_perf); + dev->kobj_perf = NULL; + } + if (dev->kobj_node) { sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); sysfs_remove_file(dev->kobj_node, &dev->attr_name); @@ -796,8 +561,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, struct kfd_iolink_properties *iolink; struct kfd_cache_properties *cache; struct kfd_mem_properties *mem; + struct kfd_perf_properties *perf; int ret; - uint32_t i; + uint32_t i, num_attrs; + struct attribute **attrs; if (WARN_ON(dev->kobj_node)) return -EEXIST; @@ -826,6 +593,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (!dev->kobj_iolink) return -ENOMEM; + dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node); + if (!dev->kobj_perf) + return -ENOMEM; + /* * Creating sysfs files for node properties */ @@ -903,11 +674,38 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (ret < 0) return ret; i++; -} + } + + /* All hardware blocks have the same number of attributes. */ + num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr); + list_for_each_entry(perf, &dev->perf_props, list) { + perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr) + * num_attrs + sizeof(struct attribute_group), + GFP_KERNEL); + if (!perf->attr_group) + return -ENOMEM; + + attrs = (struct attribute **)(perf->attr_group + 1); + if (!strcmp(perf->block_name, "iommu")) { + /* Information of IOMMU's num_counters and counter_ids is shown + * under /sys/bus/event_source/devices/amd_iommu. We don't + * duplicate here. + */ + perf_attr_iommu[0].data = perf->max_concurrent; + for (i = 0; i < num_attrs; i++) + attrs[i] = &perf_attr_iommu[i].attr.attr; + } + perf->attr_group->name = perf->block_name; + perf->attr_group->attrs = attrs; + ret = sysfs_create_group(dev->kobj_perf, perf->attr_group); + if (ret < 0) + return ret; + } return 0; } +/* Called with write topology lock acquired */ static int kfd_build_sysfs_node_tree(void) { struct kfd_topology_device *dev; @@ -924,6 +722,7 @@ static int kfd_build_sysfs_node_tree(void) return 0; } +/* Called with write topology lock acquired */ static void kfd_remove_sysfs_node_tree(void) { struct kfd_topology_device *dev; @@ -995,75 +794,246 @@ static void kfd_topology_release_sysfs(void) } } +/* Called with write topology_lock acquired */ +static void kfd_topology_update_device_list(struct list_head *temp_list, + struct list_head *master_list) +{ + while (!list_empty(temp_list)) { + list_move_tail(temp_list->next, master_list); + sys_props.num_devices++; + } +} + +static void kfd_debug_print_topology(void) +{ + struct kfd_topology_device *dev; + + down_read(&topology_lock); + + dev = list_last_entry(&topology_device_list, + struct kfd_topology_device, list); + if (dev) { + if (dev->node_props.cpu_cores_count && + dev->node_props.simd_count) { + pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", + dev->node_props.device_id, + dev->node_props.vendor_id); + } else if (dev->node_props.cpu_cores_count) + pr_info("Topology: Add CPU node\n"); + else if (dev->node_props.simd_count) + pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", + dev->node_props.device_id, + dev->node_props.vendor_id); + } + up_read(&topology_lock); +} + +/* Helper function for intializing platform_xx members of + * kfd_system_properties. Uses OEM info from the last CPU/APU node. + */ +static void kfd_update_system_properties(void) +{ + struct kfd_topology_device *dev; + + down_read(&topology_lock); + dev = list_last_entry(&topology_device_list, + struct kfd_topology_device, list); + if (dev) { + sys_props.platform_id = + (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); + sys_props.platform_rev = dev->oem_revision; + } + up_read(&topology_lock); +} + +static void find_system_memory(const struct dmi_header *dm, + void *private) +{ + struct kfd_mem_properties *mem; + u16 mem_width, mem_clock; + struct kfd_topology_device *kdev = + (struct kfd_topology_device *)private; + const u8 *dmi_data = (const u8 *)(dm + 1); + + if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { + mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); + mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); + list_for_each_entry(mem, &kdev->mem_props, list) { + if (mem_width != 0xFFFF && mem_width != 0) + mem->width = mem_width; + if (mem_clock != 0) + mem->mem_clk_max = mem_clock; + } + } +} + +/* + * Performance counters information is not part of CRAT but we would like to + * put them in the sysfs under topology directory for Thunk to get the data. + * This function is called before updating the sysfs. + */ +static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) +{ + struct kfd_perf_properties *props; + + if (amd_iommu_pc_supported()) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + strcpy(props->block_name, "iommu"); + props->max_concurrent = amd_iommu_pc_get_max_banks(0) * + amd_iommu_pc_get_max_counters(0); /* assume one iommu */ + list_add_tail(&props->list, &kdev->perf_props); + } + + return 0; +} + +/* kfd_add_non_crat_information - Add information that is not currently + * defined in CRAT but is necessary for KFD topology + * @dev - topology device to which addition info is added + */ +static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) +{ + /* Check if CPU only node. */ + if (!kdev->gpu) { + /* Add system memory information */ + dmi_walk(find_system_memory, kdev); + } + /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ +} + +/* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices. + * Ignore CRAT for all other devices. AMD APU is identified if both CPU + * and GPU cores are present. + * @device_list - topology device list created by parsing ACPI CRAT table. + * @return - TRUE if invalid, FALSE is valid. + */ +static bool kfd_is_acpi_crat_invalid(struct list_head *device_list) +{ + struct kfd_topology_device *dev; + + list_for_each_entry(dev, device_list, list) { + if (dev->node_props.cpu_cores_count && + dev->node_props.simd_count) + return false; + } + pr_info("Ignoring ACPI CRAT on non-APU system\n"); + return true; +} + int kfd_topology_init(void) { void *crat_image = NULL; size_t image_size = 0; int ret; - - /* - * Initialize the head for the topology device list + struct list_head temp_topology_device_list; + int cpu_only_node = 0; + struct kfd_topology_device *kdev; + int proximity_domain; + + /* topology_device_list - Master list of all topology devices + * temp_topology_device_list - temporary list created while parsing CRAT + * or VCRAT. Once parsing is complete the contents of list is moved to + * topology_device_list */ + + /* Initialize the head for the both the lists */ INIT_LIST_HEAD(&topology_device_list); + INIT_LIST_HEAD(&temp_topology_device_list); init_rwsem(&topology_lock); - topology_crat_parsed = 0; memset(&sys_props, 0, sizeof(sys_props)); + /* Proximity domains in ACPI CRAT tables start counting at + * 0. The same should be true for virtual CRAT tables created + * at this stage. GPUs added later in kfd_topology_add_device + * use a counter. + */ + proximity_domain = 0; + /* - * Get the CRAT image from the ACPI + * Get the CRAT image from the ACPI. If ACPI doesn't have one + * or if ACPI CRAT is invalid create a virtual CRAT. + * NOTE: The current implementation expects all AMD APUs to have + * CRAT. If no CRAT is available, it is assumed to be a CPU */ - ret = kfd_topology_get_crat_acpi(crat_image, &image_size); - if (ret == 0 && image_size > 0) { - pr_info("Found CRAT image with size=%zd\n", image_size); - crat_image = kmalloc(image_size, GFP_KERNEL); - if (!crat_image) { - ret = -ENOMEM; - pr_err("No memory for allocating CRAT image\n"); - goto err; + ret = kfd_create_crat_image_acpi(&crat_image, &image_size); + if (!ret) { + ret = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (ret || + kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { + kfd_release_topology_device_list( + &temp_topology_device_list); + kfd_destroy_crat_image(crat_image); + crat_image = NULL; } - ret = kfd_topology_get_crat_acpi(crat_image, &image_size); - - if (ret == 0) { - down_write(&topology_lock); - ret = kfd_parse_crat_table(crat_image); - if (ret == 0) - ret = kfd_topology_update_sysfs(); - up_write(&topology_lock); - } else { - pr_err("Couldn't get CRAT table size from ACPI\n"); + } + + if (!crat_image) { + ret = kfd_create_crat_image_virtual(&crat_image, &image_size, + COMPUTE_UNIT_CPU, NULL, + proximity_domain); + cpu_only_node = 1; + if (ret) { + pr_err("Error creating VCRAT table for CPU\n"); + return ret; } - kfree(crat_image); - } else if (ret == -ENODATA) { - ret = 0; - } else { - pr_err("Couldn't get CRAT table size from ACPI\n"); + + ret = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (ret) { + pr_err("Error parsing VCRAT table for CPU\n"); + goto err; + } + } + + kdev = list_first_entry(&temp_topology_device_list, + struct kfd_topology_device, list); + kfd_add_perf_to_topology(kdev); + + down_write(&topology_lock); + kfd_topology_update_device_list(&temp_topology_device_list, + &topology_device_list); + atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1); + ret = kfd_topology_update_sysfs(); + up_write(&topology_lock); + + if (!ret) { + sys_props.generation_count++; + kfd_update_system_properties(); + kfd_debug_print_topology(); + pr_info("Finished initializing topology\n"); + } else + pr_err("Failed to update topology in sysfs ret=%d\n", ret); + + /* For nodes with GPU, this information gets added + * when GPU is detected (kfd_topology_add_device). + */ + if (cpu_only_node) { + /* Add additional information to CPU only node created above */ + down_write(&topology_lock); + kdev = list_first_entry(&topology_device_list, + struct kfd_topology_device, list); + up_write(&topology_lock); + kfd_add_non_crat_information(kdev); } err: - pr_info("Finished initializing topology ret=%d\n", ret); + kfd_destroy_crat_image(crat_image); return ret; } void kfd_topology_shutdown(void) { + down_write(&topology_lock); kfd_topology_release_sysfs(); kfd_release_live_view(); -} - -static void kfd_debug_print_topology(void) -{ - struct kfd_topology_device *dev; - uint32_t i = 0; - - pr_info("DEBUG PRINT OF TOPOLOGY:"); - list_for_each_entry(dev, &topology_device_list, list) { - pr_info("Node: %d\n", i); - pr_info("\tGPU assigned: %s\n", (dev->gpu ? "yes" : "no")); - pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count); - pr_info("\tSIMD count: %d", dev->node_props.simd_count); - i++; - } + up_write(&topology_lock); } static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) @@ -1072,11 +1042,15 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) uint32_t buf[7]; uint64_t local_mem_size; int i; + struct kfd_local_mem_info local_mem_info; if (!gpu) return 0; - local_mem_size = gpu->kfd2kgd->get_vmem_size(gpu->kgd); + gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info); + + local_mem_size = local_mem_info.local_mem_size_private + + local_mem_info.local_mem_size_public; buf[0] = gpu->pdev->devfn; buf[1] = gpu->pdev->subsystem_vendor; @@ -1091,19 +1065,26 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) return hashout; } - +/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If + * the GPU device is not already present in the topology device + * list then return NULL. This means a new topology device has to + * be created for this GPU. + * TODO: Rather than assiging @gpu to first topology device withtout + * gpu attached, it will better to have more stringent check. + */ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) { struct kfd_topology_device *dev; struct kfd_topology_device *out_dev = NULL; + down_write(&topology_lock); list_for_each_entry(dev, &topology_device_list, list) if (!dev->gpu && (dev->node_props.simd_count > 0)) { dev->gpu = gpu; out_dev = dev; break; } - + up_write(&topology_lock); return out_dev; } @@ -1115,84 +1096,196 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) */ } +/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, + * patch this after CRAT parsing. + */ +static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) +{ + struct kfd_mem_properties *mem; + struct kfd_local_mem_info local_mem_info; + + if (!dev) + return; + + /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with + * single bank of VRAM local memory. + * for dGPUs - VCRAT reports only one bank of Local Memory + * for APUs - If CRAT from ACPI reports more than one bank, then + * all the banks will report the same mem_clk_max information + */ + dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, + &local_mem_info); + + list_for_each_entry(mem, &dev->mem_props, list) + mem->mem_clk_max = local_mem_info.mem_clk_max; +} + +static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) +{ + struct kfd_iolink_properties *link; + + if (!dev || !dev->gpu) + return; + + /* GPU only creates direck links so apply flags setting to all */ + if (dev->gpu->device_info->asic_family == CHIP_HAWAII) + list_for_each_entry(link, &dev->io_link_props, list) + link->flags = CRAT_IOLINK_FLAGS_ENABLED | + CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT | + CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT; +} + int kfd_topology_add_device(struct kfd_dev *gpu) { uint32_t gpu_id; struct kfd_topology_device *dev; - int res; + struct kfd_cu_info cu_info; + int res = 0; + struct list_head temp_topology_device_list; + void *crat_image = NULL; + size_t image_size = 0; + int proximity_domain; + + INIT_LIST_HEAD(&temp_topology_device_list); gpu_id = kfd_generate_gpu_id(gpu); pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); - down_write(&topology_lock); - /* - * Try to assign the GPU to existing topology device (generated from - * CRAT table + proximity_domain = atomic_inc_return(&topology_crat_proximity_domain); + + /* Check to see if this gpu device exists in the topology_device_list. + * If so, assign the gpu to that device, + * else create a Virtual CRAT for this gpu device and then parse that + * CRAT to create a new topology device. Once created assign the gpu to + * that topology device */ dev = kfd_assign_gpu(gpu); if (!dev) { - pr_info("GPU was not found in the current topology. Extending.\n"); - kfd_debug_print_topology(); - dev = kfd_create_topology_device(); - if (!dev) { - res = -ENOMEM; + res = kfd_create_crat_image_virtual(&crat_image, &image_size, + COMPUTE_UNIT_GPU, gpu, + proximity_domain); + if (res) { + pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", + gpu_id); + return res; + } + res = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (res) { + pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", + gpu_id); goto err; } - dev->gpu = gpu; - /* - * TODO: Make a call to retrieve topology information from the - * GPU vBIOS - */ + down_write(&topology_lock); + kfd_topology_update_device_list(&temp_topology_device_list, + &topology_device_list); /* Update the SYSFS tree, since we added another topology * device */ - if (kfd_topology_update_sysfs() < 0) - kfd_topology_release_sysfs(); - + res = kfd_topology_update_sysfs(); + up_write(&topology_lock); + + if (!res) + sys_props.generation_count++; + else + pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n", + gpu_id, res); + dev = kfd_assign_gpu(gpu); + if (WARN_ON(!dev)) { + res = -ENODEV; + goto err; + } } dev->gpu_id = gpu_id; gpu->id = gpu_id; + + /* TODO: Move the following lines to function + * kfd_add_non_crat_information + */ + + /* Fill-in additional information that is not available in CRAT but + * needed for the topology + */ + + dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); + dev->node_props.simd_arrays_per_engine = + cu_info.num_shader_arrays_per_engine; + dev->node_props.vendor_id = gpu->pdev->vendor; dev->node_props.device_id = gpu->pdev->device; - dev->node_props.location_id = (gpu->pdev->bus->number << 24) + - (gpu->pdev->devfn & 0xffffff); - /* - * TODO: Retrieve max engine clock values from KGD - */ + dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, + gpu->pdev->devfn); + dev->node_props.max_engine_clk_fcompute = + dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); + dev->node_props.max_engine_clk_ccompute = + cpufreq_quick_get_max(0) / 1000; + + kfd_fill_mem_clk_max_info(dev); + kfd_fill_iolink_non_crat_info(dev); + + switch (dev->gpu->device_info->asic_family) { + case CHIP_KAVERI: + case CHIP_HAWAII: + case CHIP_TONGA: + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; + case CHIP_CARRIZO: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + pr_debug("Adding doorbell packet type capability\n"); + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->gpu->device_info->asic_family); + } + /* Fix errors in CZ CRAT. + * simd_count: Carrizo CRAT reports wrong simd_count, probably + * because it doesn't consider masked out CUs + * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd + * capability flag: Carrizo CRAT doesn't report IOMMU flags + */ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { - dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE; - pr_info("Adding doorbell packet type capability\n"); + dev->node_props.simd_count = + cu_info.simd_per_cu * cu_info.cu_active_number; + dev->node_props.max_waves_per_simd = 10; + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; } - res = 0; - -err: - up_write(&topology_lock); + kfd_debug_print_topology(); - if (res == 0) + if (!res) kfd_notify_gpu_change(gpu_id, 1); - +err: + kfd_destroy_crat_image(crat_image); return res; } int kfd_topology_remove_device(struct kfd_dev *gpu) { - struct kfd_topology_device *dev; + struct kfd_topology_device *dev, *tmp; uint32_t gpu_id; int res = -ENODEV; down_write(&topology_lock); - list_for_each_entry(dev, &topology_device_list, list) + list_for_each_entry_safe(dev, tmp, &topology_device_list, list) if (dev->gpu == gpu) { gpu_id = dev->gpu_id; kfd_remove_sysfs_node_entry(dev); kfd_release_topology_device(dev); + sys_props.num_devices--; res = 0; if (kfd_topology_update_sysfs() < 0) kfd_topology_release_sysfs(); @@ -1201,28 +1294,32 @@ int kfd_topology_remove_device(struct kfd_dev *gpu) up_write(&topology_lock); - if (res == 0) + if (!res) kfd_notify_gpu_change(gpu_id, 0); return res; } -/* - * When idx is out of bounds, the function will return NULL +/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD + * topology. If GPU device is found @idx, then valid kfd_dev pointer is + * returned through @kdev + * Return - 0: On success (@kdev will be NULL for non GPU nodes) + * -1: If end of list */ -struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx) +int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) { struct kfd_topology_device *top_dev; - struct kfd_dev *device = NULL; uint8_t device_idx = 0; + *kdev = NULL; down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) { if (device_idx == idx) { - device = top_dev->gpu; - break; + *kdev = top_dev->gpu; + up_read(&topology_lock); + return 0; } device_idx++; @@ -1230,6 +1327,88 @@ struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx) up_read(&topology_lock); - return device; + return -1; + +} + +static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask) +{ + const struct cpuinfo_x86 *cpuinfo; + int first_cpu_of_numa_node; + + if (!cpumask || cpumask == cpu_none_mask) + return -1; + first_cpu_of_numa_node = cpumask_first(cpumask); + if (first_cpu_of_numa_node >= nr_cpu_ids) + return -1; + cpuinfo = &cpu_data(first_cpu_of_numa_node); + return cpuinfo->apicid; } + +/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor + * of the given NUMA node (numa_node_id) + * Return -1 on failure + */ +int kfd_numa_node_to_apic_id(int numa_node_id) +{ + if (numa_node_id == -1) { + pr_warn("Invalid NUMA Node. Use online CPU mask\n"); + return kfd_cpumask_to_apic_id(cpu_online_mask); + } + return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id)); +} + +#if defined(CONFIG_DEBUG_FS) + +int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data) +{ + struct kfd_topology_device *dev; + unsigned int i = 0; + int r = 0; + + down_read(&topology_lock); + + list_for_each_entry(dev, &topology_device_list, list) { + if (!dev->gpu) { + i++; + continue; + } + + seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); + r = dqm_debugfs_hqds(m, dev->gpu->dqm); + if (r) + break; + } + + up_read(&topology_lock); + + return r; +} + +int kfd_debugfs_rls_by_device(struct seq_file *m, void *data) +{ + struct kfd_topology_device *dev; + unsigned int i = 0; + int r = 0; + + down_read(&topology_lock); + + list_for_each_entry(dev, &topology_device_list, list) { + if (!dev->gpu) { + i++; + continue; + } + + seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); + r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets); + if (r) + break; + } + + up_read(&topology_lock); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index c3ddb9b95ff8..53fca1f45401 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -39,8 +39,13 @@ #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 -#define HSA_CAP_RESERVED 0xfffff000 -#define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 +#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000 +#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12 +#define HSA_CAP_RESERVED 0xffffc000 + +#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 +#define HSA_CAP_DOORBELL_TYPE_1_0 0x1 +#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 struct kfd_node_properties { uint32_t cpu_cores_count; @@ -91,8 +96,6 @@ struct kfd_mem_properties { struct attribute attr; }; -#define KFD_TOPOLOGY_CPU_SIBLINGS 256 - #define HSA_CACHE_TYPE_DATA 0x00000001 #define HSA_CACHE_TYPE_INSTRUCTION 0x00000002 #define HSA_CACHE_TYPE_CPU 0x00000004 @@ -109,7 +112,7 @@ struct kfd_cache_properties { uint32_t cache_assoc; uint32_t cache_latency; uint32_t cache_type; - uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS]; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; struct kobject *kobj; struct attribute attr; }; @@ -132,24 +135,36 @@ struct kfd_iolink_properties { struct attribute attr; }; +struct kfd_perf_properties { + struct list_head list; + char block_name[16]; + uint32_t max_concurrent; + struct attribute_group *attr_group; +}; + struct kfd_topology_device { struct list_head list; uint32_t gpu_id; + uint32_t proximity_domain; struct kfd_node_properties node_props; - uint32_t mem_bank_count; struct list_head mem_props; uint32_t cache_count; struct list_head cache_props; uint32_t io_link_count; struct list_head io_link_props; + struct list_head perf_props; struct kfd_dev *gpu; struct kobject *kobj_node; struct kobject *kobj_mem; struct kobject *kobj_cache; struct kobject *kobj_iolink; + struct kobject *kobj_perf; struct attribute attr_gpuid; struct attribute attr_name; struct attribute attr_props; + uint8_t oem_id[CRAT_OEMID_LENGTH]; + uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; }; struct kfd_system_properties { @@ -164,6 +179,12 @@ struct kfd_system_properties { struct attribute attr_props; }; +struct kfd_topology_device *kfd_create_topology_device( + struct list_head *device_list); +void kfd_release_topology_device_list(struct list_head *device_list); +extern bool amd_iommu_pc_supported(void); +extern u8 amd_iommu_pc_get_max_banks(u16 devid); +extern u8 amd_iommu_pc_get_max_counters(u16 devid); #endif /* __KFD_TOPOLOGY_H__ */ |